Update integrated_cache.rs to use new hashmap API

Merge branch 'quantumish/lfc-resizable-map' into quantumish/comm-lfc-integration
Make neon-shmem tests thread-safe and report errno in panics
2026-07-03 12:10:36 +00:00 · 2025-07-02 12:18:37 -07:00 · 2025-07-02 12:05:24 -07:00 · 2025-07-02 11:57:49 -07:00 · 2025-07-02 11:44:38 -07:00 · 2025-06-30 17:07:07 -07:00
141 changed files with 16596 additions and 2799 deletions
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -189,7 +189,6 @@ jobs:
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}

      - name: Build all
-        if: steps.cache_pg_14.outputs.cache-hit != 'true'
        # Note: the Makefile picks up BUILD_TYPE and CARGO_PROFILE from the env variables
        run: mold -run make ${make_vars} all -j$(nproc) CARGO_BUILD_FLAGS="$CARGO_FLAGS"

--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,7 @@ neon.iml
 /.neon
 /integration_tests/.neon
 compaction-suite-results.*
+pgxn/neon/communicator/communicator_bindings.h

 # Coverage
 *.profraw
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,6 +8,7 @@ members = [
    "pageserver/compaction",
    "pageserver/ctl",
    "pageserver/client",
+    "pageserver/client_grpc",
    "pageserver/pagebench",
    "pageserver/page_api",
    "proxy",
@@ -23,7 +24,6 @@ members = [
    "libs/pageserver_api",
    "libs/postgres_ffi",
    "libs/postgres_ffi_types",
-    "libs/postgres_versioninfo",
    "libs/safekeeper_api",
    "libs/desim",
    "libs/neon-shmem",
@@ -34,6 +34,7 @@ members = [
    "libs/pq_proto",
    "libs/tenant_size_model",
    "libs/metrics",
+    "libs/neonart",
    "libs/postgres_connection",
    "libs/remote_storage",
    "libs/tracing-utils",
@@ -46,6 +47,7 @@ members = [
    "libs/proxy/postgres-types2",
    "libs/proxy/tokio-postgres2",
    "endpoint_storage",
+    "pgxn/neon/communicator",
 ]

 [workspace.package]
@@ -89,6 +91,7 @@ clap = { version = "4.0", features = ["derive", "env"] }
 clashmap = { version = "1.0", features = ["raw-api"] }
 comfy-table = "7.1"
 const_format = "0.2"
+crossbeam-utils = "0.8.21"
 crc32c = "0.6"
 diatomic-waker = { version = "0.2.3" }
 either = "1.8"
@@ -147,6 +150,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
 parquet_derive = "53"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pem = "3.0.3"
+peekable = "0.3.0"
 pin-project-lite = "0.2"
 pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
 procfs = "0.16"
@@ -175,13 +179,13 @@ serde_json = "1"
 serde_path_to_error = "0.1"
 serde_with = { version = "3", features = [ "base64" ] }
 serde_assert = "0.5.0"
-serde_repr = "0.1.20"
 sha2 = "0.10.2"
 signal-hook = "0.3"
 smallvec = "1.11"
 smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
 spki = "0.7.3"
+spin = "0.9.8"
 strum = "0.26"
 strum_macros = "0.26"
 "subtle"  = "2.5.0"
@@ -193,13 +197,12 @@ thiserror = "1.0"
 tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
 tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
 tokio = { version = "1.43.1", features = ["macros"] }
-tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.12.0"
 tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
 tokio-stream = "0.1"
 tokio-tar = "0.3"
-tokio-util = { version = "0.7.10", features = ["io", "rt"] }
+tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] }
 toml = "0.8"
 toml_edit = "0.22"
 tonic = { version = "0.13.1", default-features = false, features = ["channel", "codegen", "gzip", "prost", "router", "server", "tls-ring", "tls-native-roots", "zstd"] }
@@ -235,6 +238,9 @@ x509-cert = { version = "0.2.5" }
 env_logger = "0.11"
 log = "0.4"

+tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
+uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
+
 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
 postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
 postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
@@ -254,16 +260,18 @@ desim = { version = "0.1", path = "./libs/desim" }
 endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" }
 http-utils = { version = "0.1", path = "./libs/http-utils/" }
 metrics = { version = "0.1", path = "./libs/metrics/" }
+neonart = { version = "0.1", path = "./libs/neonart/" }
+neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
 pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
+pageserver_client_grpc = { path = "./pageserver/client_grpc" }
 pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
 pageserver_page_api = { path = "./pageserver/page_api" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
 postgres_ffi_types = { version = "0.1", path = "./libs/postgres_ffi_types/" }
-postgres_versioninfo = { version = "0.1", path = "./libs/postgres_versioninfo/" }
 postgres_initdb = { path = "./libs/postgres_initdb" }
 posthog_client_lite = { version = "0.1", path = "./libs/posthog_client_lite" }
 pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
@@ -283,6 +291,7 @@ walproposer = { version = "0.1", path = "./libs/walproposer/" }
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }

 ## Build dependencies
+cbindgen = "0.28.0"
 criterion = "0.5.1"
 rcgen = "0.13"
 rstest = "0.18"
--- a/Makefile
+++ b/Makefile
@@ -27,11 +27,18 @@ ifeq ($(BUILD_TYPE),release)
 	PG_CFLAGS += -O2 -g3 $(CFLAGS)
 	PG_LDFLAGS = $(LDFLAGS)
 	CARGO_PROFILE ?= --profile=release
+	# NEON_CARGO_ARTIFACT_TARGET_DIR is the directory where `cargo build` places
+	# the final build artifacts. There is unfortunately no easy way of changing
+	# it to a fully predictable path, nor to extract the path with a simple
+	# command. See https://github.com/rust-lang/cargo/issues/9661 and
+	# https://github.com/rust-lang/cargo/issues/6790.
+	NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release
 else ifeq ($(BUILD_TYPE),debug)
 	PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
 	PG_CFLAGS += -O0 -g3 $(CFLAGS)
 	PG_LDFLAGS = $(LDFLAGS)
 	CARGO_PROFILE ?= --profile=dev
+	NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug
 else
 	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
 endif
@@ -104,6 +111,7 @@ all: neon postgres neon-pg-ext
 neon: postgres-headers walproposer-lib cargo-target-dir
 	+@echo "Compiling Neon"
 	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE)
+
 .PHONY: cargo-target-dir
 cargo-target-dir:
 	# https://github.com/rust-lang/cargo/issues/14281
@@ -181,10 +189,13 @@ postgres-check-%: postgres-%
 	$(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 check

 .PHONY: neon-pg-ext-%
-neon-pg-ext-%: postgres-%
+neon-pg-ext-%: postgres-% cargo-target-dir
 	+@echo "Compiling neon-specific Postgres extensions for $*"
 	mkdir -p $(BUILD_DIR)/pgxn-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
+	$(MAKE) PG_CONFIG="$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config" COPT='$(COPT)' \
+		NEON_CARGO_ARTIFACT_TARGET_DIR="$(NEON_CARGO_ARTIFACT_TARGET_DIR)" \
+		CARGO_BUILD_FLAGS="$(CARGO_BUILD_FLAGS)" \
+		CARGO_PROFILE="$(CARGO_PROFILE)" \
 		-C $(BUILD_DIR)/pgxn-$*\
 		-f $(ROOT_PROJECT_DIR)/pgxn/Makefile  install

--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -38,6 +38,7 @@ once_cell.workspace = true
 opentelemetry.workspace = true
 opentelemetry_sdk.workspace = true
 p256 = { version = "0.13", features = ["pem"] }
+pageserver_page_api.workspace = true
 postgres.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["json"] }
@@ -53,6 +54,7 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tokio-postgres.workspace = true
 tokio-util.workspace = true
 tokio-stream.workspace = true
+tonic.workspace = true
 tower-otel.workspace = true
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
@@ -64,7 +66,6 @@ uuid.workspace = true
 walkdir.workspace = true
 x509-cert.workspace = true

-postgres_versioninfo.workspace = true
 postgres_initdb.workspace = true
 compute_api.workspace = true
 utils.workspace = true
--- a/compute_tools/src/bin/fast_import.rs
+++ b/compute_tools/src/bin/fast_import.rs
@@ -29,7 +29,7 @@ use anyhow::{Context, bail};
 use aws_config::BehaviorVersion;
 use camino::{Utf8Path, Utf8PathBuf};
 use clap::{Parser, Subcommand};
-use compute_tools::extension_server::get_pg_version;
+use compute_tools::extension_server::{PostgresMajorVersion, get_pg_version};
 use nix::unistd::Pid;
 use std::ops::Not;
 use tracing::{Instrument, error, info, info_span, warn};
@@ -179,8 +179,12 @@ impl PostgresProcess {
            .await
            .context("create pgdata directory")?;

-        let pg_version = get_pg_version(self.pgbin.as_ref());
-
+        let pg_version = match get_pg_version(self.pgbin.as_ref()) {
+            PostgresMajorVersion::V14 => 14,
+            PostgresMajorVersion::V15 => 15,
+            PostgresMajorVersion::V16 => 16,
+            PostgresMajorVersion::V17 => 17,
+        };
        postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
            superuser: initdb_user,
            locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded,
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,4 +1,4 @@
-use anyhow::{Context, Result};
+use anyhow::{Context, Result, anyhow};
 use chrono::{DateTime, Utc};
 use compute_api::privilege::Privilege;
 use compute_api::responses::{
@@ -15,6 +15,7 @@ use itertools::Itertools;
 use nix::sys::signal::{Signal, kill};
 use nix::unistd::Pid;
 use once_cell::sync::Lazy;
+use pageserver_page_api as page_api;
 use postgres;
 use postgres::NoTls;
 use postgres::error::SqlState;
@@ -30,6 +31,7 @@ use std::sync::{Arc, Condvar, Mutex, RwLock};
 use std::time::{Duration, Instant};
 use std::{env, fs};
 use tokio::spawn;
+use tokio_util::io::StreamReader;
 use tracing::{Instrument, debug, error, info, instrument, warn};
 use url::Url;
 use utils::id::{TenantId, TimelineId};
@@ -418,7 +420,7 @@ impl ComputeNode {

        let mut new_state = ComputeState::new();
        if let Some(spec) = config.spec {
-            let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
+            let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow!(msg))?;
            new_state.pspec = Some(pspec);
        }

@@ -1004,6 +1006,75 @@ impl ComputeNode {
    #[instrument(skip_all, fields(%lsn))]
    fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let spec = compute_state.pspec.as_ref().expect("spec must be set");
+        let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
+
+        match Url::parse(shard0_connstr)?.scheme() {
+            "postgres" | "postgresql" => self.try_get_basebackup_libpq(spec, lsn),
+            "grpc" => self.try_get_basebackup_grpc(spec, lsn),
+            scheme => return Err(anyhow!("unknown URL scheme {scheme}")),
+        }
+    }
+
+    /// Fetches the basebackup over gRPC (first connstring entry) and unpacks it into pgdata.
+    fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<()> {
+        let start_time = Instant::now();
+
+        let shard0_connstr = spec
+            .pageserver_connstr
+            .split(',')
+            .next()
+            .unwrap()
+            .to_string();
+
+        let chunks = tokio::runtime::Handle::current().block_on(async move {
+            let mut client = page_api::proto::PageServiceClient::connect(shard0_connstr).await?;
+
+            let req = page_api::proto::GetBaseBackupRequest {
+                lsn: lsn.0,
+                replica: false, // TODO: handle replicas, with LSN 0
+                full: false,
+            };
+            let mut req = tonic::Request::new(req);
+            let metadata = req.metadata_mut();
+            metadata.insert("neon-tenant-id", spec.tenant_id.to_string().parse()?);
+            metadata.insert("neon-timeline-id", spec.timeline_id.to_string().parse()?);
+            metadata.insert("neon-shard-id", "0000".to_string().parse()?); // TODO: shard count
+            if let Some(auth) = spec.storage_auth_token.as_ref() {
+                metadata.insert("authorization", format!("Bearer {auth}").parse()?);
+            }
+
+            let chunks = client.get_base_backup(req).await?.into_inner();
+            anyhow::Ok(chunks)
+        })?;
+        let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
+
+        // Convert the chunks stream into an AsyncRead
+        let stream_reader = StreamReader::new(
+            chunks.map(|chunk| chunk.map(|c| c.chunk).map_err(std::io::Error::other)),
+        );
+
+        // Wrap the AsyncRead into a blocking reader for compatibility with tar::Archive
+        let reader = tokio_util::io::SyncIoBridge::new(stream_reader);
+        let mut measured_reader = MeasuredReader::new(reader);
+        let mut bufreader = std::io::BufReader::new(&mut measured_reader);
+
+        // Read the archive directly from the gRPC chunk stream. Set `ignore_zeros`
+        // so that unpack() reads the whole stream and doesn't stop at the
+        // end-of-archive marker. Otherwise, if the server sends an error after
+        // finishing the tarball, we will not notice it.
+        let mut ar = tar::Archive::new(&mut bufreader);
+        ar.set_ignore_zeros(true);
+        ar.unpack(&self.params.pgdata)?;
+
+        // Report metrics
+        let mut state = self.state.lock().unwrap();
+        state.metrics.pageserver_connect_micros = pageserver_connect_micros;
+        state.metrics.basebackup_bytes = measured_reader.get_byte_count() as u64;
+        state.metrics.basebackup_ms = start_time.elapsed().as_millis() as u64;
+        Ok(())
+    }
+
+    fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<()> {
        let start_time = Instant::now();

        let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
@@ -1019,12 +1090,10 @@ impl ComputeNode {
        }

        config.application_name("compute_ctl");
-        if let Some(spec) = &compute_state.pspec {
-            config.options(&format!(
-                "-c neon.compute_mode={}",
-                spec.spec.mode.to_type_str()
-            ));
-        }
+        config.options(&format!(
+            "-c neon.compute_mode={}",
+            spec.spec.mode.to_type_str()
+        ));

        // Connect to pageserver
        let mut client = config.connect(NoTls)?;
@@ -1098,10 +1167,7 @@ impl ComputeNode {
                    return result;
                }
                Err(ref e) if attempts < max_attempts => {
-                    warn!(
-                        "Failed to get basebackup: {} (attempt {}/{})",
-                        e, attempts, max_attempts
-                    );
+                    warn!("Failed to get basebackup: {e:?} (attempt {attempts}/{max_attempts})");
                    std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
                    retry_period_ms *= 1.5;
                }
@@ -1979,7 +2045,7 @@ LIMIT 100",
            self.params
                .remote_ext_base_url
                .as_ref()
-                .ok_or(DownloadError::BadInput(anyhow::anyhow!(
+                .ok_or(DownloadError::BadInput(anyhow!(
                    "Remote extensions storage is not configured",
                )))?;

@@ -2175,7 +2241,7 @@ LIMIT 100",
        let remote_extensions = spec
            .remote_extensions
            .as_ref()
-            .ok_or(anyhow::anyhow!("Remote extensions are not configured"))?;
+            .ok_or(anyhow!("Remote extensions are not configured"))?;

        info!("parse shared_preload_libraries from spec.cluster.settings");
        let mut libs_vec = Vec::new();
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -74,11 +74,9 @@ More specifically, here is an example ext_index.json
 use std::path::Path;
 use std::str;

-use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS};
 use anyhow::{Context, Result, bail};
 use bytes::Bytes;
 use compute_api::spec::RemoteExtSpec;
-use postgres_versioninfo::PgMajorVersion;
 use regex::Regex;
 use remote_storage::*;
 use reqwest::StatusCode;
@@ -88,6 +86,8 @@ use tracing::log::warn;
 use url::Url;
 use zstd::stream::read::Decoder;

+use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS};
+
 fn get_pg_config(argument: &str, pgbin: &str) -> String {
    // gives the result of `pg_config [argument]`
    // where argument is a flag like `--version` or `--sharedir`
@@ -106,7 +106,7 @@ fn get_pg_config(argument: &str, pgbin: &str) -> String {
        .to_string()
 }

-pub fn get_pg_version(pgbin: &str) -> PgMajorVersion {
+pub fn get_pg_version(pgbin: &str) -> PostgresMajorVersion {
    // pg_config --version returns a (platform specific) human readable string
    // such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc.
    let human_version = get_pg_config("--version", pgbin);
@@ -114,11 +114,25 @@ pub fn get_pg_version(pgbin: &str) -> PgMajorVersion {
 }

 pub fn get_pg_version_string(pgbin: &str) -> String {
-    get_pg_version(pgbin).v_str()
+    match get_pg_version(pgbin) {
+        PostgresMajorVersion::V14 => "v14",
+        PostgresMajorVersion::V15 => "v15",
+        PostgresMajorVersion::V16 => "v16",
+        PostgresMajorVersion::V17 => "v17",
+    }
+    .to_owned()
 }

-fn parse_pg_version(human_version: &str) -> PgMajorVersion {
-    use PgMajorVersion::*;
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum PostgresMajorVersion {
+    V14,
+    V15,
+    V16,
+    V17,
+}
+
+fn parse_pg_version(human_version: &str) -> PostgresMajorVersion {
+    use PostgresMajorVersion::*;
    // Normal releases have version strings like "PostgreSQL 15.4". But there
    // are also pre-release versions like "PostgreSQL 17devel" or "PostgreSQL
    // 16beta2" or "PostgreSQL 17rc1". And with the --with-extra-version
@@ -129,10 +143,10 @@ fn parse_pg_version(human_version: &str) -> PgMajorVersion {
        .captures(human_version)
    {
        Some(captures) if captures.len() == 2 => match &captures["major"] {
-            "14" => return PG14,
-            "15" => return PG15,
-            "16" => return PG16,
-            "17" => return PG17,
+            "14" => return V14,
+            "15" => return V15,
+            "16" => return V16,
+            "17" => return V17,
            _ => {}
        },
        _ => {}
@@ -335,25 +349,25 @@ mod tests {

    #[test]
    fn test_parse_pg_version() {
-        use postgres_versioninfo::PgMajorVersion::*;
-        assert_eq!(parse_pg_version("PostgreSQL 15.4"), PG15);
-        assert_eq!(parse_pg_version("PostgreSQL 15.14"), PG15);
+        use super::PostgresMajorVersion::*;
+        assert_eq!(parse_pg_version("PostgreSQL 15.4"), V15);
+        assert_eq!(parse_pg_version("PostgreSQL 15.14"), V15);
        assert_eq!(
            parse_pg_version("PostgreSQL 15.4 (Ubuntu 15.4-0ubuntu0.23.04.1)"),
-            PG15
+            V15
        );

-        assert_eq!(parse_pg_version("PostgreSQL 14.15"), PG14);
-        assert_eq!(parse_pg_version("PostgreSQL 14.0"), PG14);
+        assert_eq!(parse_pg_version("PostgreSQL 14.15"), V14);
+        assert_eq!(parse_pg_version("PostgreSQL 14.0"), V14);
        assert_eq!(
            parse_pg_version("PostgreSQL 14.9 (Debian 14.9-1.pgdg120+1"),
-            PG14
+            V14
        );

-        assert_eq!(parse_pg_version("PostgreSQL 16devel"), PG16);
-        assert_eq!(parse_pg_version("PostgreSQL 16beta1"), PG16);
-        assert_eq!(parse_pg_version("PostgreSQL 16rc2"), PG16);
-        assert_eq!(parse_pg_version("PostgreSQL 16extra"), PG16);
+        assert_eq!(parse_pg_version("PostgreSQL 16devel"), V16);
+        assert_eq!(parse_pg_version("PostgreSQL 16beta1"), V16);
+        assert_eq!(parse_pg_version("PostgreSQL 16rc2"), V16);
+        assert_eq!(parse_pg_version("PostgreSQL 16extra"), V16);
    }

    #[test]
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -48,7 +48,7 @@ use postgres_connection::parse_host_port;
 use safekeeper_api::membership::{SafekeeperGeneration, SafekeeperId};
 use safekeeper_api::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
-    DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, PgMajorVersion, PgVersionId,
+    DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
 };
 use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
 use tokio::task::JoinSet;
@@ -64,7 +64,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
 project_git_version!(GIT_VERSION);

-const DEFAULT_PG_VERSION: PgMajorVersion = PgMajorVersion::PG17;
+const DEFAULT_PG_VERSION: u32 = 17;

 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";

@@ -169,7 +169,7 @@ struct TenantCreateCmdArgs {

    #[arg(default_value_t = DEFAULT_PG_VERSION)]
    #[clap(long, help = "Postgres version to use for the initial timeline")]
-    pg_version: PgMajorVersion,
+    pg_version: u32,

    #[clap(
        long,
@@ -292,7 +292,7 @@ struct TimelineCreateCmdArgs {

    #[arg(default_value_t = DEFAULT_PG_VERSION)]
    #[clap(long, help = "Postgres version")]
-    pg_version: PgMajorVersion,
+    pg_version: u32,
 }

 #[derive(clap::Args)]
@@ -324,7 +324,7 @@ struct TimelineImportCmdArgs {

    #[arg(default_value_t = DEFAULT_PG_VERSION)]
    #[clap(long, help = "Postgres version of the backup being imported")]
-    pg_version: PgMajorVersion,
+    pg_version: u32,
 }

 #[derive(clap::Subcommand)]
@@ -603,7 +603,7 @@ struct EndpointCreateCmdArgs {

    #[arg(default_value_t = DEFAULT_PG_VERSION)]
    #[clap(long, help = "Postgres version")]
-    pg_version: PgMajorVersion,
+    pg_version: u32,

    /// Use gRPC to communicate with Pageservers, by generating grpc:// connstrings.
    ///
@@ -1295,7 +1295,7 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
                    },
                    new_members: None,
                };
-                let pg_version = PgVersionId::from(args.pg_version);
+                let pg_version = args.pg_version * 10000;
                let req = safekeeper_api::models::TimelineCreateRequest {
                    tenant_id,
                    timeline_id,
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -67,7 +67,6 @@ use nix::sys::signal::{Signal, kill};
 use pageserver_api::shard::ShardStripeSize;
 use pem::Pem;
 use reqwest::header::CONTENT_TYPE;
-use safekeeper_api::PgMajorVersion;
 use safekeeper_api::membership::SafekeeperGeneration;
 use serde::{Deserialize, Serialize};
 use sha2::{Digest, Sha256};
@@ -90,7 +89,7 @@ pub struct EndpointConf {
    pg_port: u16,
    external_http_port: u16,
    internal_http_port: u16,
-    pg_version: PgMajorVersion,
+    pg_version: u32,
    grpc: bool,
    skip_pg_catalog_updates: bool,
    reconfigure_concurrency: usize,
@@ -193,7 +192,7 @@ impl ComputeControlPlane {
        pg_port: Option<u16>,
        external_http_port: Option<u16>,
        internal_http_port: Option<u16>,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
        mode: ComputeMode,
        grpc: bool,
        skip_pg_catalog_updates: bool,
@@ -313,7 +312,7 @@ pub struct Endpoint {
    pub internal_http_address: SocketAddr,

    // postgres major version in the format: 14, 15, etc.
-    pg_version: PgMajorVersion,
+    pg_version: u32,

    // These are not part of the endpoint as such, but the environment
    // the endpoint runs in.
@@ -558,7 +557,7 @@ impl Endpoint {
                conf.append("hot_standby", "on");
                // prefetching of blocks referenced in WAL doesn't make sense for us
                // Neon hot standby ignores pages that are not in the shared_buffers
-                if self.pg_version >= PgMajorVersion::PG15 {
+                if self.pg_version >= 15 {
                    conf.append("recovery_prefetch", "off");
                }
            }
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -15,7 +15,6 @@ use clap::ValueEnum;
 use pem::Pem;
 use postgres_backend::AuthType;
 use reqwest::{Certificate, Url};
-use safekeeper_api::PgMajorVersion;
 use serde::{Deserialize, Serialize};
 use utils::auth::encode_from_key_file;
 use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
@@ -422,21 +421,25 @@ impl LocalEnv {
        self.pg_distrib_dir.clone()
    }

-    pub fn pg_distrib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<PathBuf> {
+    pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
        let path = self.pg_distrib_dir.clone();

-        Ok(path.join(pg_version.v_str()))
+        #[allow(clippy::manual_range_patterns)]
+        match pg_version {
+            14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))),
+            _ => bail!("Unsupported postgres version: {}", pg_version),
+        }
    }

-    pub fn pg_dir(&self, pg_version: PgMajorVersion, dir_name: &str) -> anyhow::Result<PathBuf> {
+    pub fn pg_dir(&self, pg_version: u32, dir_name: &str) -> anyhow::Result<PathBuf> {
        Ok(self.pg_distrib_dir(pg_version)?.join(dir_name))
    }

-    pub fn pg_bin_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<PathBuf> {
+    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
        self.pg_dir(pg_version, "bin")
    }

-    pub fn pg_lib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<PathBuf> {
+    pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
        self.pg_dir(pg_version, "lib")
    }

--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -22,7 +22,6 @@ use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use postgres_backend::AuthType;
 use postgres_connection::{PgConnectionConfig, parse_host_port};
-use safekeeper_api::PgMajorVersion;
 use utils::auth::{Claims, Scope};
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
@@ -608,7 +607,7 @@ impl PageServerNode {
        timeline_id: TimelineId,
        base: (Lsn, PathBuf),
        pg_wal: Option<(Lsn, PathBuf)>,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
    ) -> anyhow::Result<()> {
        // Init base reader
        let (start_lsn, base_tarfile_path) = base;
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -6,8 +6,6 @@ use std::str::FromStr;
 use std::sync::OnceLock;
 use std::time::{Duration, Instant};

-use crate::background_process;
-use crate::local_env::{LocalEnv, NeonStorageControllerConf};
 use camino::{Utf8Path, Utf8PathBuf};
 use hyper0::Uri;
 use nix::unistd::Pid;
@@ -24,7 +22,6 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use pem::Pem;
 use postgres_backend::AuthType;
 use reqwest::{Method, Response};
-use safekeeper_api::PgMajorVersion;
 use serde::de::DeserializeOwned;
 use serde::{Deserialize, Serialize};
 use tokio::process::Command;
@@ -34,6 +31,9 @@ use utils::auth::{Claims, Scope, encode_from_key_file};
 use utils::id::{NodeId, TenantId};
 use whoami::username;

+use crate::background_process;
+use crate::local_env::{LocalEnv, NeonStorageControllerConf};
+
 pub struct StorageController {
    env: LocalEnv,
    private_key: Option<Pem>,
@@ -48,7 +48,7 @@ pub struct StorageController {

 const COMMAND: &str = "storage_controller";

-const STORAGE_CONTROLLER_POSTGRES_VERSION: PgMajorVersion = PgMajorVersion::PG16;
+const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;

 const DB_NAME: &str = "storage_controller";

@@ -184,15 +184,9 @@ impl StorageController {
    /// to other versions if that one isn't found.  Some automated tests create circumstances
    /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
    async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result<Utf8PathBuf> {
-        const PREFER_VERSIONS: [PgMajorVersion; 5] = [
-            STORAGE_CONTROLLER_POSTGRES_VERSION,
-            PgMajorVersion::PG16,
-            PgMajorVersion::PG15,
-            PgMajorVersion::PG14,
-            PgMajorVersion::PG17,
-        ];
+        let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 16, 15, 14, 17];

-        for v in PREFER_VERSIONS {
+        for v in prefer_versions {
            let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap();
            if tokio::fs::try_exists(&path).await? {
                return Ok(path);
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -4,6 +4,7 @@
 //! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or
 //! compute_ctl can fetch it by calling the control plane's API.
 use std::collections::HashMap;
+use std::fmt::Display;

 use indexmap::IndexMap;
 use regex::Regex;
@@ -319,6 +320,12 @@ impl ComputeMode {
    }
 }

+impl Display for ComputeMode {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.to_type_str())
+    }
+}
+
 /// Log level for audit logging
 #[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeAudit {
--- a/libs/neon-shmem/Cargo.toml
+++ b/libs/neon-shmem/Cargo.toml
@@ -6,8 +6,27 @@ license.workspace = true

 [dependencies]
 thiserror.workspace = true
-nix.workspace=true
+nix.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+rustc-hash = { version = "2.1.1" }
+rand = "0.9.1"
+libc.workspace = true
+lock_api = "0.4.13"
+
+[dev-dependencies]
+criterion = { workspace = true, features = ["html_reports"] }
+rand_distr = "0.5.1"
+xxhash-rust = { version = "0.8.15", features = ["xxh3"] }
+ahash.workspace = true
+twox-hash = { version = "2.1.1" }
+seahash = "4.1.0"
+hashbrown = { git = "https://github.com/quantumish/hashbrown.git", rev = "6610e6d" }
+foldhash = "0.1.5"
+

 [target.'cfg(target_os = "macos")'.dependencies]
 tempfile = "3.14.0"
+
+[[bench]]
+name = "hmap_resize"
+harness = false
--- a/libs/neon-shmem/benches/hmap_resize.rs
+++ b/libs/neon-shmem/benches/hmap_resize.rs
@@ -0,0 +1,282 @@
+use criterion::{criterion_group, criterion_main, BatchSize, Criterion, BenchmarkId};
+use neon_shmem::hash::HashMapAccess;
+use neon_shmem::hash::HashMapInit;
+use neon_shmem::hash::entry::Entry;
+use rand::prelude::*;
+use rand::distr::{Distribution, StandardUniform};
+use std::hash::BuildHasher;
+use std::default::Default;
+	
+// The key and entry types below were taken from bindings to C code.
+/// Benchmark key: five `u32` fields in a C-compatible layout (`#[repr(C)]`).
+#[derive(Clone, Debug, Hash, Eq, PartialEq)]
+#[repr(C)]
+pub struct FileCacheKey {
+    pub _spc_id: u32,
+    pub _db_id: u32,
+    pub _rel_number: u32,
+    pub _fork_num: u32,
+    pub _block_num: u32,
+}
+
+impl Distribution<FileCacheKey> for StandardUniform {
+	// Each field is drawn independently from `rng`. Questionable, but doesn't need to be good randomness.
+	fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> FileCacheKey {
+		FileCacheKey {
+			_spc_id: rng.random(),
+			_db_id: rng.random(),
+			_rel_number: rng.random(),
+			_fork_num: rng.random(),
+			_block_num: rng.random()
+		}
+    }
+}
+
+#[derive(Clone, Debug)]
+#[repr(C)]
+pub struct FileCacheEntry {
+    pub _offset: u32,
+    pub _access_count: u32,
+    pub _prev: *mut FileCacheEntry,
+    pub _next: *mut FileCacheEntry,
+    pub _state: [u32; 8],
+}
+
+impl FileCacheEntry {
+	fn dummy() -> Self {
+		Self {
+			_offset: 0,
+			_access_count: 0,
+			_prev: std::ptr::null_mut(),
+			_next: std::ptr::null_mut(),
+			_state: [0; 8]				
+		}
+	}
+}
+
+// Utilities for applying operations.
+/// One map operation: `TestOp(key, Some(v))` inserts or overwrites, `TestOp(key, None)` removes.
+#[derive(Clone, Debug)]
+struct TestOp<K,V>(K, Option<V>);
+
+/// Apply `op` to `map` and discard whatever value it displaces.
+///
+/// `TestOp(key, Some(v))` inserts or overwrites; `TestOp(key, None)` removes.
+fn apply_op<K: Clone + std::hash::Hash + Eq, V, S: std::hash::BuildHasher>(
+    op: TestOp<K,V>,
+    map: &mut HashMapAccess<K,V,S>,
+) {
+	let TestOp(key, new_value) = op;
+	let slot = map.entry(key);
+
+	match (slot, new_value) {
+		// Key present, new value given: overwrite in place.
+		(Entry::Occupied(mut e), Some(v)) => { e.insert(v); },
+		// Key absent, new value given: insert; panics if the map is full.
+		(Entry::Vacant(e), Some(v)) => { _ = e.insert(v).unwrap(); },
+		// Key present, no value: remove the entry.
+		(Entry::Occupied(e), None) => { e.remove(); },
+		// Key absent, no value: nothing to do.
+		(Entry::Vacant(_), None) => {},
+	}
+}
+
+// Hash utilities
+/// Four `u64` seeds from which a `seahash::SeaHasher` is built.
+struct SeaRandomState {
+	k1: u64,
+	k2: u64,
+	k3: u64,
+	k4: u64 
+}
+
+impl std::hash::BuildHasher for SeaRandomState {
+	type Hasher = seahash::SeaHasher;
+	/// Build a `SeaHasher` seeded with this state's four keys.
+	fn build_hasher(&self) -> Self::Hasher {
+		seahash::SeaHasher::with_seeds(self.k1, self.k2, self.k3, self.k4)
+	}
+}
+
+impl SeaRandomState {
+	fn new() -> Self {
+		let mut rng = rand::rng();
+		Self { k1: rng.random(), k2: rng.random(), k3: rng.random(), k4: rng.random() }
+	}
+}
+
+/// Fill `writer` with uniformly random keys (and dummy values) until `target` buckets are in use.
+///
+/// Shared setup step of the rehash benchmarks below, so that every hasher variant populates
+/// its map in exactly the same way.
+///
+/// Panics (via `apply_op`) if the map runs out of free buckets before `target` is reached.
+fn fill_to<S: BuildHasher>(
+	writer: &mut HashMapAccess<FileCacheKey, FileCacheEntry, S>,
+	target: usize,
+) {
+	let mut rng = rand::rng();
+	while writer.get_num_buckets_in_use() < target {
+		let key: FileCacheKey = rng.random();
+		let val = FileCacheEntry::dummy();
+		apply_op(TestOp(key, Some(val)), writer);
+	}
+}
+
+/// Benchmarks an in-place rehash (`shuffle`) of a 5M-bucket map holding 4M entries,
+/// once for each hasher under comparison.
+fn small_benchs(c: &mut Criterion) {
+	// Number of occupied buckets to reach before measuring.
+	const IDEAL_FILLED: usize = 4_000_000;
+	// Initial number of buckets; the map is created with room to grow to twice that.
+	const SIZE: u32 = 5_000_000;
+
+	let mut group = c.benchmark_group("Small maps");
+	group.sample_size(10);
+
+	// Only the rehash itself is timed; filling the map happens once, outside `b.iter`.
+	// Default hasher of `HashMapInit`.
+	group.bench_function("small_rehash", |b| {
+		let mut writer = HashMapInit::new_resizeable(SIZE, SIZE * 2).attach_writer();
+		fill_to(&mut writer, IDEAL_FILLED);
+		b.iter(|| writer.shuffle());
+	});
+
+	// xxHash64 with a default-constructed random state.
+	group.bench_function("small_rehash_xxhash", |b| {
+		let mut writer = HashMapInit::new_resizeable(SIZE, SIZE * 2)
+			.with_hasher(twox_hash::xxhash64::RandomState::default())
+			.attach_writer();
+		fill_to(&mut writer, IDEAL_FILLED);
+		b.iter(|| writer.shuffle());
+	});
+
+	// aHash with a default-constructed random state.
+	group.bench_function("small_rehash_ahash", |b| {
+		let mut writer = HashMapInit::new_resizeable(SIZE, SIZE * 2)
+			.with_hasher(ahash::RandomState::default())
+			.attach_writer();
+		fill_to(&mut writer, IDEAL_FILLED);
+		b.iter(|| writer.shuffle());
+	});
+
+	// SeaHash with four random seeds.
+	group.bench_function("small_rehash_seahash", |b| {
+		let mut writer = HashMapInit::new_resizeable(SIZE, SIZE * 2)
+			.with_hasher(SeaRandomState::new())
+			.attach_writer();
+		fill_to(&mut writer, IDEAL_FILLED);
+		b.iter(|| writer.shuffle());
+	});
+
+	group.finish();
+}
+
+/// Benchmarks bulk insertion and in-place rehashing on 125M-bucket maps, next to hashbrown's `rehash_in_place`.
+fn real_benchs(c: &mut Criterion) {
+	let mut group = c.benchmark_group("Realistic workloads");
+	group.sample_size(10);	
+    group.bench_function("real_bulk_insert", |b| {
+		let size = 125_000_000;
+		let ideal_filled = 100_000_000;		
+		let mut rng = rand::rng();		
+		b.iter_batched(
+			|| HashMapInit::new_resizeable(size, size * 2).attach_writer(),
+			|writer| {
+				for _ in 0..ideal_filled {
+					let key: FileCacheKey = rng.random();
+					let val = FileCacheEntry::dummy();
+					let entry = writer.entry(key);
+					std::hint::black_box(match entry {
+						Entry::Occupied(mut e) => { e.insert(val); },
+						Entry::Vacant(e) => { _ = e.insert(val).unwrap(); },
+					})
+				}	
+			},
+			BatchSize::SmallInput,
+		)
+	});
+
+	group.bench_function("real_rehash", |b| {
+		let size = 125_000_000;
+		let ideal_filled = 100_000_000;		
+		let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
+		let mut rng = rand::rng();		
+		while writer.get_num_buckets_in_use() < ideal_filled {
+			let key: FileCacheKey = rng.random();
+			let val = FileCacheEntry::dummy();
+			apply_op(TestOp(key, Some(val)), &mut writer);
+		}
+		b.iter(|| writer.shuffle());
+	});
+	// Same fill level, but in a hashbrown `RawTable` rehashed with `rehash_in_place`.
+	group.bench_function("real_rehash_hashbrown", |b| {
+		let size = 125_000_000;
+		let ideal_filled = 100_000_000;
+		let mut writer = hashbrown::raw::RawTable::new();
+		let mut rng = rand::rng();
+		let hasher = rustc_hash::FxBuildHasher::default();
+		unsafe {
+			writer.resize(size, |(k,_)| hasher.hash_one(&k),
+						  hashbrown::raw::Fallibility::Infallible).unwrap();
+		}
+		while writer.len() < ideal_filled as usize {
+			let key: FileCacheKey = rng.random();
+			let val = FileCacheEntry::dummy();
+			writer.insert(hasher.hash_one(&key), (key, val), |(k,_)| hasher.hash_one(&k));
+		}
+		b.iter(|| unsafe { writer.table.rehash_in_place(
+			&|table, index| hasher.hash_one(&table.bucket::<(FileCacheKey, FileCacheEntry)>(index).as_ref().0),
+			std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
+            if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
+                Some(|ptr| std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry)))
+            } else {
+                None
+            },
+		) });
+	});
+	// Rehash at several fill levels (`elems` million entries), for both implementations.
+	for elems in [2, 4, 8, 16, 32, 64, 96, 112] {
+		group.bench_with_input(BenchmarkId::new("real_rehash_varied", elems), &elems, |b, &size| {
+			let ideal_filled = size * 1_000_000;
+			let size = 125_000_000;
+			let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
+			let mut rng = rand::rng();		
+			while writer.get_num_buckets_in_use() < ideal_filled as usize {
+				let key: FileCacheKey = rng.random();
+				let val = FileCacheEntry::dummy();
+				apply_op(TestOp(key, Some(val)), &mut writer);
+			}
+			b.iter(|| writer.shuffle());
+		});
+		group.bench_with_input(BenchmarkId::new("real_rehash_varied_hashbrown", elems), &elems, |b, &size| {
+			let ideal_filled = size * 1_000_000;
+			let size = 125_000_000;
+			let mut writer = hashbrown::raw::RawTable::new();
+			let mut rng = rand::rng();
+			let hasher = rustc_hash::FxBuildHasher::default();
+			unsafe {
+				writer.resize(size, |(k,_)| hasher.hash_one(&k),
+							  hashbrown::raw::Fallibility::Infallible).unwrap();
+			}
+			while writer.len() < ideal_filled as usize {
+				let key: FileCacheKey = rng.random();
+				let val = FileCacheEntry::dummy();
+				writer.insert(hasher.hash_one(&key), (key, val), |(k,_)| hasher.hash_one(&k));
+			}
+			b.iter(|| unsafe { writer.table.rehash_in_place(
+				&|table, index| hasher.hash_one(&table.bucket::<(FileCacheKey, FileCacheEntry)>(index).as_ref().0),
+				std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
+				if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
+					Some(|ptr| std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry)))
+				} else {
+					None
+				},
+			) });
+		});
+	}
+	group.finish();
+}
+	
+criterion_group!(benches, small_benchs, real_benchs);
+criterion_main!(benches);
--- a/libs/neon-shmem/src/hash.rs
+++ b/libs/neon-shmem/src/hash.rs
@@ -0,0 +1,533 @@
+//! Resizable hash table implementation on top of byte-level storage (either a [`ShmemHandle`] or a fixed byte array).
+//!
+//! This hash table has two major components: the bucket array and the dictionary. Each bucket within the
+//! bucket array contains a `Option<(K, V)>` and an index of another bucket. In this way there is both an 
+//! implicit freelist within the bucket array (`None` buckets point to other `None` entries) and various hash
+//! chains within the bucket array (a Some bucket will point to other Some buckets that had the same hash).
+//!
+//! Buckets are never moved unless they are within a region that is being shrunk, and so the actual hash-
+//! dependent component is done with the dictionary. When a new key is inserted into the map, a position
+//! within the dictionary is decided based on its hash, the data is inserted into an empty bucket based
+//! off of the freelist, and then the index of said bucket is placed in the dictionary.
+//!
+//! This map is resizable (if initialized on top of a [`ShmemHandle`]). Both growing and shrinking happen
+//! in-place and are at a high level achieved by expanding/reducing the bucket array and rebuilding the
+//! dictionary by rehashing all keys.
+
+use std::hash::{Hash, BuildHasher};
+use std::mem::MaybeUninit;
+
+use crate::{shmem, sync::*};
+use crate::shmem::ShmemHandle;
+
+mod core;
+pub mod entry;
+
+#[cfg(test)]
+mod tests;
+
+use core::{Bucket, CoreHashMap, INVALID_POS};
+use entry::{Entry, OccupiedEntry, VacantEntry, PrevPos};
+
+/// Builder for a [`HashMapAccess`].
+#[must_use]
+pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> {
+    shmem_handle: Option<ShmemHandle>, // `None` when placed in a fixed area (`with_fixed`)
+    shared_ptr: *mut RwLock<HashMapShared<'a, K, V>>, // start of the raw area; `attach_writer` casts it to bytes
+	shared_size: usize, // size in bytes of the area starting at `shared_ptr`
+	hasher: S, // hasher handed over to the resulting `HashMapAccess`
+	num_buckets: u32, // initial number of buckets
+}
+
+/// Accessor for a hash table.
+pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> {
+    shmem_handle: Option<ShmemHandle>, // `None` for fixed-size tables, which cannot be resized
+    shared_ptr: *mut HashMapShared<'a, K, V>, // lock-protected map, placed after the pthread rwlock in the area
+	hasher: S, // used to hash keys on lookups and rehashes
+}
+
+unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<'_, K, V, S> {}
+unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<'_, K, V, S> {}
+
+impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
+	/// Replace the hasher of the table being built, keeping every other setting.
+	///
+	/// The previously configured hasher is discarded.
+	pub fn with_hasher<T: BuildHasher>(self, hasher: T) -> HashMapInit<'a, K, V, T> {
+		// Everything except the old hasher carries over unchanged.
+		let Self { shmem_handle, shared_ptr, shared_size, num_buckets, .. } = self;
+
+		HashMapInit { shmem_handle, shared_ptr, shared_size, hasher, num_buckets }
+	}
+
+	/// Loosely (over)estimate the number of bytes needed for a hash table with `num_buckets` buckets.
+	pub fn estimate_size(num_buckets: u32) -> usize {
+        // core map data + the shared header, plus a fixed 1000-byte margin to cover alignment etc.
+        CoreHashMap::<K, V>::estimate_size(num_buckets) + size_of::<HashMapShared<K, V>>() + 1000
+    }
+
+	/// Initialize a table for writing.
+	///
+	/// Carves the lock, the shared header, the bucket array and the dictionary (in that order)
+	/// out of the memory area and initializes them. Panics if no space is left for the dictionary.
+    pub fn attach_writer(self) -> HashMapAccess<'a, K, V, S> {
+        let mut ptr: *mut u8 = self.shared_ptr.cast();
+        let end_ptr: *mut u8 = unsafe { ptr.add(self.shared_size) };
+		// carve out area for the One Big Lock (TM) and the HashMapShared.
+		ptr = unsafe { ptr.add(ptr.align_offset(align_of::<libc::pthread_rwlock_t>())) };
+		let raw_lock_ptr = ptr;
+		ptr = unsafe { ptr.add(size_of::<libc::pthread_rwlock_t>()) };
+		ptr = unsafe { ptr.add(ptr.align_offset(align_of::<HashMapShared<K, V>>())) };
+		let shared_ptr: *mut HashMapShared<K, V> = ptr.cast();
+        ptr = unsafe { ptr.add(size_of::<HashMapShared<K, V>>()) };
+        // carve out the buckets
+        ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<core::Bucket<K, V>>())) };
+        let buckets_ptr = ptr;
+        ptr = unsafe { ptr.add(size_of::<core::Bucket<K, V>>() * self.num_buckets as usize) };
+
+        // use remaining space for the dictionary
+        ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<u32>())) };
+        assert!(ptr.addr() < end_ptr.addr());
+        let dictionary_ptr = ptr;
+        let dictionary_size = unsafe { end_ptr.byte_offset_from(ptr) / size_of::<u32>() as isize };
+        assert!(dictionary_size > 0);
+
+        let buckets =
+            unsafe { std::slice::from_raw_parts_mut(buckets_ptr.cast(), self.num_buckets as usize) };
+        let dictionary = unsafe {
+            std::slice::from_raw_parts_mut(dictionary_ptr.cast(), dictionary_size as usize)
+        };
+        let hashmap = CoreHashMap::new(buckets, dictionary);
+		let lock = RwLock::from_raw(PthreadRwLock::new(raw_lock_ptr.cast()), hashmap);
+		unsafe {
+			std::ptr::write(shared_ptr, lock);
+		}
+        HashMapAccess {
+            shmem_handle: self.shmem_handle,
+            shared_ptr,
+			hasher: self.hasher,
+        }
+    }
+
+	/// Initialize a table for reading. Currently identical to [`HashMapInit::attach_writer`].
+    pub fn attach_reader(self) -> HashMapAccess<'a, K, V, S> {
+        self.attach_writer()
+    }
+}
+
+/// Hash table data that is actually stored in the shared memory area.
+///
+/// NOTE: We carve out the parts from a contiguous chunk. Growing and shrinking the hash table
+/// relies on the memory layout! The data structures are laid out in the contiguous shared memory
+/// area as follows:
+///
+/// [`libc::pthread_rwlock_t`]
+/// [`HashMapShared`]
+/// [buckets]
+/// [dictionary]
+///
+/// In between the above parts, there can be padding bytes to align the parts correctly.
+type HashMapShared<'a, K, V> = RwLock<CoreHashMap<'a, K, V>>;
+
+impl<'a, K, V> HashMapInit<'a, K, V, rustc_hash::FxBuildHasher>
+where
+	K: Clone + Hash + Eq
+{
+	/// Build a table inside a caller-provided, fixed-size memory area.
+	/// Such a table has no shared memory handle, so it cannot be grown afterwards.
+	pub fn with_fixed(num_buckets: u32, area: &'a mut [MaybeUninit<u8>]) -> Self {
+		let shared_size = area.len();
+		let shared_ptr = area.as_mut_ptr().cast();
+		Self {
+			hasher: rustc_hash::FxBuildHasher,
+			shmem_handle: None,
+			num_buckets,
+			shared_ptr,
+			shared_size,
+		}
+    }
+
+    /// Create a new hash map inside an existing shared memory area.
+	/// The area is first resized to the estimated size for `num_buckets` buckets.
+	///
+	/// # Panics
+	/// Panics if the area cannot be resized to that size.
+    pub fn with_shmem(num_buckets: u32, shmem: ShmemHandle) -> Self {
+		let shared_size = Self::estimate_size(num_buckets);
+		shmem.set_size(shared_size).expect("could not resize shared memory area");
+		let shared_ptr = shmem.data_ptr.as_ptr().cast();
+		Self {
+			hasher: rustc_hash::FxBuildHasher,
+			shmem_handle: Some(shmem),
+			num_buckets,
+			shared_ptr,
+			shared_size,
+		}
+    }
+
+	/// Make a resizable hash map within a new shared memory area with the given name.
+	/// Panics if the shared memory area cannot be created.
+	pub fn new_resizeable_named(num_buckets: u32, max_buckets: u32, name: &str) -> Self {
+		let size = Self::estimate_size(num_buckets);
+		let max_size = Self::estimate_size(max_buckets);
+		let shmem = ShmemHandle::new(name, size, max_size)
+			.expect("failed to make shared memory area");
+		Self {
+			num_buckets,
+			shared_ptr: shmem.data_ptr.as_ptr().cast(),
+			shmem_handle: Some(shmem),
+			shared_size: size,
+			hasher: rustc_hash::FxBuildHasher
+		}
+	}
+
+	/// Make a resizable hash map within a new shared memory area named `neon_shmem_hmap<N>`, with `N` from a process-wide counter.
+	pub fn new_resizeable(num_buckets: u32, max_buckets: u32) -> Self {
+		use std::sync::atomic::{AtomicUsize, Ordering};
+		static COUNTER: AtomicUsize = AtomicUsize::new(0);
+		let val = COUNTER.fetch_add(1, Ordering::Relaxed);
+		let name = format!("neon_shmem_hmap{val}");
+		Self::new_resizeable_named(num_buckets, max_buckets, &name)
+	}
+}
+
+impl<'a, K, V, S: BuildHasher> HashMapAccess<'a, K, V, S>
+where
+    K: Clone + Hash + Eq,
+{
+	/// Hash a key using the map's hasher.
+	#[inline]
+    fn get_hash_value(&self, key: &K) -> u64 {
+		self.hasher.hash_one(key)        
+    }
+
+	/// Look up `key` (whose hash is `hash`) under the write lock and return its entry.
+	/// The returned entry owns the write guard, so the map stays locked while the entry is alive.
+	fn entry_with_hash(&self, key: K, hash: u64) -> Entry<'a, '_, K, V> {
+		let mut map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
+        let dict_pos = hash as usize % map.dictionary.len();
+        let first = map.dictionary[dict_pos];
+        if first == INVALID_POS {
+            // no existing entry
+            return Entry::Vacant(VacantEntry {
+                map,
+                key,
+                dict_pos: dict_pos as u32,
+            });
+        }
+        let mut prev_pos = PrevPos::First(dict_pos as u32);
+        let mut next = first;
+        loop {
+            let bucket = &mut map.buckets[next as usize];
+            let (bucket_key, _bucket_value) = bucket.inner.as_mut().expect("entry is in use");
+            if *bucket_key == key {
+                // found existing entry
+                return Entry::Occupied(OccupiedEntry {
+                    map,
+                    _key: key,
+                    prev_pos,
+                    bucket_pos: next,
+                });
+            }
+            if bucket.next == INVALID_POS {
+                // No existing entry
+                return Entry::Vacant(VacantEntry {
+                    map,
+                    key,
+                    dict_pos: dict_pos as u32,
+                });
+            }
+            prev_pos = PrevPos::Chained(next);
+            next = bucket.next;
+        }
+	}
+	
+	/// Get a reference to the corresponding value for a key.
+    pub fn get<'e>(&'e self, key: &K) -> Option<ValueReadGuard<'e, V>> {
+		let hash = self.get_hash_value(key);
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
+		RwLockReadGuard::try_map(map, |m| m.get_with_hash(key, hash)).ok()
+    }
+
+	/// Get a reference to the entry containing a key.
+    pub fn entry(&self, key: K) -> Entry<'a, '_, K, V> {
+		let hash = self.get_hash_value(&key);
+		self.entry_with_hash(key, hash)
+    }
+
+	/// Remove a key. Returns the associated value if it existed.
+    pub fn remove(&self, key: &K) -> Option<V> {
+		let hash = self.get_hash_value(key);
+        match self.entry_with_hash(key.clone(), hash) {
+            Entry::Occupied(e) => Some(e.remove()),
+            Entry::Vacant(_) => None
+        }
+    }
+
+	/// Insert/update a key. Returns the previous associated value if it existed.
+	///
+	/// # Errors
+	/// Will return [`core::FullError`] if there is no more space left in the map.
+    pub fn insert(&self, key: K, value: V) -> Result<Option<V>, core::FullError> {
+		let hash = self.get_hash_value(&key);
+		// `key` is owned and not needed afterwards, so move it instead of cloning.
+        match self.entry_with_hash(key, hash) {
+            Entry::Occupied(mut e) => Ok(Some(e.insert(value))),
+            Entry::Vacant(e) => {
+				_ = e.insert(value)?;
+				Ok(None)
+			}
+        }
+    }
+	
+	/// Optionally return the entry for a bucket at a given index if it exists.
+	///
+	/// Has more overhead than one would intuitively expect: performs both a clone of the key
+	/// due to the [`OccupiedEntry`] type owning the key and also a hash of the key in order
+	/// to enable repairing the hash chain if the entry is removed.
+    pub fn entry_at_bucket(&self, pos: usize) -> Option<OccupiedEntry<'a, '_, K, V>> {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
+		if pos >= map.buckets.len() {
+			return None;
+		}
+
+		let entry = map.buckets[pos].inner.as_ref();
+		match entry {
+			Some((key, _)) => Some(OccupiedEntry {
+				_key: key.clone(),
+				bucket_pos: pos as u32,
+				prev_pos: entry::PrevPos::Unknown(
+					self.get_hash_value(key)
+				),
+				map,
+			}),
+			_ => None,
+		}
+    }
+
+	/// Returns the number of buckets in the table.
+    pub fn get_num_buckets(&self) -> usize {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
+        map.get_num_buckets()
+    }
+
+    /// Return the key and value stored in bucket with given index. This can be used to
+    /// iterate through the hash map.
+	// TODO: An Iterator might be nicer. The communicator's clock algorithm needs to
+	// _slowly_ iterate through all buckets with its clock hand,  without holding a lock.
+	// If we switch to an Iterator, it must not hold the lock.
+    pub fn get_at_bucket(&self, pos: usize) -> Option<ValueReadGuard<(K, V)>> {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
+        if pos >= map.buckets.len() {
+            return None;
+        }
+		RwLockReadGuard::try_map(map, |m| m.buckets[pos].inner.as_ref()).ok()
+    }
+
+	/// Returns the index of the bucket a given value corresponds to.
+    pub fn get_bucket_for_value(&self, val_ptr: *const V) -> usize {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
+        // Byte distance from the start of the bucket array, divided (rounding down) by the bucket size.
+        let origin = map.buckets.as_ptr();
+        let idx = (val_ptr as usize - origin as usize) / size_of::<Bucket<K, V>>();
+        assert!(idx < map.buckets.len());
+        // NOTE(review): `val_ptr` is presumably a pointer to a value stored in this map - confirm callers.
+        idx
+    }
+
+    /// Returns the number of occupied buckets in the table.
+    pub fn get_num_buckets_in_use(&self) -> usize {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
+        map.buckets_in_use as usize
+    }
+
+	/// Clears all entries in a table. Does not reset any shrinking operations.
+	pub fn clear(&self) {
+		let mut map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
+        map.clear();
+	}
+	
+	/// Rebuild the dictionary and the freelist by rehashing the buckets in 0..`rehash_buckets`.
+	///
+	/// Resets `inner.buckets` to the `num_buckets` buckets starting at `buckets_ptr` and
+	/// `inner.dictionary` to the space between the end of those buckets and `end_ptr`.
+	fn rehash_dict(
+		&self,
+		inner: &mut CoreHashMap<'a, K, V>,
+		buckets_ptr: *mut core::Bucket<K, V>,
+		end_ptr: *mut u8,
+		num_buckets: u32,
+		rehash_buckets: u32,
+	) {
+		inner.free_head = INVALID_POS;
+
+        let buckets;
+        let dictionary;
+        unsafe {
+            // Lay out the dictionary exactly like `HashMapInit::attach_writer` does: it starts at
+            // the first `u32`-aligned byte after the buckets and its size is measured from there.
+            let buckets_end_ptr: *mut u8 = buckets_ptr.add(num_buckets as usize).cast();
+            let dictionary_ptr: *mut u8 =
+                buckets_end_ptr.add(buckets_end_ptr.align_offset(align_of::<u32>()));
+            assert!(dictionary_ptr.addr() < end_ptr.addr());
+            let dictionary_size: usize =
+                end_ptr.byte_offset_from(dictionary_ptr) as usize / size_of::<u32>();
+            buckets = std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize);
+            dictionary = std::slice::from_raw_parts_mut(dictionary_ptr.cast::<u32>(), dictionary_size);
+        }
+        dictionary.fill(INVALID_POS);
+
+        for (i, bucket) in buckets.iter_mut().enumerate().take(rehash_buckets as usize) {
+            if bucket.inner.is_none() {
+				bucket.next = inner.free_head;
+                inner.free_head = i as u32;
+				continue;
+            }
+			let hash = self.hasher.hash_one(&bucket.inner.as_ref().unwrap().0);
+            // Must match the slot computation used by lookups (`hash as usize % len`).
+            let pos: usize = hash as usize % dictionary.len();
+            bucket.next = dictionary[pos];
+            dictionary[pos] = i as u32;
+        }
+
+        inner.dictionary = dictionary;
+        inner.buckets = buckets;
+	}
+
+	/// Rehash the map without growing or shrinking.
+	pub fn shuffle(&self) {
+		let mut map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
+		let num_buckets = map.get_num_buckets() as u32;
+		// Reuse the current dictionary's end: `self.shared_ptr` is not the start of the area.
+		let end_ptr: *mut u8 = map.dictionary.as_mut_ptr_range().end.cast();
+        let buckets_ptr = map.buckets.as_mut_ptr();
+		self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, num_buckets);
+	}
+
+    /// Grow the number of buckets within the table.
+    ///
+    /// 1. Grows the underlying shared memory area
+    /// 2. Initializes new buckets and overwrites the current dictionary
+    /// 3. Rehashes the dictionary
+	///
+	/// # Panics
+	/// Panics if `num_buckets` is below the current size, or when growing a map made by [`HashMapInit::with_fixed`].
+	///
+	/// # Errors
+	/// Returns an [`shmem::Error`] if any errors occur resizing the memory region.
+    pub fn grow(&self, num_buckets: u32) -> Result<(), shmem::Error> {
+        let mut map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
+        let old_num_buckets = map.buckets.len() as u32;
+
+        assert!(num_buckets >= old_num_buckets, "grow called with a smaller number of buckets");
+        if num_buckets == old_num_buckets {
+            return Ok(());
+        }
+        let shmem_handle = self
+            .shmem_handle
+            .as_ref()
+            .expect("grow called on a fixed-size hash table");
+
+        let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
+        shmem_handle.set_size(size_bytes)?;
+        let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
+
+        // Initialize new buckets and chain them together.
+		// NB: This overwrites the dictionary!
+        let buckets_ptr = map.buckets.as_mut_ptr();
+        unsafe {
+            for i in old_num_buckets..num_buckets {
+                let bucket = buckets_ptr.add(i as usize);
+                bucket.write(core::Bucket {
+                    // The tail of the chain is linked up after the rehash below.
+                    next: if i < num_buckets - 1 { i + 1 } else { INVALID_POS },
+                    inner: None,
+                });
+            }
+        }
+
+		// `rehash_dict` rebuilds the freelist of the old region from scratch, so the new buckets
+		// must be spliced in front of the rebuilt list; linking to the stale head would leak buckets.
+		self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, old_num_buckets);
+		let rebuilt_free_head = map.free_head;
+		map.buckets[num_buckets as usize - 1].next = rebuilt_free_head;
+        map.free_head = old_num_buckets;
+        Ok(())
+    }
+
+	/// Begin a shrink, limiting all new allocations to be in buckets with index below `num_buckets`.
+	///
+	/// # Panics
+	/// Panics if called on a map initialized with [`HashMapInit::with_fixed`] or if `num_buckets` is
+	/// greater than the number of buckets in the map.
+	pub fn begin_shrink(&mut self, num_buckets: u32) {
+		let mut map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
+		assert!(
+			num_buckets <= map.get_num_buckets() as u32,
+            "shrink called with a larger number of buckets"
+        );
+		_ = self
+            .shmem_handle
+            .as_ref()
+            .expect("shrink called on a fixed-size hash table");
+		map.alloc_limit = num_buckets;
+	}
+
+	/// If a shrink operation is underway, returns the target size of the map. Otherwise, returns None.
+	pub fn shrink_goal(&self) -> Option<usize> {
+		let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
+        let goal = map.alloc_limit;
+		if goal == INVALID_POS { None } else { Some(goal as usize) }
+	}
+	
+	/// Complete a shrink after caller has evicted entries, removing the unused buckets and rehashing.
+	///
+	/// # Panics
+	/// The following cases result in a panic:
+	/// - Calling this function on a map initialized with [`HashMapInit::with_fixed`].
+	/// - Calling this function on a map when no shrink operation is in progress.
+	/// - Calling this function while more buckets are in use than the value returned by
+	///   [`HashMapAccess::shrink_goal`].
+	///
+	/// # Errors
+	/// Returns an [`shmem::Error`] if any errors occur resizing the memory region.
+	pub fn finish_shrink(&self) -> Result<(), shmem::Error> {
+		let mut map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
+		assert!(
+			map.alloc_limit != INVALID_POS,
+			"called finish_shrink when no shrink is in progress"
+		);
+
+		let num_buckets = map.alloc_limit;
+		if map.get_num_buckets() == num_buckets as usize {
+			// Already at the target size, but the shrink is over: lift the allocation limit.
+			map.alloc_limit = INVALID_POS;
+            return Ok(());
+        }
+
+		assert!(
+			map.buckets_in_use <= num_buckets,
+			"called finish_shrink before enough entries were removed"
+		);
+
+		for i in (num_buckets as usize)..map.buckets.len() {
+			if let Some((k, v)) = map.buckets[i].inner.take() {
+				// alloc_bucket increases count, so need to decrease since we're just moving
+				map.buckets_in_use -= 1;
+				map.alloc_bucket(k, v).unwrap();
+			}
+		}
+
+        let shmem_handle = self
+            .shmem_handle
+            .as_ref()
+            .expect("shrink called on a fixed-size hash table");
+
+		let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
+        shmem_handle.set_size(size_bytes)?;
+        let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
+		let buckets_ptr = map.buckets.as_mut_ptr();
+		self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, num_buckets);
+		map.alloc_limit = INVALID_POS;
+		Ok(())
+	}
+}
--- a/libs/neon-shmem/src/hash/core.rs
+++ b/libs/neon-shmem/src/hash/core.rs
@@ -0,0 +1,177 @@
+//! Simple hash table with chaining.
+
+use std::hash::Hash;
+use std::mem::MaybeUninit;
+
+use crate::hash::entry::*;
+
+/// Invalid position within the map (either within the dictionary or bucket array).
+pub(crate) const INVALID_POS: u32 = u32::MAX;
+
+/// Fundamental storage unit within the hash table. Either empty or contains a key-value pair.
+/// Always part of a chain of some kind (either a freelist if empty or a hash chain if full).
+pub(crate) struct Bucket<K, V> {
+	/// Index of next bucket in the chain.
+	pub(crate) next: u32,
+	/// Key-value pair contained within bucket.
+    pub(crate) inner: Option<(K, V)>,
+}
+
+/// Core hash table implementation.
+pub(crate) struct CoreHashMap<'a, K, V> {
+	/// Dictionary used to map hashes to bucket indices.
+    pub(crate) dictionary: &'a mut [u32],
+	/// Buckets containing key-value pairs.
+    pub(crate) buckets: &'a mut [Bucket<K, V>],
+	/// Head of the freelist.
+    pub(crate) free_head: u32,
+	/// Maximum index of a bucket allowed to be allocated. [`INVALID_POS`] if no limit.
+	pub(crate) alloc_limit: u32,
+    /// The number of currently occupied buckets.
+    pub(crate) buckets_in_use: u32,
+	// pub(crate) lock: libc::pthread_mutex_t,
+	// Unclear what the purpose of this is.
+    pub(crate) _user_list_head: u32,
+}
+
+/// Error for when there are no empty buckets left but one is needed.
+#[derive(Debug, PartialEq)]
+pub struct FullError();
+
+impl<'a, K: Clone + Hash + Eq, V> CoreHashMap<'a, K, V> {
+    const FILL_FACTOR: f32 = 0.60;
+
+	/// Estimate the size of data contained within the hash map.
+    pub fn estimate_size(num_buckets: u32) -> usize {
+        let mut size = 0;
+
+        // buckets
+        size += size_of::<Bucket<K, V>>() * num_buckets as usize;
+
+        // dictionary: one `u32` slot per bucket, scaled up by 1 / FILL_FACTOR
+        size += (f32::ceil((size_of::<u32>() * num_buckets as usize) as f32 / Self::FILL_FACTOR))
+            as usize;
+
+        size
+    }	
+
+    /// Create a map over the given (uninitialized) bucket and dictionary storage.
+    ///
+    /// All buckets start out empty and chained into the freelist in index order; every
+    /// dictionary slot starts out as [`INVALID_POS`].
+    ///
+    /// Nothing is in use yet and no allocation limit is set.
+    pub fn new(
+        buckets: &'a mut [MaybeUninit<Bucket<K, V>>],
+        dictionary: &'a mut [MaybeUninit<u32>],
+    ) -> Self {
+        // Thread the buckets into one freelist: 0 -> 1 -> ... -> last -> INVALID_POS.
+        let num_buckets = buckets.len();
+        for (i, slot) in buckets.iter_mut().enumerate() {
+            let next = if i + 1 < num_buckets { i as u32 + 1 } else { INVALID_POS };
+            slot.write(Bucket { next, inner: None });
+        }
+
+        // Mark every dictionary slot as empty.
+        dictionary.iter_mut().for_each(|slot| {
+            slot.write(INVALID_POS);
+        });
+
+        // TODO: use std::slice::assume_init_mut() once it stabilizes
+        let (buckets_ptr, buckets_len) = (buckets.as_mut_ptr(), buckets.len());
+        let (dict_ptr, dict_len) = (dictionary.as_mut_ptr(), dictionary.len());
+        let buckets = unsafe { std::slice::from_raw_parts_mut(buckets_ptr.cast(), buckets_len) };
+        let dictionary = unsafe { std::slice::from_raw_parts_mut(dict_ptr.cast(), dict_len) };
+
+        Self {
+            dictionary,
+            buckets,
+            free_head: 0,
+            alloc_limit: INVALID_POS,
+            buckets_in_use: 0,
+            _user_list_head: INVALID_POS,
+        }
+    }
+
+	/// Get the value associated with a key (if it exists) given its hash.
+	///
+	/// Walks the chain that starts at the dictionary slot selected by `hash` and returns the
+	/// value of the first bucket whose key equals `key`.
+    pub fn get_with_hash(&self, key: &K, hash: u64) -> Option<&V> {
+        let mut pos = self.dictionary[hash as usize % self.dictionary.len()];
+        while pos != INVALID_POS {
+            let bucket = &self.buckets[pos as usize];
+            let (bucket_key, bucket_value) = bucket.inner.as_ref().expect("entry is in use");
+            if bucket_key == key {
+                return Some(bucket_value);
+            }
+            pos = bucket.next;
+        }
+        None
+    }
+
+	/// Get number of buckets in map.
+    pub fn get_num_buckets(&self) -> usize {
+        self.buckets.len()
+    }
+
+	/// Clears all entries from the hashmap.
+	///
+	/// Every bucket is emptied and returned to the freelist, and every dictionary slot is reset.
+	/// Does not reset any allocation limits, but does clear any entries beyond them.
+	pub fn clear(&mut self) {
+		// Empty every bucket and rebuild the freelist in index order.
+		let num_buckets = self.buckets.len();
+		for (i, bucket) in self.buckets.iter_mut().enumerate() {
+			let next = if i + 1 < num_buckets { i as u32 + 1 } else { INVALID_POS };
+			*bucket = Bucket { next, inner: None };
+		}
+
+		// Forget all hash chains.
+		for slot in self.dictionary.iter_mut() {
+			*slot = INVALID_POS;
+		}
+
+		// The freelist now starts at bucket 0 and nothing is in use.
+		self.free_head = 0;
+		self.buckets_in_use = 0;
+	}
+
+	/// Find the position of an unused bucket via the freelist and initialize it.
+	///
+	/// Returns the index of the bucket now holding `(key, value)`, or [`FullError`] if the
+	/// freelist contains no bucket below `alloc_limit`. The dictionary is not touched.
+    pub(crate) fn alloc_bucket(&mut self, key: K, value: V) -> Result<u32, FullError> {
+        let mut pos = self.free_head;
+		// Find the first bucket we're *allowed* to use (below `alloc_limit`), tracking its predecessor.
+		let mut prev = PrevPos::First(self.free_head);
+		while pos != INVALID_POS && pos >= self.alloc_limit {
+			let bucket = &mut self.buckets[pos as usize];
+			prev = PrevPos::Chained(pos);
+			pos = bucket.next;
+		}
+		if pos == INVALID_POS {
+			return Err(FullError());
+		}
+
+		// Repair the freelist: unlink `pos` from the head or from its predecessor.
+		match prev {
+			PrevPos::First(_) => {
+				let next_pos = self.buckets[pos as usize].next;
+				self.free_head = next_pos;				
+			}
+			PrevPos::Chained(p) => if p != INVALID_POS {
+				let next_pos = self.buckets[pos as usize].next;
+				self.buckets[p as usize].next = next_pos;
+			},
+			_ => unreachable!()
+		}
+		// Initialize the bucket; it starts out as the end of a chain.
+		let bucket = &mut self.buckets[pos as usize];
+		self.buckets_in_use += 1;
+        bucket.next = INVALID_POS;
+        bucket.inner = Some((key, value));
+        Ok(pos)
+    }
+}
+
--- a/libs/neon-shmem/src/hash/entry.rs
+++ b/libs/neon-shmem/src/hash/entry.rs
@@ -0,0 +1,139 @@
+//! Equivalent of [`std::collections::hash_map::Entry`] for this hashmap.
+
+use crate::hash::core::{CoreHashMap, FullError, INVALID_POS};
+use crate::sync::{RwLockWriteGuard, ValueWriteGuard};
+
+use std::hash::Hash;
+use std::mem;
+
+
+/// A view into the map for a single key, which is either already present
+/// ([`Entry::Occupied`]) or absent ([`Entry::Vacant`]).
+pub enum Entry<'a, 'b, K, V> {
+	Occupied(OccupiedEntry<'a, 'b, K, V>),
+    Vacant(VacantEntry<'a, 'b, K, V>),
+}
+
+/// Enum representing the previous position within a chain, i.e. the slot whose link points at
+/// a given bucket.
+#[derive(Clone, Copy)]
+pub(crate) enum PrevPos {
+	/// The bucket is the head of its chain; the payload is the index within the dictionary.
+    First(u32),
+	/// The bucket follows another bucket; the payload is that bucket's index within the buckets.
+    Chained(u32),
+	/// Unknown - e.g. the associated entry was retrieved by index instead of chain.
+	/// The payload is the key's hash, from which the dictionary slot can be recomputed.
+	Unknown(u64),
+}
+
+/// A view into an occupied entry within the map. Holds the map's write guard for as long as
+/// the entry is alive.
+pub struct OccupiedEntry<'a, 'b, K, V> {
+	/// Write guard over the map containing this entry.
+	pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
+	/// The key of the occupied entry
+    pub(crate) _key: K,
+	/// The position of the previous entry in the chain (see [`PrevPos`]).
+    pub(crate) prev_pos: PrevPos,
+	/// The position of the bucket in the [`CoreHashMap`] bucket array.
+    pub(crate) bucket_pos: u32,
+}
+
+impl<K, V> OccupiedEntry<'_, '_, K, V> {
+    /// Returns a shared reference to the value stored in this entry.
+    pub fn get(&self) -> &V {
+        &self.map.buckets[self.bucket_pos as usize]
+            .inner
+            .as_ref()
+            .unwrap()
+            .1
+    }
+
+    /// Returns a mutable reference to the value stored in this entry.
+    pub fn get_mut(&mut self) -> &mut V {
+        &mut self.map.buckets[self.bucket_pos as usize]
+            .inner
+            .as_mut()
+            .unwrap()
+            .1
+    }
+
+    /// Inserts a value into the entry, replacing (and returning) the existing value.
+    pub fn insert(&mut self, value: V) -> V {
+        let bucket = &mut self.map.buckets[self.bucket_pos as usize];
+        // This assumes inner is Some, which it must be for an OccupiedEntry
+        mem::replace(&mut bucket.inner.as_mut().unwrap().1, value)
+    }
+
+    /// Removes the entry from the hash map, returning the value originally stored within it.
+    ///
+    /// This may result in multiple bucket accesses if the entry was obtained by index
+    /// (`prev_pos` is [`PrevPos::Unknown`]), as the previous chain entry then has to be
+    /// discovered by walking the chain from its dictionary slot.
+    ///
+    /// # Panics
+    /// Panics if the bucket cannot be found on the chain selected by the hash carried in
+    /// [`PrevPos::Unknown`], or if the bucket holds no value.
+    pub fn remove(mut self) -> V {
+        let next = self.map.buckets[self.bucket_pos as usize].next;
+
+        // If this bucket was queried by index, go ahead and follow its chain from the start.
+        let prev = if let PrevPos::Unknown(hash) = self.prev_pos {
+            let dict_idx = hash as usize % self.map.dictionary.len();
+            let mut prev = PrevPos::First(dict_idx as u32);
+            let mut curr = self.map.dictionary[dict_idx];
+            while curr != self.bucket_pos {
+                assert!(curr != INVALID_POS, "entry is not on its hash chain");
+                // Record the predecessor *before* advancing, so that `prev` ends up naming
+                // the bucket that links to `bucket_pos` rather than `bucket_pos` itself.
+                prev = PrevPos::Chained(curr);
+                curr = self.map.buckets[curr as usize].next;
+            }
+            prev
+        } else {
+            self.prev_pos
+        };
+
+        // Unlink it from the chain.
+        match prev {
+            PrevPos::First(dict_pos) => {
+                self.map.dictionary[dict_pos as usize] = next;
+            }
+            PrevPos::Chained(prev_bucket) => {
+                self.map.buckets[prev_bucket as usize].next = next;
+            }
+            PrevPos::Unknown(_) => unreachable!(),
+        }
+
+        // And add it to the freelist.
+        let free = self.map.free_head;
+        let bucket = &mut self.map.buckets[self.bucket_pos as usize];
+        let old_value = bucket.inner.take();
+        bucket.next = free;
+        self.map.free_head = self.bucket_pos;
+        self.map.buckets_in_use -= 1;
+
+        old_value.unwrap().1
+    }
+}
+
+/// An abstract view into a vacant entry within the map. Holds the map's write guard for as
+/// long as the entry is alive.
+pub struct VacantEntry<'a, 'b, K, V> {
+	/// Write guard over the map containing this entry.
+	pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
+	/// The key to be inserted into this entry.
+    pub(crate) key: K,
+	/// The position within the dictionary corresponding to the key's hash.
+    pub(crate) dict_pos: u32,
+}
+
+impl<'b, K: Clone + Hash + Eq, V> VacantEntry<'_, 'b, K, V> {
+    /// Insert a value into the vacant entry, finding and populating an empty bucket in the process.
+    ///
+    /// The new bucket becomes the head of the chain for `dict_pos`. On success the map's write
+    /// guard is narrowed to a guard over the freshly stored value.
+    ///
+    /// # Errors
+    /// Will return [`FullError`] if there are no unoccupied buckets in the map.
+    pub fn insert(mut self, value: V) -> Result<ValueWriteGuard<'b, V>, FullError> {
+        let pos = self.map.alloc_bucket(self.key, value)?;
+        // Defensive: `alloc_bucket` is expected to report a full map through `Err` already.
+        if pos == INVALID_POS {
+            return Err(FullError());
+        }
+
+        // Link the new bucket in front of whatever the dictionary slot pointed at before.
+        let slot = self.dict_pos as usize;
+        let old_head = self.map.dictionary[slot];
+        self.map.buckets[pos as usize].next = old_head;
+        self.map.dictionary[slot] = pos;
+
+        Ok(RwLockWriteGuard::map(self.map, |m| {
+            &mut m.buckets[pos as usize].inner.as_mut().unwrap().1
+        }))
+    }
+}
--- a/libs/neon-shmem/src/hash/tests.rs
+++ b/libs/neon-shmem/src/hash/tests.rs
@@ -0,0 +1,426 @@
+use std::collections::BTreeMap;
+use std::collections::HashSet;
+use std::fmt::Debug;
+use std::mem::MaybeUninit;
+
+use crate::hash::HashMapAccess;
+use crate::hash::HashMapInit;
+use crate::hash::Entry;
+use crate::hash::core::FullError;
+
+use rand::seq::SliceRandom;
+use rand::{Rng, RngCore};
+use rand_distr::Zipf;
+
+/// Length in bytes of a [`TestKey`].
+const TEST_KEY_LEN: usize = 16;
+
+/// Fixed-size byte-array key used by the tests in this module.
+#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+struct TestKey([u8; TEST_KEY_LEN]);
+
+/// Interprets the key's bytes as a big-endian `u128`.
+impl From<&TestKey> for u128 {
+    fn from(val: &TestKey) -> u128 {
+        u128::from_be_bytes(val.0)
+    }
+}
+
+/// Builds a key from the big-endian bytes of a `u128`.
+impl From<u128> for TestKey {
+    fn from(val: u128) -> TestKey {
+        TestKey(val.to_be_bytes())
+    }
+}
+
+/// Builds a key from a byte slice; panics unless the slice is exactly `TEST_KEY_LEN` bytes long.
+impl<'a> From<&'a [u8]> for TestKey {
+    fn from(bytes: &'a [u8]) -> TestKey {
+        TestKey(bytes.try_into().unwrap())
+    }
+}
+
+/// Stores every key in `keys` (mapped to its slice index) in a fresh map, then checks that
+/// each key reads back the index it was stored with.
+fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
+    let w = HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, "test_inserts")
+        .attach_writer();
+
+    // Populate: overwrite if the key is already present, otherwise allocate a new bucket.
+    for (idx, k) in keys.iter().enumerate() {
+        match w.entry((*k).into()) {
+            Entry::Occupied(mut e) => {
+                e.insert(idx);
+            }
+            Entry::Vacant(e) => {
+                assert!(e.insert(idx).is_ok());
+            }
+        }
+    }
+
+    // Verify.
+    for (idx, k) in keys.iter().enumerate() {
+        let x = w.get(&(*k).into());
+        let value = x.as_deref().copied();
+        assert_eq!(value, Some(idx));
+    }
+}
+
+#[test]
+fn dense() {
+    // Smoke test with a handful of small keys.
+    // NOTE(review): this comment used to mention "splitting a node with prefix", which reads
+    // like a leftover from a tree-based map's tests - confirm nothing tree-specific is meant.
+    let keys: &[u128] = &[0, 1, 2, 3, 256];
+    test_inserts(keys);
+
+    // Dense keys
+    let mut keys: Vec<u128> = (0..10000).collect();
+    test_inserts(&keys);
+
+    // Do the same in random orders
+    for _ in 1..10 {
+        keys.shuffle(&mut rand::rng());
+        test_inserts(&keys);
+    }
+}
+
+#[test]
+fn sparse() {
+    // 10000 distinct random keys: `HashSet::insert` returns false for a key that was drawn
+    // before, in which case we simply draw again.
+    let mut used_keys = HashSet::new();
+    let mut keys: Vec<TestKey> = Vec::new();
+    while keys.len() < 10000 {
+        let key = rand::random::<u128>();
+        if used_keys.insert(key) {
+            keys.push(key.into());
+        }
+    }
+    test_inserts(&keys);
+}
+
+/// One operation against the map under test: `TestOp(key, Some(v))` stores `v` under `key`,
+/// `TestOp(key, None)` removes `key`.
+#[derive(Clone, Debug)]
+struct TestOp(TestKey, Option<usize>);
+
+/// Applies `op` to both the shadow `BTreeMap` and the hash map, and asserts that both report
+/// the same previous value for the key.
+fn apply_op(
+    op: &TestOp,
+    map: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    let TestOp(key, new_value) = op;
+
+    // apply the change to the shadow tree first
+    let shadow_existing = match new_value {
+        Some(v) => shadow.insert(*key, *v),
+        None => shadow.remove(key),
+    };
+
+    // then to the hash map, through the entry API
+    let hash_existing = match (*new_value, map.entry(*key)) {
+        (Some(new), Entry::Occupied(mut e)) => Some(e.insert(new)),
+        (Some(new), Entry::Vacant(e)) => {
+            _ = e.insert(new).unwrap();
+            None
+        }
+        (None, Entry::Occupied(e)) => Some(e.remove()),
+        (None, Entry::Vacant(_)) => None,
+    };
+
+    assert_eq!(shadow_existing, hash_existing);
+}
+
+/// Applies `num_ops` random operations to `writer` and `shadow`, on keys taken from
+/// `rng.next_u32() % size`.
+///
+/// NOTE: despite its name, `del_prob` is the probability that an operation *stores* a value
+/// (`random_bool(del_prob)` selects `Some(i)`); all other operations are removals.
+fn do_random_ops(
+    num_ops: usize,
+    size: u32,
+    del_prob: f64,
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+    rng: &mut rand::rngs::ThreadRng,
+) {
+    for i in 0..num_ops {
+        let key: TestKey = ((rng.next_u32() % size) as u128).into();
+        let value = rng.random_bool(del_prob).then_some(i);
+        apply_op(&TestOp(key, value), writer, shadow);
+    }
+}
+
+/// Removes the `num_ops` smallest keys of `shadow` from both `shadow` and `writer`.
+/// Panics if `shadow` runs out of keys first.
+fn do_deletes(
+    num_ops: usize,
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    for _ in 0..num_ops {
+        let smallest = shadow.pop_first().unwrap().0;
+        writer.remove(&smallest);
+    }
+}
+
+/// Shrinks the map to `to` buckets: begins the shrink, evicts the smallest shadow keys until
+/// at most `to` buckets are in use, then finishes the shrink and checks that finishing did
+/// not change the number of buckets in use.
+fn do_shrink(
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+    to: u32,
+) {
+    let goal = to as usize;
+
+    assert!(writer.shrink_goal().is_none());
+    writer.begin_shrink(to);
+    assert_eq!(writer.shrink_goal(), Some(goal));
+
+    while writer.get_num_buckets_in_use() > goal {
+        let (k, _) = shadow.pop_first().unwrap();
+        if let Entry::Occupied(e) = writer.entry(k) {
+            e.remove();
+        }
+    }
+
+    let in_use_before = writer.get_num_buckets_in_use();
+    writer.finish_shrink().unwrap();
+    assert!(writer.shrink_goal().is_none());
+    assert_eq!(writer.get_num_buckets_in_use(), in_use_before);
+}
+
+/// Runs 100000 operations on Zipf-distributed keys, cross-checking every one against a
+/// shadow `BTreeMap`.
+#[test]
+fn random_ops() {
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, "test_random")
+            .attach_writer();
+    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
+
+    let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
+    let mut rng = rand::rng();
+    for i in 0..100000 {
+        let key: TestKey = (rng.sample(distribution) as u128).into();
+        let value = if rng.random_bool(0.75) { Some(i) } else { None };
+        apply_op(&TestOp(key, value), &mut writer, &mut shadow);
+    }
+}
+
+
+/// Random operations must keep agreeing with the shadow map across a `shuffle()` call.
+#[test]
+fn test_shuffle() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, "test_shuf")
+        .attach_writer();
+    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
+    writer.shuffle();
+    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
+}
+
+/// Growing must change the bucket count but not the number of buckets in use, and the map
+/// must keep agreeing with the shadow map afterwards.
+#[test]
+fn test_grow() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 2000, "test_grow")
+        .attach_writer();
+    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
+
+    let in_use_before = writer.get_num_buckets_in_use();
+    writer.grow(1500).unwrap();
+    assert_eq!(writer.get_num_buckets_in_use(), in_use_before);
+    assert_eq!(writer.get_num_buckets(), 1500);
+
+    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+}
+
+/// `clear()` must drop every entry, keep the bucket count, and leave the map fully usable:
+/// it can be refilled to capacity, reports `FullError` when full, and accepts inserts again
+/// after a second `clear()`.
+#[test]
+fn test_clear() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_clear")
+        .attach_writer();
+    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+    writer.clear();
+    assert_eq!(writer.get_num_buckets_in_use(), 0);
+    assert_eq!(writer.get_num_buckets(), 1500);
+    // Draining the shadow map here also resets it for the next round of random operations.
+    while let Some((key, _)) = shadow.pop_first() {
+        assert!(writer.get(&key).is_none());
+    }
+
+    do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+    // Fill every remaining bucket with keys outside the range used by the random operations.
+    let remaining = 1500 - writer.get_num_buckets_in_use();
+    for i in 0..remaining {
+        writer.insert((1500 + i as u128).into(), 0).unwrap();
+    }
+    assert_eq!(writer.insert(5000.into(), 0), Err(FullError {}));
+
+    writer.clear();
+    assert!(writer.insert(5000.into(), 0).is_ok());
+}
+
+/// Removing entries obtained by bucket index must leave every other entry readable.
+#[test]
+fn test_idx_remove() {
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_idx_remove")
+            .attach_writer();
+    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+    do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
+
+    // Remove whatever happens to live in 100 randomly chosen buckets, mirroring each removal
+    // in the shadow map.
+    for _ in 0..100 {
+        let idx = (rng.next_u32() % 1500) as usize;
+        if let Some(e) = writer.entry_at_bucket(idx) {
+            shadow.remove(&e._key);
+            e.remove();
+        }
+    }
+
+    // Everything the shadow map still holds must be readable from the hash map.
+    while let Some((key, val)) = shadow.pop_first() {
+        assert_eq!(*writer.get(&key).unwrap(), val);
+    }
+}
+
+/// For entries fetched by bucket index, mapping the value's address back to a bucket index
+/// must return the index the entry was fetched from.
+#[test]
+fn test_idx_get() {
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_idx_get")
+            .attach_writer();
+    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+    do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
+
+    for _ in 0..100 {
+        let idx = (rng.next_u32() % 1500) as usize;
+        if let Some(pair) = writer.get_at_bucket(idx) {
+            let v: *const usize = &pair.1;
+            assert_eq!(writer.get_bucket_for_value(v), idx);
+        }
+    }
+}
+
+/// Shrinks a populated map from 1500 to 1000 buckets and keeps exercising it afterwards.
+#[test]
+fn test_shrink() {
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink")
+            .attach_writer();
+    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+    do_shrink(&mut writer, &mut shadow, 1000);
+    assert_eq!(writer.get_num_buckets(), 1000);
+
+    do_deletes(500, &mut writer, &mut shadow);
+    do_random_ops(10000, 500, 0.75, &mut writer, &mut shadow, &mut rng);
+    assert!(writer.get_num_buckets_in_use() <= 1000);
+}
+
+/// Alternates shrinking and growing, with random operations between the resizes.
+#[test]
+fn test_shrink_grow_seq() {
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 20000, "test_grow_seq")
+            .attach_writer();
+    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(500, 1000, 0.1, &mut writer, &mut shadow, &mut rng);
+
+    eprintln!("Shrinking to 750");
+    do_shrink(&mut writer, &mut shadow, 750);
+    do_random_ops(200, 1000, 0.5, &mut writer, &mut shadow, &mut rng);
+
+    eprintln!("Growing to 1500");
+    writer.grow(1500).unwrap();
+    do_random_ops(600, 1500, 0.1, &mut writer, &mut shadow, &mut rng);
+
+    eprintln!("Shrinking to 200");
+    // Trim the shadow map (and the hash map with it) down to 100 keys before shrinking.
+    while shadow.len() > 100 {
+        do_deletes(1, &mut writer, &mut shadow);
+    }
+    do_shrink(&mut writer, &mut shadow, 200);
+    do_random_ops(50, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
+
+    eprintln!("Growing to 10k");
+    writer.grow(10000).unwrap();
+    do_random_ops(10000, 5000, 0.25, &mut writer, &mut shadow, &mut rng);
+}
+
+/// Exercises key-based and bucket-index-based accessors on a map holding a single entry.
+#[test]
+fn test_bucket_ops() {
+    let writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, "test_bucket_ops")
+            .attach_writer();
+
+    match writer.entry(1.into()) {
+        Entry::Occupied(mut e) => {
+            e.insert(2);
+        }
+        Entry::Vacant(e) => {
+            _ = e.insert(2).unwrap();
+        }
+    }
+    assert_eq!(writer.get_num_buckets_in_use(), 1);
+    assert_eq!(writer.get_num_buckets(), 1000);
+    assert_eq!(*writer.get(&1.into()).unwrap(), 2);
+
+    // Look the key up again to learn which bucket it landed in.
+    let pos = match writer.entry(1.into()) {
+        Entry::Occupied(e) => {
+            assert_eq!(e._key, 1.into());
+            e.bucket_pos as usize
+        }
+        Entry::Vacant(_) => {
+            panic!("Insert didn't affect entry");
+        }
+    };
+
+    // The same entry must be reachable through its bucket index.
+    assert_eq!(writer.entry_at_bucket(pos).unwrap()._key, 1.into());
+    assert_eq!(*writer.get_at_bucket(pos).unwrap(), (1.into(), 2));
+    {
+        let ptr: *const usize = &*writer.get(&1.into()).unwrap();
+        assert_eq!(writer.get_bucket_for_value(ptr), pos);
+    }
+
+    writer.remove(&1.into());
+    assert!(writer.get(&1.into()).is_none());
+}
+
+/// Shrinking to zero buckets must make inserts fail; growing again must make them succeed.
+#[test]
+fn test_shrink_zero() {
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink_zero")
+            .attach_writer();
+
+    writer.begin_shrink(0);
+    for i in 0..1500 {
+        if let Some(e) = writer.entry_at_bucket(i) {
+            e.remove();
+        }
+    }
+    writer.finish_shrink().unwrap();
+    assert_eq!(writer.get_num_buckets_in_use(), 0);
+
+    match writer.entry(1.into()) {
+        Entry::Vacant(v) => assert!(v.insert(2).is_err()),
+        Entry::Occupied(_) => panic!("Somehow got non-vacant entry in empty map."),
+    }
+
+    writer.grow(50).unwrap();
+    match writer.entry(1.into()) {
+        Entry::Vacant(v) => assert!(v.insert(2).is_ok()),
+        Entry::Occupied(_) => panic!("Somehow got non-vacant entry in empty map."),
+    }
+    assert_eq!(writer.get_num_buckets_in_use(), 1);
+}
+
+/// Expected to panic: asks to grow to 20000 buckets on a map created with a maximum of 2000.
+#[test]
+#[should_panic]
+fn test_grow_oom() {
+    let writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_grow_oom")
+            .attach_writer();
+    writer.grow(20000).unwrap();
+}
+
+/// Expected to panic: the shrink target (2000) is larger than the initial bucket count (1500).
+#[test]
+#[should_panic]
+fn test_shrink_bigger() {
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2500, "test_shrink_bigger")
+            .attach_writer();
+    writer.begin_shrink(2000);
+}
+
+/// Expected to panic: `finish_shrink` is called without a preceding `begin_shrink`.
+#[test]
+#[should_panic]
+fn test_shrink_early_finish() {
+    let writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
+        1500,
+        2500,
+        "test_shrink_early_finish",
+    )
+    .attach_writer();
+    writer.finish_shrink().unwrap();
+}
+
+/// Expected to panic: `begin_shrink` is called on a map created with `with_fixed` over a
+/// caller-provided buffer.
+#[test]
+#[should_panic]
+fn test_shrink_fixed_size() {
+    let mut area = [MaybeUninit::uninit(); 10000];
+    let mut writer = HashMapInit::<TestKey, usize>::with_fixed(3, &mut area).attach_writer();
+    writer.begin_shrink(1);
+}
--- a/libs/neon-shmem/src/lib.rs
+++ b/libs/neon-shmem/src/lib.rs
@@ -1,418 +1,5 @@
 //! Shared memory utilities for neon communicator

-use std::num::NonZeroUsize;
-use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
-use std::ptr::NonNull;
-use std::sync::atomic::{AtomicUsize, Ordering};
-
-use nix::errno::Errno;
-use nix::sys::mman::MapFlags;
-use nix::sys::mman::ProtFlags;
-use nix::sys::mman::mmap as nix_mmap;
-use nix::sys::mman::munmap as nix_munmap;
-use nix::unistd::ftruncate as nix_ftruncate;
-
-/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
-/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
-/// specified at creation.
-///
-/// The area is backed by an anonymous file created with memfd_create(). The full address space for
-/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
-/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
-/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
-/// future.
-pub struct ShmemHandle {
-    /// memfd file descriptor
-    fd: OwnedFd,
-
-    max_size: usize,
-
-    // Pointer to the beginning of the shared memory area. The header is stored there.
-    shared_ptr: NonNull<SharedStruct>,
-
-    // Pointer to the beginning of the user data
-    pub data_ptr: NonNull<u8>,
-}
-
-/// This is stored at the beginning in the shared memory area.
-struct SharedStruct {
-    max_size: usize,
-
-    /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
-    current_size: AtomicUsize,
-}
-
-const RESIZE_IN_PROGRESS: usize = 1 << 63;
-
-const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
-
-/// Error type returned by the ShmemHandle functions.
-#[derive(thiserror::Error, Debug)]
-#[error("{msg}: {errno}")]
-pub struct Error {
-    pub msg: String,
-    pub errno: Errno,
-}
-
-impl Error {
-    fn new(msg: &str, errno: Errno) -> Error {
-        Error {
-            msg: msg.to_string(),
-            errno,
-        }
-    }
-}
-
-impl ShmemHandle {
-    /// Create a new shared memory area. To communicate between processes, the processes need to be
-    /// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
-    ///
-    /// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
-    /// processes can continue using it, however.
-    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
-        // create the backing anonymous file.
-        let fd = create_backing_file(name)?;
-
-        Self::new_with_fd(fd, initial_size, max_size)
-    }
-
-    fn new_with_fd(
-        fd: OwnedFd,
-        initial_size: usize,
-        max_size: usize,
-    ) -> Result<ShmemHandle, Error> {
-        // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
-        // is a little larger than this because of the SharedStruct header. Make the upper limit
-        // somewhat smaller than that, because with anything close to that, you'll run out of
-        // memory anyway.
-        if max_size >= 1 << 48 {
-            panic!("max size {} too large", max_size);
-        }
-        if initial_size > max_size {
-            panic!("initial size {initial_size} larger than max size {max_size}");
-        }
-
-        // The actual initial / max size is the one given by the caller, plus the size of
-        // 'SharedStruct'.
-        let initial_size = HEADER_SIZE + initial_size;
-        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
-
-        // Reserve address space for it with mmap
-        //
-        // TODO: Use MAP_HUGETLB if possible
-        let start_ptr = unsafe {
-            nix_mmap(
-                None,
-                max_size,
-                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
-                MapFlags::MAP_SHARED,
-                &fd,
-                0,
-            )
-        }
-        .map_err(|e| Error::new("mmap failed: {e}", e))?;
-
-        // Reserve space for the initial size
-        enlarge_file(fd.as_fd(), initial_size as u64)?;
-
-        // Initialize the header
-        let shared: NonNull<SharedStruct> = start_ptr.cast();
-        unsafe {
-            shared.write(SharedStruct {
-                max_size: max_size.into(),
-                current_size: AtomicUsize::new(initial_size),
-            })
-        };
-
-        // The user data begins after the header
-        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
-
-        Ok(ShmemHandle {
-            fd,
-            max_size: max_size.into(),
-            shared_ptr: shared,
-            data_ptr,
-        })
-    }
-
-    // return reference to the header
-    fn shared(&self) -> &SharedStruct {
-        unsafe { self.shared_ptr.as_ref() }
-    }
-
-    /// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
-    /// when creating the area.
-    ///
-    /// This may only be called from one process/thread concurrently. We detect that case
-    /// and return an Error.
-    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
-        let new_size = new_size + HEADER_SIZE;
-        let shared = self.shared();
-
-        if new_size > self.max_size {
-            panic!(
-                "new size ({} is greater than max size ({})",
-                new_size, self.max_size
-            );
-        }
-        assert_eq!(self.max_size, shared.max_size);
-
-        // Lock the area by setting the bit in 'current_size'
-        //
-        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
-        // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
-        // since this is not performance-critical, better safe than sorry .
-        let mut old_size = shared.current_size.load(Ordering::Acquire);
-        loop {
-            if (old_size & RESIZE_IN_PROGRESS) != 0 {
-                return Err(Error::new(
-                    "concurrent resize detected",
-                    Errno::UnknownErrno,
-                ));
-            }
-            match shared.current_size.compare_exchange(
-                old_size,
-                new_size,
-                Ordering::Acquire,
-                Ordering::Relaxed,
-            ) {
-                Ok(_) => break,
-                Err(x) => old_size = x,
-            }
-        }
-
-        // Ok, we got the lock.
-        //
-        // NB: If anything goes wrong, we *must* clear the bit!
-        let result = {
-            use std::cmp::Ordering::{Equal, Greater, Less};
-            match new_size.cmp(&old_size) {
-                Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
-                    Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
-                }),
-                Equal => Ok(()),
-                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
-            }
-        };
-
-        // Unlock
-        shared.current_size.store(
-            if result.is_ok() { new_size } else { old_size },
-            Ordering::Release,
-        );
-
-        result
-    }
-
-    /// Returns the current user-visible size of the shared memory segment.
-    ///
-    /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
-    /// responsibility not to access the area beyond the current size.
-    pub fn current_size(&self) -> usize {
-        let total_current_size =
-            self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
-        total_current_size - HEADER_SIZE
-    }
-}
-
-impl Drop for ShmemHandle {
-    fn drop(&mut self) {
-        // SAFETY: The pointer was obtained from mmap() with the given size.
-        // We unmap the entire region.
-        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
-        // The fd is dropped automatically by OwnedFd.
-    }
-}
-
-/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
-/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
-/// development and testing, but in production we want the file to stay in memory.
-///
-/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
-#[allow(unused_variables)]
-fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
-    #[cfg(not(target_os = "macos"))]
-    {
-        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
-            .map_err(|e| Error::new("memfd_create failed: {e}", e))
-    }
-    #[cfg(target_os = "macos")]
-    {
-        let file = tempfile::tempfile().map_err(|e| {
-            Error::new(
-                "could not create temporary file to back shmem area: {e}",
-                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
-            )
-        })?;
-        Ok(OwnedFd::from(file))
-    }
-}
-
-fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
-    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
-    // we don't get a segfault later when trying to actually use it.
-    #[cfg(not(target_os = "macos"))]
-    {
-        nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
-            Error::new(
-                "could not grow shmem segment, posix_fallocate failed: {e}",
-                e,
-            )
-        })
-    }
-    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
-    #[cfg(target_os = "macos")]
-    {
-        nix::unistd::ftruncate(fd, size as i64)
-            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    use nix::unistd::ForkResult;
-    use std::ops::Range;
-
-    /// check that all bytes in given range have the expected value.
-    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
-        for i in range {
-            let b = unsafe { *(ptr.add(i)) };
-            assert_eq!(expected, b, "unexpected byte at offset {}", i);
-        }
-    }
-
-    /// Write 'b' to all bytes in the given range
-    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
-        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
-    }
-
-    // simple single-process test of growing and shrinking
-    #[test]
-    fn test_shmem_resize() -> Result<(), Error> {
-        let max_size = 1024 * 1024;
-        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
-
-        assert_eq!(init_struct.current_size(), 0);
-
-        // Initial grow
-        let size1 = 10000;
-        init_struct.set_size(size1).unwrap();
-        assert_eq!(init_struct.current_size(), size1);
-
-        // Write some data
-        let data_ptr = init_struct.data_ptr.as_ptr();
-        write_range(data_ptr, 0xAA, 0..size1);
-        assert_range(data_ptr, 0xAA, 0..size1);
-
-        // Shrink
-        let size2 = 5000;
-        init_struct.set_size(size2).unwrap();
-        assert_eq!(init_struct.current_size(), size2);
-
-        // Grow again
-        let size3 = 20000;
-        init_struct.set_size(size3).unwrap();
-        assert_eq!(init_struct.current_size(), size3);
-
-        // Try to read it. The area that was shrunk and grown again should read as all zeros now
-        assert_range(data_ptr, 0xAA, 0..5000);
-        assert_range(data_ptr, 0, 5000..size1);
-
-        // Try to grow beyond max_size
-        //let size4 = max_size + 1;
-        //assert!(init_struct.set_size(size4).is_err());
-
-        // Dropping init_struct should unmap the memory
-        drop(init_struct);
-
-        Ok(())
-    }
-
-    /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
-    /// but is stored in the shared memory area and works across processes. It's implemented by
-    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
-    struct SimpleBarrier {
-        num_procs: usize,
-        count: AtomicUsize,
-    }
-
-    impl SimpleBarrier {
-        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
-            unsafe {
-                *ptr = SimpleBarrier {
-                    num_procs,
-                    count: AtomicUsize::new(0),
-                }
-            }
-        }
-
-        pub fn wait(&self) {
-            let old = self.count.fetch_add(1, Ordering::Relaxed);
-
-            let generation = old / self.num_procs;
-
-            let mut current = old + 1;
-            while current < (generation + 1) * self.num_procs {
-                std::thread::sleep(std::time::Duration::from_millis(10));
-                current = self.count.load(Ordering::Relaxed);
-            }
-        }
-    }
-
-    #[test]
-    fn test_multi_process() {
-        // Initialize
-        let max_size = 1_000_000_000_000;
-        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
-        let ptr = init_struct.data_ptr.as_ptr();
-
-        // Store the SimpleBarrier in the first 1k of the area.
-        init_struct.set_size(10000).unwrap();
-        let barrier_ptr: *mut SimpleBarrier = unsafe {
-            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
-                .cast()
-        };
-        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
-        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
-
-        // Fork another test process. The code after this runs in both processes concurrently.
-        let fork_result = unsafe { nix::unistd::fork().unwrap() };
-
-        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
-        if fork_result.is_parent() {
-            write_range(ptr, 0xAA, 1000..2000);
-        } else {
-            write_range(ptr, 0xBB, 2000..3000);
-        }
-        barrier.wait();
-        // Verify the contents. (in both processes)
-        assert_range(ptr, 0xAA, 1000..2000);
-        assert_range(ptr, 0xBB, 2000..3000);
-
-        // Grow, from the child this time
-        let size = 10_000_000;
-        if !fork_result.is_parent() {
-            init_struct.set_size(size).unwrap();
-        }
-        barrier.wait();
-
-        // make some writes at the end
-        if fork_result.is_parent() {
-            write_range(ptr, 0xAA, (size - 10)..size);
-        } else {
-            write_range(ptr, 0xBB, (size - 20)..(size - 10));
-        }
-        barrier.wait();
-
-        // Verify the contents. (This runs in both processes)
-        assert_range(ptr, 0, (size - 1000)..(size - 20));
-        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
-        assert_range(ptr, 0xAA, (size - 10)..size);
-
-        if let ForkResult::Parent { child } = fork_result {
-            nix::sys::wait::waitpid(child, None).unwrap();
-        }
-    }
-}
+pub mod hash;
+pub mod shmem;
+pub mod sync;
--- a/libs/neon-shmem/src/shmem.rs
+++ b/libs/neon-shmem/src/shmem.rs
@@ -0,0 +1,418 @@
+//! Dynamically resizable contiguous chunk of shared memory
+
+use std::num::NonZeroUsize;
+use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use nix::errno::Errno;
+use nix::sys::mman::MapFlags;
+use nix::sys::mman::ProtFlags;
+use nix::sys::mman::mmap as nix_mmap;
+use nix::sys::mman::munmap as nix_munmap;
+use nix::unistd::ftruncate as nix_ftruncate;
+
+/// `ShmemHandle` represents a shared memory area that can be shared by processes over `fork()`.
+/// Unlike shared memory allocated by Postgres, this area is resizable, up to `max_size` that's
+/// specified at creation.
+///
+/// The area is backed by an anonymous file created with `memfd_create()`. The full address space for
+/// `max_size` is reserved up-front with `mmap()`, but whenever you call [`ShmemHandle::set_size`],
+/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
+/// will cause the file to be expanded, but we might use `mprotect()` etc. to enforce that in the
+/// future.
+pub struct ShmemHandle {
+    /// memfd file descriptor
+    fd: OwnedFd,
+
+    max_size: usize,
+
+    // Pointer to the beginning of the shared memory area. The header is stored there.
+    shared_ptr: NonNull<SharedStruct>,
+
+    // Pointer to the beginning of the user data
+    pub data_ptr: NonNull<u8>,
+}
+
+/// This is stored at the beginning in the shared memory area.
+struct SharedStruct {
+    max_size: usize,
+
+    /// Current size of the backing file. The high-order bit is used for the [`RESIZE_IN_PROGRESS`] flag.
+    current_size: AtomicUsize,
+}
+
+const RESIZE_IN_PROGRESS: usize = 1 << 63;
+
+const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
+
+/// Error type returned by the [`ShmemHandle`] functions.
+#[derive(thiserror::Error, Debug)]
+#[error("{msg}: {errno}")]
+pub struct Error {
+    pub msg: String,
+    pub errno: Errno,
+}
+
+impl Error {
+    fn new(msg: &str, errno: Errno) -> Self {
+        Self {
+            msg: msg.to_string(),
+            errno,
+        }
+    }
+}
+
+impl ShmemHandle {
+    /// Create a new shared memory area. To communicate between processes, the processes need to be
+    /// `fork()`'d after calling this, so that the `ShmemHandle` is inherited by all processes.
+    ///
+    /// If the `ShmemHandle` is dropped, the memory is unmapped from the current process. Other
+    /// processes can continue using it, however.
+    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<Self, Error> {
+        // create the backing anonymous file.
+        let fd = create_backing_file(name)?;
+
+        Self::new_with_fd(fd, initial_size, max_size)
+    }
+
+    /// Map `fd` as a shared memory area and initialize the header at its start.
+    ///
+    /// `initial_size` and `max_size` are user-visible sizes; the header is added on top of both.
+    /// Panics if `max_size` is 2^48 or more, or if `initial_size` exceeds `max_size`.
+    fn new_with_fd(fd: OwnedFd, initial_size: usize, max_size: usize) -> Result<Self, Error> {
+        // We reserve the high-order bit for the `RESIZE_IN_PROGRESS` flag, and the actual size
+        // is a little larger than this because of the SharedStruct header. Make the upper limit
+        // somewhat smaller than that, because with anything close to that, you'll run out of
+        // memory anyway.
+        assert!(max_size < 1 << 48, "max size {max_size} too large");
+        assert!(
+            initial_size <= max_size,
+            "initial size {initial_size} larger than max size {max_size}"
+        );
+
+        // The actual initial / max size is the one given by the caller, plus the size of
+        // 'SharedStruct'.
+        let initial_size = HEADER_SIZE + initial_size;
+        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
+
+        // Reserve space for the initial size. This is done before mmap() so that a failure
+        // here returns without leaving the address space reservation mapped.
+        enlarge_file(fd.as_fd(), initial_size as u64)?;
+
+        // Reserve address space for it with mmap
+        //
+        // TODO: Use MAP_HUGETLB if possible
+        let start_ptr = unsafe {
+            nix_mmap(
+                None,
+                max_size,
+                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
+                MapFlags::MAP_SHARED,
+                &fd,
+                0,
+            )
+        }
+        .map_err(|e| Error::new("mmap failed", e))?;
+
+        // Initialize the header
+        let shared: NonNull<SharedStruct> = start_ptr.cast();
+        unsafe {
+            shared.write(SharedStruct {
+                max_size: max_size.into(),
+                current_size: AtomicUsize::new(initial_size),
+            });
+        }
+
+        // The user data begins after the header
+        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
+
+        Ok(Self {
+            fd,
+            max_size: max_size.into(),
+            shared_ptr: shared,
+            data_ptr,
+        })
+    }
+
+    // return reference to the header
+    fn shared(&self) -> &SharedStruct {
+        unsafe { self.shared_ptr.as_ref() }
+    }
+
+    /// Resize the shared memory area. Panics if `new_size` is larger than the `max_size` specified
+    /// when creating the area.
+    ///
+    /// Only one process/thread may resize at a time: `current_size` carries a lock bit while a
+    /// resize is running, and an overlapping call returns an [`shmem::Error`](Error).
+    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
+        let new_size = new_size + HEADER_SIZE;
+        let shared = self.shared();
+
+        assert!(
+            new_size <= self.max_size,
+            "new size ({new_size}) is greater than max size ({})",
+            self.max_size
+        );
+
+        assert_eq!(self.max_size, shared.max_size);
+
+        // Lock the area by setting the bit in `current_size`
+        //
+        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
+        // and the `posix_fallocate`/`ftruncate` call is surely a synchronization point anyway. But
+        // since this is not performance-critical, better safe than sorry.
+        let mut old_size = shared.current_size.load(Ordering::Acquire);
+        loop {
+            if (old_size & RESIZE_IN_PROGRESS) != 0 {
+                return Err(Error::new(
+                    "concurrent resize detected",
+                    Errno::UnknownErrno,
+                ));
+            }
+            match shared.current_size.compare_exchange(
+                old_size,
+                old_size | RESIZE_IN_PROGRESS,
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => break,
+                Err(x) => old_size = x,
+            }
+        }
+
+        // Ok, we got the lock.
+        //
+        // NB: If anything goes wrong, we *must* clear the bit!
+        let result = {
+            use std::cmp::Ordering::{Equal, Greater, Less};
+            match new_size.cmp(&old_size) {
+                Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
+                    Error::new("could not shrink shmem segment, ftruncate failed", e)
+                }),
+                Equal => Ok(()),
+                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
+            }
+        };
+
+        // Unlock
+        shared.current_size.store(
+            if result.is_ok() { new_size } else { old_size },
+            Ordering::Release,
+        );
+
+        result
+    }
+
+    /// Returns the current user-visible size of the shared memory segment.
+    ///
+    /// NOTE: a concurrent [`ShmemHandle::set_size()`] call can change the size at any time.
+	/// It is the caller's responsibility not to access the area beyond the current size.
+    pub fn current_size(&self) -> usize {
+        let total_current_size =
+            self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
+        total_current_size - HEADER_SIZE
+    }
+}
+
+impl Drop for ShmemHandle {
+    fn drop(&mut self) {
+        // SAFETY: The pointer was obtained from mmap() with the given size.
+        // We unmap the entire region.
+        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
+        // The fd is dropped automatically by OwnedFd.
+    }
+}
+
+/// Create a "backing file" for the shared memory area. Everywhere except macOS, `memfd_create()`
+/// gives us an anonymous in-memory file. On macOS, fall back to a temporary file. That's good
+/// enough for development and testing, but in production we want the file to stay in memory.
+///
+/// Unused-variable warnings are disabled because `name` is unused in the macOS path.
+#[allow(unused_variables)]
+fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
+            .map_err(|e| Error::new("memfd_create failed", e))
+    }
+    #[cfg(target_os = "macos")]
+    {
+        let file = tempfile::tempfile().map_err(|e| {
+            Error::new(
+                "could not create temporary file to back shmem area",
+                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
+            )
+        })?;
+        Ok(OwnedFd::from(file))
+    }
+}
+
+fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
+    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
+    // we don't get a segfault later when trying to actually use it.
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
+            Error::new(
+                "could not grow shmem segment, posix_fallocate failed",
+                e,
+            )
+        })
+    }
+    // As a fallback on macOS, which doesn't have posix_fallocate, use plain `ftruncate`
+    #[cfg(target_os = "macos")]
+    {
+        nix::unistd::ftruncate(fd, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed", e))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use nix::unistd::ForkResult;
+    use std::ops::Range;
+
+    /// check that all bytes in given range have the expected value.
+    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
+        for i in range {
+            let b = unsafe { *(ptr.add(i)) };
+            assert_eq!(expected, b, "unexpected byte at offset {}", i);
+        }
+    }
+
+    /// Write 'b' to all bytes in the given range
+    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
+        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
+    }
+
+    // simple single-process test of growing and shrinking
+    #[test]
+    fn test_shmem_resize() -> Result<(), Error> {
+        let max_size = 1024 * 1024;
+        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
+
+        assert_eq!(init_struct.current_size(), 0);
+
+        // Initial grow
+        let size1 = 10000;
+        init_struct.set_size(size1).unwrap();
+        assert_eq!(init_struct.current_size(), size1);
+
+        // Write some data
+        let data_ptr = init_struct.data_ptr.as_ptr();
+        write_range(data_ptr, 0xAA, 0..size1);
+        assert_range(data_ptr, 0xAA, 0..size1);
+
+        // Shrink
+        let size2 = 5000;
+        init_struct.set_size(size2).unwrap();
+        assert_eq!(init_struct.current_size(), size2);
+
+        // Grow again
+        let size3 = 20000;
+        init_struct.set_size(size3).unwrap();
+        assert_eq!(init_struct.current_size(), size3);
+
+        // Try to read it. The area that was shrunk and grown again should read as all zeros now
+        assert_range(data_ptr, 0xAA, 0..5000);
+        assert_range(data_ptr, 0, 5000..size1);
+
+        // Try to grow beyond max_size
+        //let size4 = max_size + 1;
+        //assert!(init_struct.set_size(size4).is_err());
+
+        // Dropping init_struct should unmap the memory
+        drop(init_struct);
+
+        Ok(())
+    }
+
+    /// This is used in tests to coordinate between test processes. It's like `std::sync::Barrier`,
+    /// but is stored in the shared memory area and works across processes. It's implemented by
+    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
+    struct SimpleBarrier {
+        num_procs: usize,
+        count: AtomicUsize,
+    }
+
+    impl SimpleBarrier {
+        /// Initialize a barrier for `num_procs` participants in the memory at `ptr`.
+        ///
+        /// Safety: `ptr` must be valid for writes and aligned for `SimpleBarrier`.
+        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
+            let count = AtomicUsize::new(0);
+            // `write` initializes the slot without reading or dropping its previous contents.
+            unsafe { ptr.write(SimpleBarrier { num_procs, count }) }
+        }
+
+        pub fn wait(&self) {
+            let old = self.count.fetch_add(1, Ordering::Relaxed);
+
+            let generation = old / self.num_procs;
+
+            let mut current = old + 1;
+            while current < (generation + 1) * self.num_procs {
+                std::thread::sleep(std::time::Duration::from_millis(10));
+                current = self.count.load(Ordering::Relaxed);
+            }
+        }
+    }
+
+    #[test]
+    fn test_multi_process() {
+        // Initialize
+        let max_size = 1_000_000_000_000;
+        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
+        let ptr = init_struct.data_ptr.as_ptr();
+
+        // Store the SimpleBarrier in the first 1k of the area.
+        init_struct.set_size(10000).unwrap();
+        let barrier_ptr: *mut SimpleBarrier = unsafe {
+            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
+                .cast()
+        };
+        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
+        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
+
+        // Fork another test process. The code after this runs in both processes concurrently.
+        let fork_result = unsafe { nix::unistd::fork().unwrap() };
+
+        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, 1000..2000);
+        } else {
+            write_range(ptr, 0xBB, 2000..3000);
+        }
+        barrier.wait();
+        // Verify the contents. (in both processes)
+        assert_range(ptr, 0xAA, 1000..2000);
+        assert_range(ptr, 0xBB, 2000..3000);
+
+        // Grow, from the child this time
+        let size = 10_000_000;
+        if !fork_result.is_parent() {
+            init_struct.set_size(size).unwrap();
+        }
+        barrier.wait();
+
+        // make some writes at the end
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, (size - 10)..size);
+        } else {
+            write_range(ptr, 0xBB, (size - 20)..(size - 10));
+        }
+        barrier.wait();
+
+        // Verify the contents. (This runs in both processes)
+        assert_range(ptr, 0, (size - 1000)..(size - 20));
+        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
+        assert_range(ptr, 0xAA, (size - 10)..size);
+
+        if let ForkResult::Parent { child } = fork_result {
+            nix::sys::wait::waitpid(child, None).unwrap();
+        }
+    }
+}
--- a/libs/neon-shmem/src/sync.rs
+++ b/libs/neon-shmem/src/sync.rs
@@ -0,0 +1,105 @@
+//! Simple utilities akin to what's in [`std::sync`] but designed to work with shared memory.
+
+use std::mem::MaybeUninit;
+use std::ptr::NonNull;
+
+use nix::errno::Errno;
+
+pub type RwLock<T> = lock_api::RwLock<PthreadRwLock, T>;
+pub(crate) type RwLockReadGuard<'a, T> = lock_api::RwLockReadGuard<'a, PthreadRwLock, T>;
+pub type RwLockWriteGuard<'a, T> = lock_api::RwLockWriteGuard<'a, PthreadRwLock, T>;
+pub type ValueReadGuard<'a, T> = lock_api::MappedRwLockReadGuard<'a, PthreadRwLock, T>;
+pub type ValueWriteGuard<'a, T> = lock_api::MappedRwLockWriteGuard<'a, PthreadRwLock, T>;
+
+/// Shared memory read-write lock.
+pub struct PthreadRwLock(Option<NonNull<libc::pthread_rwlock_t>>);
+
+impl PthreadRwLock {
+	pub fn new(lock: *mut libc::pthread_rwlock_t) -> Self {
+		unsafe {
+			let mut attrs = MaybeUninit::uninit();
+			// Ignoring return value here - only possible error is OOM.
+			libc::pthread_rwlockattr_init(attrs.as_mut_ptr());
+			libc::pthread_rwlockattr_setpshared(
+				attrs.as_mut_ptr(),
+				libc::PTHREAD_PROCESS_SHARED
+			);
+			// TODO(quantumish): worth making this function return Result?
+			libc::pthread_rwlock_init(lock, attrs.as_mut_ptr());
+			// Safety: POSIX specifies that "any function affecting the attributes
+			// object (including destruction) shall not affect any previously
+			// initialized read-write locks". 
+			libc::pthread_rwlockattr_destroy(attrs.as_mut_ptr());
+			Self(Some(NonNull::new_unchecked(lock)))
+		}
+	}
+	
+	/// Returns the pointer to the underlying pthread lock.
+	///
+	/// Panics if this value was created via `RawRwLock::INIT` instead of [`PthreadRwLock::new`].
+	fn inner(&self) -> NonNull<libc::pthread_rwlock_t> {
+		self.0.expect("PthreadRwLock constructed badly - something likely used RawRwLock::INIT")
+	}
+}
+
+unsafe impl lock_api::RawRwLock for PthreadRwLock {
+	type GuardMarker = lock_api::GuardSend;
+	const INIT: Self = Self(None);	
+	
+	fn lock_shared(&self) {
+		unsafe {
+			let res = libc::pthread_rwlock_rdlock(self.inner().as_ptr());
+			if res != 0 {
+				panic!("rdlock failed with {}", Errno::from_raw(res));
+			}
+		}
+	}
+
+	fn try_lock_shared(&self) -> bool {
+		unsafe {
+			match libc::pthread_rwlock_tryrdlock(self.inner().as_ptr()) {
+				0 => true,
+				// EBUSY: a writer holds (or is queued for) the lock. EAGAIN: reader count maxed.
+				libc::EBUSY | libc::EAGAIN => false,
+				e => panic!("try_rdlock failed with {}", Errno::from_raw(e)),
+			}
+		}
+	}
+
+	fn lock_exclusive(&self) {
+		unsafe {
+			let res = libc::pthread_rwlock_wrlock(self.inner().as_ptr());
+			if res != 0 {
+				panic!("wrlock failed with {}", Errno::from_raw(res));
+			}
+		}
+	}
+
+	fn try_lock_exclusive(&self) -> bool {
+		unsafe {
+			match libc::pthread_rwlock_trywrlock(self.inner().as_ptr()) {
+				0 => true,
+				// EBUSY: already held for reading or writing. EAGAIN is kept for compatibility.
+				libc::EBUSY | libc::EAGAIN => false,
+				e => panic!("try_wrlock failed with {}", Errno::from_raw(e)),
+			}
+		}
+	}
+
+	unsafe fn unlock_exclusive(&self) {
+		unsafe { 
+			let res = libc::pthread_rwlock_unlock(self.inner().as_ptr());
+			if res != 0 {
+				panic!("unlock failed with {}", Errno::from_raw(res));
+			}
+		}
+	}
+	unsafe fn unlock_shared(&self) {
+		unsafe {
+			let res = libc::pthread_rwlock_unlock(self.inner().as_ptr());
+			if res != 0 {
+				panic!("unlock failed with {}", Errno::from_raw(res));
+			}
+		}
+	}
+}
--- a/libs/neonart/Cargo.toml
+++ b/libs/neonart/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "neonart"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+crossbeam-utils.workspace = true
+spin.workspace = true
+tracing.workspace = true
+
+[dev-dependencies]
+rand = "0.9.1"
+rand_distr = "0.5.1"
--- a/libs/neonart/src/algorithm.rs
+++ b/libs/neonart/src/algorithm.rs
@@ -0,0 +1,594 @@
+mod lock_and_version;
+pub(crate) mod node_ptr;
+mod node_ref;
+
+use std::vec::Vec;
+
+use crate::algorithm::lock_and_version::ConcurrentUpdateError;
+use crate::algorithm::node_ptr::MAX_PREFIX_LEN;
+use crate::algorithm::node_ref::{NewNodeRef, NodeRef, ReadLockedNodeRef, WriteLockedNodeRef};
+use crate::allocator::OutOfMemoryError;
+
+use crate::TreeWriteGuard;
+use crate::UpdateAction;
+use crate::allocator::ArtAllocator;
+use crate::epoch::EpochPin;
+use crate::{Key, Value};
+
+pub(crate) type RootPtr<V> = node_ptr::NodePtr<V>;
+
+#[derive(Debug)]
+pub enum ArtError {
+    ConcurrentUpdate, // need to retry
+    OutOfMemory,
+}
+
+impl From<ConcurrentUpdateError> for ArtError {
+    fn from(_: ConcurrentUpdateError) -> ArtError {
+        ArtError::ConcurrentUpdate
+    }
+}
+
+impl From<OutOfMemoryError> for ArtError {
+    fn from(_: OutOfMemoryError) -> ArtError {
+        ArtError::OutOfMemory
+    }
+}
+
+pub fn new_root<V: Value>(
+    allocator: &impl ArtAllocator<V>,
+) -> Result<RootPtr<V>, OutOfMemoryError> {
+    node_ptr::new_root(allocator)
+}
+
+pub(crate) fn search<'e, K: Key, V: Value>(
+    key: &K,
+    root: RootPtr<V>,
+    epoch_pin: &'e EpochPin,
+) -> Option<&'e V> {
+    loop {
+        let root_ref = NodeRef::from_root_ptr(root);
+        if let Ok(result) = lookup_recurse(key.as_bytes(), root_ref, None, epoch_pin) {
+            break result;
+        }
+        // retry
+    }
+}
+
+pub(crate) fn iter_next<'e, V: Value>(
+    key: &[u8],
+    root: RootPtr<V>,
+    epoch_pin: &'e EpochPin,
+) -> Option<(Vec<u8>, &'e V)> {
+    loop {
+        let mut path = Vec::new();
+        let root_ref = NodeRef::from_root_ptr(root);
+
+        match next_recurse(key, &mut path, root_ref, epoch_pin) {
+            Ok(Some(v)) => {
+                assert_eq!(path.len(), key.len());
+                break Some((path, v));
+            }
+            Ok(None) => break None,
+            Err(ConcurrentUpdateError()) => {
+                // retry
+                continue;
+            }
+        }
+    }
+}
+
+pub(crate) fn update_fn<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>, F>(
+    key: &K,
+    value_fn: F,
+    root: RootPtr<V>,
+    guard: &'g mut TreeWriteGuard<'e, K, V, A>,
+) -> Result<(), OutOfMemoryError>
+where
+    F: FnOnce(Option<&V>) -> UpdateAction<V>,
+{
+    let value_fn_cell = std::cell::Cell::new(Some(value_fn));
+    loop {
+        let root_ref = NodeRef::from_root_ptr(root);
+        let this_value_fn = |arg: Option<&V>| value_fn_cell.take().unwrap()(arg);
+        let key_bytes = key.as_bytes();
+
+        match update_recurse(
+            key_bytes,
+            this_value_fn,
+            root_ref,
+            None,
+            None,
+            guard,
+            0,
+            key_bytes,
+        ) {
+            Ok(()) => break Ok(()),
+            Err(ArtError::ConcurrentUpdate) => {
+                continue; // retry
+            }
+            Err(ArtError::OutOfMemory) => break Err(OutOfMemoryError()),
+        }
+    }
+}
+
+// Look up `key` below `node`. `Ok(None)` means not found; `Err` means you must retry.
+//
+// This corresponds to the 'lookupOpt' function in the paper
+fn lookup_recurse<'e, V: Value>(
+    key: &[u8],
+    node: NodeRef<'e, V>,
+    parent: Option<ReadLockedNodeRef<V>>,
+    epoch_pin: &'e EpochPin,
+) -> Result<Option<&'e V>, ConcurrentUpdateError> {
+    let rnode = node.read_lock_or_restart()?;
+    if let Some(parent) = parent {
+        parent.read_unlock_or_restart()?;
+    }
+
+    // check if the prefix matches; a mismatch ends the lookup with `Ok(None)`
+    let prefix_len = if let Some(prefix_len) = rnode.prefix_matches(key) {
+        prefix_len
+    } else {
+        rnode.read_unlock_or_restart()?;
+        return Ok(None);
+    };
+
+    if rnode.is_leaf() {
+        assert_eq!(key.len(), prefix_len);
+        let vptr = rnode.get_leaf_value_ptr()?;
+        // safety: It's OK to return a ref of the pointer because we checked the version
+        // and the lifetime of 'epoch_pin' enforces that the reference is only accessible
+        // as long as the epoch is pinned.
+        let v = unsafe { vptr.as_ref().unwrap() };
+        return Ok(Some(v));
+    }
+
+    let key = &key[prefix_len..];
+
+    // find the child for the next key byte
+    let next_node = rnode.find_child_or_restart(key[0])?;
+
+    match next_node {
+        None => Ok(None), // key not found
+        Some(child) => lookup_recurse(&key[1..], child, Some(rnode), epoch_pin),
+    }
+}
+
+fn next_recurse<'e, V: Value>(
+    min_key: &[u8],
+    path: &mut Vec<u8>,
+    node: NodeRef<'e, V>,
+    epoch_pin: &'e EpochPin,
+) -> Result<Option<&'e V>, ConcurrentUpdateError> {
+    let rnode = node.read_lock_or_restart()?;
+    let prefix = rnode.get_prefix();
+    if prefix.len() != 0 {
+        path.extend_from_slice(prefix);
+    }
+
+    use std::cmp::Ordering;
+    let comparison = path.as_slice().cmp(&min_key[0..path.len()]);
+    if comparison == Ordering::Less {
+        rnode.read_unlock_or_restart()?;
+        return Ok(None);
+    }
+
+    if rnode.is_leaf() {
+        assert_eq!(path.len(), min_key.len());
+        let vptr = rnode.get_leaf_value_ptr()?;
+        // safety: It's OK to return a ref of the pointer because we checked the version
+        // and the lifetime of 'epoch_pin' enforces that the reference is only accessible
+        // as long as the epoch is pinned.
+        let v = unsafe { vptr.as_ref().unwrap() };
+        return Ok(Some(v));
+    }
+
+    let mut min_key_byte = match comparison {
+        Ordering::Less => unreachable!(), // checked this above already
+        Ordering::Equal => min_key[path.len()],
+        Ordering::Greater => 0,
+    };
+
+    loop {
+        match rnode.find_next_child_or_restart(min_key_byte)? {
+            None => {
+                return Ok(None);
+            }
+            Some((key_byte, child_ref)) => {
+                let path_len = path.len();
+                path.push(key_byte);
+                let result = next_recurse(min_key, path, child_ref, epoch_pin)?;
+                if result.is_some() {
+                    return Ok(result);
+                }
+                if key_byte == u8::MAX {
+                    return Ok(None);
+                }
+                path.truncate(path_len);
+                min_key_byte = key_byte + 1;
+            }
+        }
+    }
+}
+
+// This corresponds to the 'insertOpt' function in the paper
+pub(crate) fn update_recurse<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>, F>(
+    key: &[u8],
+    value_fn: F,
+    node: NodeRef<'e, V>,
+    rparent: Option<(ReadLockedNodeRef<V>, u8)>,
+    rgrandparent: Option<(ReadLockedNodeRef<V>, u8)>,
+    guard: &'g mut TreeWriteGuard<'e, K, V, A>,
+    level: usize,
+    orig_key: &[u8],
+) -> Result<(), ArtError>
+where
+    F: FnOnce(Option<&V>) -> UpdateAction<V>,
+{
+    let rnode = node.read_lock_or_restart()?;
+
+    let prefix_match_len = rnode.prefix_matches(key);
+    if prefix_match_len.is_none() {
+        let (rparent, parent_key) = rparent.expect("direct children of the root have no prefix");
+        let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
+        let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
+
+        match value_fn(None) {
+            UpdateAction::Nothing => {}
+            UpdateAction::Insert(new_value) => {
+                insert_split_prefix(key, new_value, &mut wnode, &mut wparent, parent_key, guard)?;
+            }
+            UpdateAction::Remove => {
+                panic!("unexpected Remove action on insertion");
+            }
+        }
+        wnode.write_unlock();
+        wparent.write_unlock();
+        return Ok(());
+    }
+    let prefix_match_len = prefix_match_len.unwrap();
+    let key = &key[prefix_match_len as usize..];
+    let level = level + prefix_match_len as usize;
+
+    if rnode.is_leaf() {
+        assert_eq!(key.len(), 0);
+        let (rparent, parent_key) = rparent.expect("root cannot be leaf");
+        let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
+        let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
+
+        // safety: Now that we have acquired the write lock, we have exclusive access to the
+        // value. XXX: There might be concurrent reads though?
+        let value_mut = wnode.get_leaf_value_mut();
+
+        match value_fn(Some(value_mut)) {
+            UpdateAction::Nothing => {
+                wparent.write_unlock();
+                wnode.write_unlock();
+            }
+            UpdateAction::Insert(_) => panic!("cannot insert over existing value"),
+            UpdateAction::Remove => {
+                guard.remember_obsolete_node(wnode.as_ptr());
+                wparent.delete_child(parent_key);
+                wnode.write_unlock_obsolete();
+
+                if let Some(rgrandparent) = rgrandparent {
+                    // FIXME: Ignore concurrency error. It doesn't lead to
+                    // corruption, but it means we might leak something. Until
+                    // another update cleans it up.
+                    let _ = cleanup_parent(wparent, rgrandparent, guard);
+                }
+            }
+        }
+
+        return Ok(());
+    }
+
+    let next_node = rnode.find_child_or_restart(key[0])?;
+
+    if next_node.is_none() {
+        if rnode.is_full() {
+            let (rparent, parent_key) = rparent.expect("root node cannot become full");
+            let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
+            let wnode = rnode.upgrade_to_write_lock_or_restart()?;
+
+            match value_fn(None) {
+                UpdateAction::Nothing => {
+                    wnode.write_unlock();
+                    wparent.write_unlock();
+                }
+                UpdateAction::Insert(new_value) => {
+                    insert_and_grow(key, new_value, wnode, &mut wparent, parent_key, guard)?;
+                    wparent.write_unlock();
+                }
+                UpdateAction::Remove => {
+                    panic!("unexpected Remove action on insertion");
+                }
+            };
+        } else {
+            let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
+            if let Some((rparent, _)) = rparent {
+                rparent.read_unlock_or_restart()?;
+            }
+            match value_fn(None) {
+                UpdateAction::Nothing => {}
+                UpdateAction::Insert(new_value) => {
+                    insert_to_node(&mut wnode, key, new_value, guard)?;
+                }
+                UpdateAction::Remove => {
+                    panic!("unexpected Remove action on insertion");
+                }
+            };
+            wnode.write_unlock();
+        }
+        return Ok(());
+    } else {
+        let next_child = next_node.unwrap(); // checked above it's not None
+        if let Some((ref rparent, _)) = rparent {
+            rparent.check_or_restart()?;
+        }
+
+        // recurse to next level
+        update_recurse(
+            &key[1..],
+            value_fn,
+            next_child,
+            Some((rnode, key[0])),
+            rparent,
+            guard,
+            level + 1,
+            orig_key,
+        )
+    }
+}
+
+#[derive(Clone)]
+enum PathElement {
+    Prefix(Vec<u8>),
+    KeyByte(u8),
+}
+
+impl std::fmt::Debug for PathElement {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        match self {
+            PathElement::Prefix(prefix) => write!(fmt, "{:?}", prefix),
+            PathElement::KeyByte(key_byte) => write!(fmt, "{}", key_byte),
+        }
+    }
+}
+
+pub(crate) fn dump_tree<'e, V: Value + std::fmt::Debug>(
+    root: RootPtr<V>,
+    epoch_pin: &'e EpochPin,
+    dst: &mut dyn std::io::Write,
+) {
+    let root_ref = NodeRef::from_root_ptr(root);
+
+    let _ = dump_recurse(&[], root_ref, &epoch_pin, 0, dst);
+}
+
+// Recursive worker for dump_tree(): prints the subtree below 'node', indented by 'level'
+// spaces. 'path' is the list of prefixes and key bytes leading to 'node'.
+//
+// Returns an error if a concurrent update is detected while reading a node.
+//
+// TODO: return an Err if writeln!() returns error, instead of unwrapping
+fn dump_recurse<'e, V: Value + std::fmt::Debug>(
+    path: &[PathElement],
+    node: NodeRef<'e, V>,
+    epoch_pin: &'e EpochPin,
+    level: usize,
+    dst: &mut dyn std::io::Write,
+) -> Result<(), ConcurrentUpdateError> {
+    let indent = " ".repeat(level);
+
+    let locked = node.read_lock_or_restart()?;
+
+    // Extend the inherited path with this node's own prefix, if it has one.
+    let mut path = path.to_vec();
+    let node_prefix = locked.get_prefix();
+    if !node_prefix.is_empty() {
+        path.push(PathElement::Prefix(node_prefix.to_vec()));
+    }
+
+    if locked.is_leaf() {
+        let value_ptr = locked.get_leaf_value_ptr()?;
+        // safety: It's OK to return a ref of the pointer because we checked the version
+        // and the lifetime of 'epoch_pin' enforces that the reference is only accessible
+        // as long as the epoch is pinned.
+        let value = unsafe { value_ptr.as_ref().unwrap() };
+        writeln!(dst, "{} {:?}: {:?}", indent, path, value).unwrap();
+        return Ok(());
+    }
+
+    // Internal node: probe every possible key byte, in ascending order.
+    for key_byte in u8::MIN..=u8::MAX {
+        let Some(child_ref) = locked.find_child_or_restart(key_byte)? else {
+            continue;
+        };
+
+        let locked_child = child_ref.read_lock_or_restart()?;
+        writeln!(
+            dst,
+            "{} {:?}, {}: prefix {:?}",
+            indent,
+            path,
+            key_byte,
+            locked_child.get_prefix()
+        )
+        .unwrap();
+
+        let mut child_path = Vec::with_capacity(path.len() + 1);
+        child_path.extend(path.iter().cloned());
+        child_path.push(PathElement::KeyByte(key_byte));
+
+        dump_recurse(&child_path, child_ref, epoch_pin, level + 1, dst)?;
+    }
+
+    Ok(())
+}
+
+/// Insert `value` when `key` diverges from `node`'s prefix partway through the prefix.
+///
+/// A new internal node holding the common part of the prefix is allocated and linked into
+/// `parent` (under `parent_key`) in place of `node`. The old node, with its prefix shortened,
+/// and a newly allocated node for `value` become the children of the new internal node.
+///
+/// Returns an error if allocating either of the new nodes fails.
+///
+///```text
+///        [fooba]r -> value
+///
+/// [foo]b -> [a]r  -> value
+///      e -> [ls]e -> value
+///```
+fn insert_split_prefix<'e, K: Key, V: Value, A: ArtAllocator<V>>(
+    key: &[u8],
+    value: V,
+    node: &mut WriteLockedNodeRef<V>,
+    parent: &mut WriteLockedNodeRef<V>,
+    parent_key: u8,
+    guard: &'e TreeWriteGuard<K, V, A>,
+) -> Result<(), OutOfMemoryError> {
+    let old_node = node;
+    let old_prefix = old_node.get_prefix();
+    // Index of the first byte where 'key' and the old prefix differ. common_prefix() panics
+    // if it does not find a differing byte.
+    let common_prefix_len = common_prefix(key, old_prefix);
+
+    // Allocate a node for the new value.
+    // The byte at 'common_prefix_len' becomes the key byte in the new internal node, so the
+    // value node only gets the part of the key after it.
+    let new_value_node = allocate_node_for_value(
+        &key[common_prefix_len + 1..],
+        value,
+        guard.tree_writer.allocator,
+    )?;
+
+    // Allocate a new internal node with the common prefix
+    // FIXME: deallocate 'new_value_node' on OOM
+    // NOTE(review): NewNodeRef's Drop impl deallocates a node whose into_ptr() was never
+    // called, which looks like it already covers this early return -- confirm before
+    // removing the FIXME.
+    let mut prefix_node =
+        node_ref::new_internal(&key[..common_prefix_len], guard.tree_writer.allocator)?;
+
+    // Add the old node and the new nodes to the new internal node
+    prefix_node.insert_old_child(old_prefix[common_prefix_len], old_node);
+    prefix_node.insert_new_child(key[common_prefix_len], new_value_node);
+
+    // Modify the prefix of the old child in place
+    // The common part and the byte now used as the key byte are no longer part of it.
+    old_node.truncate_prefix(old_prefix.len() - common_prefix_len - 1);
+
+    // replace the pointer in the parent
+    parent.replace_child(parent_key, prefix_node.into_ptr());
+
+    Ok(())
+}
+
+// Add 'value' as a new child of 'wnode'. The first byte of 'key' becomes the key byte of the
+// child within 'wnode'; the rest of the key is stored in the newly allocated node(s).
+//
+// Returns an error if the allocation fails. Panics if 'key' is empty.
+fn insert_to_node<'e, K: Key, V: Value, A: ArtAllocator<V>>(
+    wnode: &mut WriteLockedNodeRef<V>,
+    key: &[u8],
+    value: V,
+    guard: &'e TreeWriteGuard<K, V, A>,
+) -> Result<(), OutOfMemoryError> {
+    let remaining_key = &key[1..];
+    let new_child = allocate_node_for_value(remaining_key, value, guard.tree_writer.allocator)?;
+
+    let key_byte = key[0];
+    wnode.insert_child(key_byte, new_child.into_ptr());
+
+    Ok(())
+}
+
+// Insert 'value' into a node by replacing the node with a grown copy that also holds the new
+// child. The grown copy is linked into 'parent' under 'parent_key_byte', and the old node is
+// marked obsolete and handed to the guard.
+//
+// On entry: 'parent' and 'node' are locked
+fn insert_and_grow<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
+    key: &[u8],
+    value: V,
+    wnode: WriteLockedNodeRef<V>,
+    parent: &mut WriteLockedNodeRef<V>,
+    parent_key_byte: u8,
+    guard: &'g mut TreeWriteGuard<'e, K, V, A>,
+) -> Result<(), ArtError> {
+    let mut bigger_node = wnode.grow(guard.tree_writer.allocator)?;
+
+    // FIXME: deallocate 'bigger_node' on OOM
+    // NOTE(review): NewNodeRef's Drop impl deallocates a node whose into_ptr() was never
+    // called, which looks like it already covers this early return -- confirm before
+    // removing the FIXME.
+    let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
+    bigger_node.insert_new_child(key[0], value_child);
+
+    // Replace the pointer in the parent
+    parent.replace_child(parent_key_byte, bigger_node.into_ptr());
+
+    // The old node is no longer reachable through 'parent'. Register it with the guard first,
+    // then release its lock with the obsolete flag set.
+    guard.remember_obsolete_node(wnode.as_ptr());
+    wnode.write_unlock_obsolete();
+
+    Ok(())
+}
+
+// Tidy up 'wparent' after one of its children has been removed.
+//
+// Two restructurings are attempted, in this order:
+// 1. If the parent has exactly one child left and the combined prefix fits in
+//    MAX_PREFIX_LEN, the child is linked directly into the grandparent and the parent
+//    becomes obsolete.
+// 2. If the parent can shrink, it is replaced in the grandparent by a smaller node.
+//
+// 'rgrandparent' is the read-locked grandparent together with the key byte under which the
+// parent is stored in it. It is only upgraded to a write lock when a restructuring is
+// actually performed.
+//
+// Returns an error if a lock cannot be taken or upgraded, or if allocating the smaller node
+// fails. On those paths the locked node refs are released when they are dropped.
+fn cleanup_parent<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
+    wparent: WriteLockedNodeRef<V>,
+    rgrandparent: (ReadLockedNodeRef<V>, u8),
+    guard: &'g mut TreeWriteGuard<'e, K, V, A>,
+) -> Result<(), ArtError> {
+    let (rgrandparent, grandparent_key_byte) = rgrandparent;
+
+    // If the parent becomes completely empty after the deletion, remove the parent from the
+    // grandparent. (This case is possible because we reserve only 8 bytes for the prefix.)
+    // TODO: not implemented.
+
+    // If the parent has only one child, replace the parent with the remaining child. (This is not
+    // possible if the child's prefix field cannot absorb the parent's)
+    if wparent.num_children() == 1 {
+        // Try to lock the remaining child. This can fail if the child is updated
+        // concurrently.
+        let (key_byte, remaining_child) = wparent.find_remaining_child();
+
+        let mut wremaining_child = remaining_child.write_lock_or_restart()?;
+
+        // The merged prefix is: parent's prefix + the child's key byte + child's prefix.
+        if 1 + wremaining_child.get_prefix().len() + wparent.get_prefix().len() <= MAX_PREFIX_LEN {
+            let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
+
+            // Ok, we have locked the leaf, the parent, the grandparent, and the parent's only
+            // remaining leaf. Proceed with the updates.
+
+            // Update the prefix on the remaining leaf
+            wremaining_child.prepend_prefix(wparent.get_prefix(), key_byte);
+
+            // Replace the pointer in the grandparent to point directly to the remaining leaf
+            wgrandparent.replace_child(grandparent_key_byte, wremaining_child.as_ptr());
+
+            // Mark the parent as deleted.
+            guard.remember_obsolete_node(wparent.as_ptr());
+            wparent.write_unlock_obsolete();
+            return Ok(());
+        }
+        // The prefixes do not fit together: fall through. The child's write lock is released
+        // when 'wremaining_child' goes out of scope here.
+    }
+
+    // If the parent's children would fit on a smaller node type after the deletion, replace it with
+    // a smaller node.
+    if wparent.can_shrink() {
+        let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
+        let smaller_node = wparent.shrink(guard.tree_writer.allocator)?;
+
+        // Replace the pointer in the grandparent
+        wgrandparent.replace_child(grandparent_key_byte, smaller_node.into_ptr());
+
+        guard.remember_obsolete_node(wparent.as_ptr());
+        wparent.write_unlock_obsolete();
+        return Ok(());
+    }
+
+    // nothing to do
+    wparent.write_unlock();
+    Ok(())
+}
+
+// Build the node that stores 'value' under 'key'.
+//
+// The leaf holds at most MAX_PREFIX_LEN trailing bytes of the key as its prefix. If the key
+// is longer than that, a chain of internal nodes is stacked on top of the leaf: each one
+// consumes one key byte for the link to the node below it, plus up to MAX_PREFIX_LEN bytes
+// as its own prefix. The returned node is the topmost one.
+fn allocate_node_for_value<'a, V: Value, A: ArtAllocator<V>>(
+    key: &[u8],
+    value: V,
+    allocator: &'a A,
+) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError> {
+    let leaf_start = key.len().saturating_sub(MAX_PREFIX_LEN);
+    let mut node = node_ref::new_leaf(&key[leaf_start..], value, allocator)?;
+
+    // The part of the key that no node covers yet. It is consumed from the back.
+    let mut unconsumed = &key[..leaf_start];
+    while let Some((&link_byte, head)) = unconsumed.split_last() {
+        // Need another internal node
+        let prefix_start = head.len().saturating_sub(MAX_PREFIX_LEN);
+
+        let mut internal_node = node_ref::new_internal(&head[prefix_start..], allocator)?;
+        internal_node.insert_new_child(link_byte, node);
+        node = internal_node;
+
+        unconsumed = &head[..prefix_start];
+    }
+
+    Ok(node)
+}
+
+// Returns the index of the first byte at which 'a' and 'b' differ, looking at no more than
+// the first MAX_PREFIX_LEN bytes.
+//
+// Panics if no differing byte is found. Both slices are indexed directly, so it also panics
+// if either slice ends before a difference is found.
+fn common_prefix(a: &[u8], b: &[u8]) -> usize {
+    match (0..MAX_PREFIX_LEN).find(|&i| a[i] != b[i]) {
+        Some(first_mismatch) => first_mismatch,
+        None => panic!("prefixes are equal"),
+    }
+}
--- a/libs/neonart/src/algorithm/lock_and_version.rs
+++ b/libs/neonart/src/algorithm/lock_and_version.rs
@@ -0,0 +1,117 @@
+//! Each node in the tree contains one atomic word that stores three things:
+//!
+//! Bit 0: set if the node is "obsolete". An obsolete node has been removed from the tree,
+//!        but might still be accessed by concurrent readers until the epoch expires.
+//! Bit 1: set if the node is currently write-locked. Used as a spinlock.
+//! Bits 2-63: Version number, incremented every time the node is modified.
+//!
+//! AtomicLockAndVersion represents that.
+
+use std::sync::atomic::{AtomicU64, Ordering};
+
+/// Error returned by the locking functions in this module when the lock word is not in the
+/// expected state: the node is obsolete, it is locked, or its lock word has changed since it
+/// was read. The function names ("..._or_restart") indicate that the caller is expected to
+/// restart its operation.
+pub(crate) struct ConcurrentUpdateError();
+
+/// The per-node lock word: obsolete flag (bit 0), write-lock flag (bit 1) and version
+/// counter (bits 2-63), packed into one atomic 64-bit integer.
+pub(crate) struct AtomicLockAndVersion {
+    // All three components live in this single word, so they are read and updated together.
+    inner: AtomicU64,
+}
+
+impl AtomicLockAndVersion {
+    /// Create a lock word in its initial state: not obsolete, not locked, version 0.
+    pub(crate) fn new() -> AtomicLockAndVersion {
+        let inner = AtomicU64::new(0);
+        Self { inner }
+    }
+}
+
+impl AtomicLockAndVersion {
+    /// Begin an optimistic read. Waits until the node is not write-locked and returns the lock
+    /// word that was observed. The caller passes that value back to the check, unlock and
+    /// upgrade functions later.
+    ///
+    /// Fails if the node has been marked obsolete.
+    pub(crate) fn read_lock_or_restart(&self) -> Result<u64, ConcurrentUpdateError> {
+        let version = self.await_node_unlocked();
+        if is_obsolete(version) {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(version)
+    }
+
+    /// Verify that the lock word still equals `version`. Same check as
+    /// `read_unlock_or_restart`, under a name that reads better in the middle of an operation.
+    pub(crate) fn check_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
+        self.read_unlock_or_restart(version)
+    }
+
+    /// End an optimistic read: fails if the lock word no longer equals `version`, i.e. if the
+    /// node was locked, modified or made obsolete since `read_lock_or_restart` returned.
+    pub(crate) fn read_unlock_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
+        // The caller's reads of the node contents must be complete before the lock word is
+        // re-read, or the check could pass for data that was read after a writer started
+        // modifying it. An acquire *load* only keeps later accesses from moving before it; it
+        // does not keep earlier reads from moving after it. The acquire fence provides the
+        // ordering needed here.
+        std::sync::atomic::fence(Ordering::Acquire);
+        if self.inner.load(Ordering::Acquire) != version {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(())
+    }
+
+    /// Turn an optimistic read into a write lock. Succeeds only if the lock word still equals
+    /// `version`, so the caller knows that nothing it read has changed.
+    pub(crate) fn upgrade_to_write_lock_or_restart(
+        &self,
+        version: u64,
+    ) -> Result<(), ConcurrentUpdateError> {
+        if self
+            .inner
+            .compare_exchange(
+                version,
+                set_locked_bit(version),
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            )
+            .is_err()
+        {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(())
+    }
+
+    /// Try to take the write lock without waiting. Fails if the node is obsolete or already
+    /// locked, or if the lock word changes between the initial load and the compare-exchange.
+    pub(crate) fn write_lock_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
+        // A relaxed load is enough here: the value is only used as the expected value of the
+        // compare-exchange below, which does the actual synchronization.
+        let old = self.inner.load(Ordering::Relaxed);
+        if is_obsolete(old) || is_locked(old) {
+            return Err(ConcurrentUpdateError());
+        }
+        if self
+            .inner
+            .compare_exchange(
+                old,
+                set_locked_bit(old),
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            )
+            .is_err()
+        {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(())
+    }
+
+    /// Release the write lock. Must only be called while the lock is held.
+    pub(crate) fn write_unlock(&self) {
+        // reset locked bit and overflow into version
+        // (the locked bit has value 2, so adding 2 clears it and carries into bit 2, which
+        // is the lowest bit of the version)
+        self.inner.fetch_add(2, Ordering::Release);
+    }
+
+    /// Release the write lock and mark the node obsolete in the same step. Must only be
+    /// called while the lock is held, on a node that is not obsolete yet.
+    pub(crate) fn write_unlock_obsolete(&self) {
+        // set obsolete, reset locked, overflow into version
+        self.inner.fetch_add(3, Ordering::Release);
+    }
+
+    // Helper functions
+
+    /// Spin, yielding the thread between attempts, until the locked bit is clear. Returns the
+    /// lock word that was observed without the locked bit.
+    fn await_node_unlocked(&self) -> u64 {
+        let mut version = self.inner.load(Ordering::Acquire);
+        while is_locked(version) {
+            // spinlock
+            std::thread::yield_now();
+            version = self.inner.load(Ordering::Acquire)
+        }
+        version
+    }
+}
+
+/// Returns `version` with the write-lock flag (bit 1) set. `version` must not have the flag
+/// set already, because the flag is set by addition.
+fn set_locked_bit(version: u64) -> u64 {
+    version + 2
+}
+
+/// True if the obsolete flag (bit 0) is set in `version`.
+fn is_obsolete(version: u64) -> bool {
+    version & 1 != 0
+}
+
+/// True if the write-lock flag (bit 1) is set in `version`.
+fn is_locked(version: u64) -> bool {
+    version & 2 != 0
+}
--- a/libs/neonart/src/algorithm/node_ptr.rs
+++ b/libs/neonart/src/algorithm/node_ptr.rs
--- a/libs/neonart/src/algorithm/node_ref.rs
+++ b/libs/neonart/src/algorithm/node_ref.rs
@@ -0,0 +1,349 @@
+use std::fmt::Debug;
+use std::marker::PhantomData;
+
+use super::node_ptr;
+use super::node_ptr::NodePtr;
+use crate::EpochPin;
+use crate::Value;
+use crate::algorithm::lock_and_version::AtomicLockAndVersion;
+use crate::algorithm::lock_and_version::ConcurrentUpdateError;
+use crate::allocator::ArtAllocator;
+use crate::allocator::OutOfMemoryError;
+
+/// A reference to a node on which no lock has been taken yet. Use `read_lock_or_restart` or
+/// `write_lock_or_restart` to get a reference through which the node can be accessed.
+pub struct NodeRef<'e, V> {
+    ptr: NodePtr<V>,
+
+    // Ties the reference to the lifetime 'e of an EpochPin without storing one.
+    phantom: PhantomData<&'e EpochPin<'e>>,
+}
+
+// A NodeRef is printed as the Debug form of the pointer it wraps.
+impl<'e, V> Debug for NodeRef<'e, V> {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        let ptr = &self.ptr;
+        write!(fmt, "{ptr:?}")
+    }
+}
+
+impl<'e, V: Value> NodeRef<'e, V> {
+    /// Wrap the tree's root pointer in a `NodeRef`.
+    pub(crate) fn from_root_ptr(root_ptr: NodePtr<V>) -> NodeRef<'e, V> {
+        let phantom = PhantomData;
+        NodeRef {
+            ptr: root_ptr,
+            phantom,
+        }
+    }
+
+    /// Take an optimistic read lock on the node. The returned reference remembers the lock
+    /// word that was observed, so that later reads can be validated against it.
+    pub(crate) fn read_lock_or_restart(
+        &self,
+    ) -> Result<ReadLockedNodeRef<'e, V>, ConcurrentUpdateError> {
+        match self.lockword().read_lock_or_restart() {
+            Ok(version) => Ok(ReadLockedNodeRef {
+                ptr: self.ptr,
+                version,
+                phantom: self.phantom,
+            }),
+            Err(err) => Err(err),
+        }
+    }
+
+    /// Take the write lock on the node. Does not wait: fails if the lock cannot be taken
+    /// right away.
+    pub(crate) fn write_lock_or_restart(
+        &self,
+    ) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
+        match self.lockword().write_lock_or_restart() {
+            Ok(()) => Ok(WriteLockedNodeRef {
+                ptr: self.ptr,
+                phantom: self.phantom,
+            }),
+            Err(err) => Err(err),
+        }
+    }
+
+    /// The lock word of the node this reference points to.
+    fn lockword(&self) -> &AtomicLockAndVersion {
+        self.ptr.lockword()
+    }
+}
+
+/// A reference to a node that has been optimistically read-locked. The functions whose names
+/// end in `_or_restart` re-check the version after the read and fail if it has changed. The
+/// plain accessors (`is_leaf`, `get_prefix`, ...) do not check; the caller has to validate
+/// what they returned with `check_or_restart` or `read_unlock_or_restart`.
+pub struct ReadLockedNodeRef<'e, V> {
+    ptr: NodePtr<V>,
+    // The lock word observed when the read lock was taken.
+    version: u64,
+
+    phantom: PhantomData<&'e EpochPin<'e>>,
+}
+
+impl<'e, V: Value> ReadLockedNodeRef<'e, V> {
+    /// Whether the node is a leaf. Not validated against the version.
+    pub(crate) fn is_leaf(&self) -> bool {
+        self.ptr.is_leaf()
+    }
+
+    /// Whether the node reports itself as full. Not validated against the version.
+    pub(crate) fn is_full(&self) -> bool {
+        self.ptr.is_full()
+    }
+
+    /// The node's prefix bytes. Not validated against the version.
+    pub(crate) fn get_prefix(&self) -> &[u8] {
+        self.ptr.get_prefix()
+    }
+
+    /// Note: because we're only holding a read lock, the prefix can change concurrently.
+    /// You must be prepared to restart, if read_unlock() returns error later.
+    ///
+    /// Returns the length of the prefix, or None if it's not a match
+    pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
+        self.ptr.prefix_matches(key)
+    }
+
+    /// Look up the child stored under `key_byte`. The version is re-checked after the lookup,
+    /// so a returned result reflects the node as it was when the read lock was taken.
+    pub(crate) fn find_child_or_restart(
+        &self,
+        key_byte: u8,
+    ) -> Result<Option<NodeRef<'e, V>>, ConcurrentUpdateError> {
+        let found = self.ptr.find_child(key_byte);
+        self.ptr.lockword().check_or_restart(self.version)?;
+
+        Ok(found.map(|child_ptr| NodeRef {
+            ptr: child_ptr,
+            phantom: self.phantom,
+        }))
+    }
+
+    /// Like `find_child_or_restart`, but searches by `min_key_byte` via
+    /// `NodePtr::find_next_child` and also returns the key byte of the child that was found.
+    pub(crate) fn find_next_child_or_restart(
+        &self,
+        min_key_byte: u8,
+    ) -> Result<Option<(u8, NodeRef<'e, V>)>, ConcurrentUpdateError> {
+        let found = self.ptr.find_next_child(min_key_byte);
+        self.ptr.lockword().check_or_restart(self.version)?;
+
+        Ok(found.map(|(found_key_byte, child_ptr)| {
+            let child = NodeRef {
+                ptr: child_ptr,
+                phantom: self.phantom,
+            };
+            (found_key_byte, child)
+        }))
+    }
+
+    /// Get a raw pointer to the value stored in this leaf. The version is re-checked after
+    /// the value has been located.
+    pub(crate) fn get_leaf_value_ptr(&self) -> Result<*const V, ConcurrentUpdateError> {
+        let value_ref = self.ptr.get_leaf_value();
+        self.ptr.lockword().check_or_restart(self.version)?;
+
+        // Extend the lifetime: a raw pointer is not tied to the borrow of 'self'.
+        Ok(std::ptr::from_ref(value_ref))
+    }
+
+    /// Trade the read lock for a write lock. Fails if the node's lock word has changed since
+    /// the read lock was taken.
+    pub(crate) fn upgrade_to_write_lock_or_restart(
+        self,
+    ) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
+        let lockword = self.ptr.lockword();
+        lockword.upgrade_to_write_lock_or_restart(self.version)?;
+
+        Ok(WriteLockedNodeRef {
+            ptr: self.ptr,
+            phantom: self.phantom,
+        })
+    }
+
+    /// Give up the read lock, failing if the lock word has changed since it was taken.
+    pub(crate) fn read_unlock_or_restart(self) -> Result<(), ConcurrentUpdateError> {
+        self.ptr.lockword().check_or_restart(self.version)
+    }
+
+    /// Fail if the lock word has changed since the read lock was taken. The read lock stays
+    /// usable afterwards.
+    pub(crate) fn check_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
+        self.ptr.lockword().check_or_restart(self.version)
+    }
+}
+
+/// A reference to a node that has been write-locked. The lock is released by calling
+/// `write_unlock` or `write_unlock_obsolete`, or automatically when the reference is dropped.
+pub struct WriteLockedNodeRef<'e, V> {
+    // Set to null once the lock has been released explicitly, so that Drop does not release
+    // it a second time.
+    ptr: NodePtr<V>,
+    phantom: PhantomData<&'e EpochPin<'e>>,
+}
+
+impl<'e, V: Value> WriteLockedNodeRef<'e, V> {
+    /// Whether the node reports that it can be shrunk.
+    pub(crate) fn can_shrink(&self) -> bool {
+        self.ptr.can_shrink()
+    }
+
+    /// Number of children of the node.
+    pub(crate) fn num_children(&self) -> usize {
+        self.ptr.num_children()
+    }
+
+    /// Release the write lock.
+    pub(crate) fn write_unlock(mut self) {
+        let lockword = self.ptr.lockword();
+        lockword.write_unlock();
+        // Clear the pointer so that Drop does not unlock again.
+        self.ptr = NodePtr::null();
+    }
+
+    /// Release the write lock and mark the node obsolete.
+    pub(crate) fn write_unlock_obsolete(mut self) {
+        let lockword = self.ptr.lockword();
+        lockword.write_unlock_obsolete();
+        // Clear the pointer so that Drop does not unlock again.
+        self.ptr = NodePtr::null();
+    }
+
+    /// The node's prefix bytes.
+    pub(crate) fn get_prefix(&self) -> &[u8] {
+        self.ptr.get_prefix()
+    }
+
+    /// Shorten the node's prefix to `new_prefix_len` bytes.
+    pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
+        self.ptr.truncate_prefix(new_prefix_len)
+    }
+
+    /// Put `prefix` and `prefix_byte` in front of the node's current prefix.
+    pub(crate) fn prepend_prefix(&mut self, prefix: &[u8], prefix_byte: u8) {
+        self.ptr.prepend_prefix(prefix, prefix_byte)
+    }
+
+    /// Add `child` to the node under `key_byte`.
+    pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
+        self.ptr.insert_child(key_byte, child)
+    }
+
+    /// Mutable access to the value stored in this leaf.
+    pub(crate) fn get_leaf_value_mut(&mut self) -> &mut V {
+        self.ptr.get_leaf_value_mut()
+    }
+
+    /// Allocate a grown replacement for this node. This node itself is left as it is.
+    pub(crate) fn grow<'a, A>(
+        &self,
+        allocator: &'a A,
+    ) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+    where
+        A: ArtAllocator<V>,
+    {
+        self.ptr.grow(allocator).map(|ptr| NewNodeRef {
+            ptr,
+            allocator,
+            extra_nodes: Vec::new(),
+        })
+    }
+
+    /// Allocate a shrunk replacement for this node. This node itself is left as it is.
+    pub(crate) fn shrink<'a, A>(
+        &self,
+        allocator: &'a A,
+    ) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+    where
+        A: ArtAllocator<V>,
+    {
+        self.ptr.shrink(allocator).map(|ptr| NewNodeRef {
+            ptr,
+            allocator,
+            extra_nodes: Vec::new(),
+        })
+    }
+
+    /// The raw pointer to the node.
+    pub(crate) fn as_ptr(&self) -> NodePtr<V> {
+        self.ptr
+    }
+
+    /// Make the child slot for `key_byte` point to `replacement`.
+    pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
+        self.ptr.replace_child(key_byte, replacement);
+    }
+
+    /// Remove the child stored under `key_byte`.
+    pub(crate) fn delete_child(&mut self, key_byte: u8) {
+        self.ptr.delete_child(key_byte);
+    }
+
+    /// Returns the only child of the node, with its key byte.
+    ///
+    /// Panics if the node does not have exactly one child.
+    pub(crate) fn find_remaining_child(&self) -> (u8, NodeRef<'e, V>) {
+        assert_eq!(self.num_children(), 1);
+
+        let Some((key_byte, child_ptr)) = self.ptr.find_next_child(0) else {
+            panic!("could not find only child in node");
+        };
+
+        let child = NodeRef {
+            ptr: child_ptr,
+            phantom: self.phantom,
+        };
+        (key_byte, child)
+    }
+}
+
+impl<'e, V> Drop for WriteLockedNodeRef<'e, V> {
+    /// Releases the write lock, unless it was already released explicitly (in which case
+    /// the pointer has been set to null).
+    fn drop(&mut self) {
+        if self.ptr.is_null() {
+            return;
+        }
+        self.ptr.lockword().write_unlock();
+    }
+}
+
+/// Owner of a newly allocated node that has not been handed over yet. If it is dropped
+/// before `into_ptr()` is called, the node and the nodes listed in `extra_nodes` are
+/// deallocated again.
+pub(crate) struct NewNodeRef<'a, V, A>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    // Null after into_ptr() has been called.
+    ptr: NodePtr<V>,
+    // The allocator to return the nodes to on drop.
+    allocator: &'a A,
+
+    // New nodes that were attached below 'ptr' with insert_new_child().
+    extra_nodes: Vec<NodePtr<V>>,
+}
+
+impl<'a, V, A> NewNodeRef<'a, V, A>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    /// Attach an existing node as a child under `key_byte`. The existing node is not owned
+    /// by this new node: it is not deallocated if this node is dropped.
+    pub(crate) fn insert_old_child(&mut self, key_byte: u8, child: &WriteLockedNodeRef<V>) {
+        self.ptr.insert_child(key_byte, child.as_ptr())
+    }
+
+    /// Hand over the node: returns the raw pointer and disarms the deallocation on drop,
+    /// for this node and for the new nodes attached below it.
+    pub(crate) fn into_ptr(mut self) -> NodePtr<V> {
+        let ptr = self.ptr;
+        self.ptr = NodePtr::null();
+        ptr
+    }
+
+    /// Attach another newly allocated node as a child under `key_byte` and take over
+    /// responsibility for deallocating it, should this node be dropped.
+    pub(crate) fn insert_new_child(&mut self, key_byte: u8, mut child: NewNodeRef<'a, V, A>) {
+        // The child may itself have new nodes attached below it. Take over its list before
+        // into_ptr() consumes the child; otherwise those nodes would no longer be tracked
+        // anywhere and would leak if this node is dropped without being handed over.
+        let child_extra_nodes = std::mem::take(&mut child.extra_nodes);
+
+        let child_ptr = child.into_ptr();
+        self.ptr.insert_child(key_byte, child_ptr);
+        self.extra_nodes.push(child_ptr);
+        self.extra_nodes.extend(child_extra_nodes);
+    }
+}
+
+impl<'a, V, A> Drop for NewNodeRef<'a, V, A>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    /// Returns the newly allocated node, and the extra nodes recorded with it, to the
+    /// allocator. Does nothing if the node was handed over with into_ptr().
+    fn drop(&mut self) {
+        if self.ptr.is_null() {
+            return;
+        }
+
+        self.ptr.deallocate(self.allocator);
+        for extra in &self.extra_nodes {
+            extra.deallocate(self.allocator);
+        }
+    }
+}
+
+/// Allocate a new internal node with the given prefix and no tracked extra nodes.
+///
+/// Returns an error if the allocation fails.
+pub(crate) fn new_internal<'a, V, A>(
+    prefix: &[u8],
+    allocator: &'a A,
+) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    let ptr = node_ptr::new_internal(prefix, allocator)?;
+    let extra_nodes = Vec::new();
+
+    Ok(NewNodeRef {
+        ptr,
+        allocator,
+        extra_nodes,
+    })
+}
+
+/// Allocate a new leaf node holding `value`, with the given prefix and no tracked extra
+/// nodes.
+///
+/// Returns an error if the allocation fails.
+pub(crate) fn new_leaf<'a, V, A>(
+    prefix: &[u8],
+    value: V,
+    allocator: &'a A,
+) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    let ptr = node_ptr::new_leaf(prefix, value, allocator)?;
+    let extra_nodes = Vec::new();
+
+    Ok(NewNodeRef {
+        ptr,
+        allocator,
+        extra_nodes,
+    })
+}
--- a/libs/neonart/src/allocator.rs
+++ b/libs/neonart/src/allocator.rs
@@ -0,0 +1,158 @@
+pub mod block;
+mod multislab;
+mod slab;
+pub mod r#static;
+
+use std::alloc::Layout;
+use std::marker::PhantomData;
+use std::mem::MaybeUninit;
+use std::sync::atomic::Ordering;
+
+use crate::allocator::multislab::MultiSlabAllocator;
+use crate::allocator::r#static::alloc_from_slice;
+
+use spin;
+
+use crate::Tree;
+pub use crate::algorithm::node_ptr::{
+    NodeInternal4, NodeInternal16, NodeInternal48, NodeInternal256, NodeLeaf,
+};
+
+/// Error returned when memory for a new node could not be allocated.
+#[derive(Debug)]
+pub struct OutOfMemoryError();
+
+/// The memory allocation interface used by the tree: one function to get the memory for the
+/// `Tree` struct itself, and an alloc/dealloc pair for each kind of node.
+///
+/// All functions work with raw pointers.
+/// NOTE(review): how an allocation failure is reported by the `alloc_*` functions (null
+/// pointer vs. panic) is not visible in this trait -- check the implementations.
+pub trait ArtAllocator<V: crate::Value> {
+    // Memory for the Tree struct
+    fn alloc_tree(&self) -> *mut Tree<V>;
+
+    // Memory for one node of the given kind
+    fn alloc_node_internal4(&self) -> *mut NodeInternal4<V>;
+    fn alloc_node_internal16(&self) -> *mut NodeInternal16<V>;
+    fn alloc_node_internal48(&self) -> *mut NodeInternal48<V>;
+    fn alloc_node_internal256(&self) -> *mut NodeInternal256<V>;
+    fn alloc_node_leaf(&self) -> *mut NodeLeaf<V>;
+
+    // Counterparts of the alloc_node_* functions
+    fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>);
+    fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>);
+    fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>);
+    fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>);
+    fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>);
+}
+
+/// An `ArtAllocator` that carves everything out of one caller-provided memory area: the
+/// allocator struct itself, room for one `Tree`, and slabs for the five kinds of nodes.
+pub struct ArtMultiSlabAllocator<'t, V>
+where
+    V: crate::Value,
+{
+    // The space reserved for the Tree struct. It is handed out by alloc_tree(), after which
+    // this is None.
+    tree_area: spin::Mutex<Option<&'t mut MaybeUninit<Tree<V>>>>,
+
+    // One slab per node kind, in the order given by LAYOUTS.
+    pub(crate) inner: MultiSlabAllocator<'t, 5>,
+
+    phantom_val: PhantomData<V>,
+}
+
+impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
+    /// Memory layout of each node kind. The position in this array is the slab index of the
+    /// node kind in `inner`.
+    const LAYOUTS: [Layout; 5] = [
+        Layout::new::<NodeInternal4<V>>(),
+        Layout::new::<NodeInternal16<V>>(),
+        Layout::new::<NodeInternal48<V>>(),
+        Layout::new::<NodeInternal256<V>>(),
+        Layout::new::<NodeLeaf<V>>(),
+    ];
+
+    /// Set up an allocator inside `area`. The allocator struct itself is placed at the
+    /// start of the area, followed by the space for the tree; whatever remains is given to
+    /// the slab allocator for the nodes.
+    pub fn new(area: &'t mut [MaybeUninit<u8>]) -> &'t mut ArtMultiSlabAllocator<'t, V> {
+        let (self_slot, rest) = alloc_from_slice::<ArtMultiSlabAllocator<V>>(area);
+        let (tree_slot, rest) = alloc_from_slice::<Tree<V>>(rest);
+
+        self_slot.write(ArtMultiSlabAllocator {
+            tree_area: spin::Mutex::new(Some(tree_slot)),
+            inner: MultiSlabAllocator::new(rest, &Self::LAYOUTS),
+            phantom_val: PhantomData,
+        })
+    }
+}
+
+// The slab indexes used below are the positions of the node kinds in LAYOUTS:
+// 0 = internal4, 1 = internal16, 2 = internal48, 3 = internal256, 4 = leaf.
+impl<'t, V: crate::Value> ArtAllocator<V> for ArtMultiSlabAllocator<'t, V> {
+    /// Hands out the space reserved for the tree. Panics if called more than once.
+    fn alloc_tree(&self) -> *mut Tree<V> {
+        let mut slot = self.tree_area.lock();
+        match slot.take() {
+            Some(tree_area) => tree_area.as_mut_ptr().cast(),
+            None => panic!("cannot allocate more than one tree"),
+        }
+    }
+
+    fn alloc_node_internal4(&self) -> *mut NodeInternal4<V> {
+        let chunk = self.inner.alloc_slab(0);
+        chunk.cast()
+    }
+    fn alloc_node_internal16(&self) -> *mut NodeInternal16<V> {
+        let chunk = self.inner.alloc_slab(1);
+        chunk.cast()
+    }
+    fn alloc_node_internal48(&self) -> *mut NodeInternal48<V> {
+        let chunk = self.inner.alloc_slab(2);
+        chunk.cast()
+    }
+    fn alloc_node_internal256(&self) -> *mut NodeInternal256<V> {
+        let chunk = self.inner.alloc_slab(3);
+        chunk.cast()
+    }
+    fn alloc_node_leaf(&self) -> *mut NodeLeaf<V> {
+        let chunk = self.inner.alloc_slab(4);
+        chunk.cast()
+    }
+
+    fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>) {
+        let chunk = ptr.cast();
+        self.inner.dealloc_slab(0, chunk)
+    }
+    fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>) {
+        let chunk = ptr.cast();
+        self.inner.dealloc_slab(1, chunk)
+    }
+    fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>) {
+        let chunk = ptr.cast();
+        self.inner.dealloc_slab(2, chunk)
+    }
+    fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>) {
+        let chunk = ptr.cast();
+        self.inner.dealloc_slab(3, chunk)
+    }
+    fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>) {
+        let chunk = ptr.cast();
+        self.inner.dealloc_slab(4, chunk)
+    }
+}
+
+impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
+    /// Collect the per-slab counters. Each counter is read separately with relaxed ordering,
+    /// so the result is not a consistent snapshot if allocations are happening concurrently.
+    pub(crate) fn get_statistics(&self) -> ArtMultiSlabStats {
+        let descs = &self.inner.slab_descs;
+        let allocated = |slab_idx: usize| descs[slab_idx].num_allocated.load(Ordering::Relaxed);
+        let blocks = |slab_idx: usize| descs[slab_idx].num_blocks.load(Ordering::Relaxed);
+
+        ArtMultiSlabStats {
+            num_internal4: allocated(0),
+            num_internal16: allocated(1),
+            num_internal48: allocated(2),
+            num_internal256: allocated(3),
+            num_leaf: allocated(4),
+
+            num_blocks_internal4: blocks(0),
+            num_blocks_internal16: blocks(1),
+            num_blocks_internal48: blocks(2),
+            num_blocks_internal256: blocks(3),
+            num_blocks_leaf: blocks(4),
+        }
+    }
+}
+
+/// Counters reported by `ArtMultiSlabAllocator::get_statistics`, one set per node kind.
+#[derive(Clone, Debug)]
+pub struct ArtMultiSlabStats {
+    // Values of the slabs' 'num_allocated' counters
+    pub num_internal4: u64,
+    pub num_internal16: u64,
+    pub num_internal48: u64,
+    pub num_internal256: u64,
+    pub num_leaf: u64,
+
+    // Values of the slabs' 'num_blocks' counters
+    pub num_blocks_internal4: u64,
+    pub num_blocks_internal16: u64,
+    pub num_blocks_internal48: u64,
+    pub num_blocks_internal256: u64,
+    pub num_blocks_leaf: u64,
+}
--- a/libs/neonart/src/allocator/block.rs
+++ b/libs/neonart/src/allocator/block.rs
@@ -0,0 +1,191 @@
+//! Simple allocator of fixed-size blocks
+
+use std::mem::MaybeUninit;
+use std::sync::atomic::{AtomicU64, Ordering};
+
+use spin;
+
+/// Size of each block, in bytes. The block area is also aligned to this.
+pub const BLOCK_SIZE: usize = 16 * 1024;
+
+// Marker for "no block": used as the end-of-list marker in the free list, and as the return
+// value of alloc_block_internal() when there are no blocks left.
+const INVALID_BLOCK: u64 = u64::MAX;
+
+/// Hands out BLOCK_SIZE-sized blocks from a memory area. Blocks that have never been used
+/// are handed out in order; released blocks are tracked in a free list that is stored in
+/// released blocks.
+pub(crate) struct BlockAllocator<'t> {
+    // The block area, starting at the first BLOCK_SIZE-aligned address of the original area
+    blocks_ptr: &'t [MaybeUninit<u8>],
+    // Number of whole blocks in the area
+    num_blocks: u64,
+    // Blocks with a number below this have been handed out at least once
+    num_initialized: AtomicU64,
+
+    // Block number of the first free list block, or INVALID_BLOCK if the free list is empty
+    freelist_head: spin::Mutex<u64>,
+}
+
+// Contents of a block that is used as part of the free list. The contents are protected by
+// their own lock.
+struct FreeListBlock {
+    inner: spin::Mutex<FreeListBlockInner>,
+}
+
+// One element of the free list: the link to the next free list block, and a stack of
+// numbers of free blocks.
+struct FreeListBlockInner {
+    // Block number of the next free list block, or INVALID_BLOCK if this is the last one
+    next: u64,
+
+    // Number of valid entries at the beginning of 'free_blocks'
+    num_free_blocks: u64,
+    free_blocks: [u64; 100], // FIXME: fill the rest of the block
+}
+
+impl<'t> BlockAllocator<'t> {
+    /// Create an allocator over `area`. The bytes before the first BLOCK_SIZE-aligned
+    /// address, and any partial block at the end, are left unused.
+    pub(crate) fn new(area: &'t mut [MaybeUninit<u8>]) -> Self {
+        // Use all the space for the blocks
+        let padding = area.as_ptr().align_offset(BLOCK_SIZE);
+        let remain = &mut area[padding..];
+
+        let num_blocks = (remain.len() / BLOCK_SIZE) as u64;
+
+        BlockAllocator {
+            blocks_ptr: remain,
+            num_blocks,
+            num_initialized: AtomicU64::new(0),
+            freelist_head: spin::Mutex::new(INVALID_BLOCK),
+        }
+    }
+
+    /// Interpret block `blkno` as a free list block.
+    ///
+    /// safety: you must hold a lock on the pointer to this block, otherwise it might get
+    /// reused for another kind of block
+    fn read_freelist_block(&self, blkno: u64) -> &FreeListBlock {
+        let ptr: *const FreeListBlock = self.get_block_ptr(blkno).cast();
+        unsafe { ptr.as_ref().unwrap() }
+    }
+
+    /// Address of the first byte of block `blkno`. Panics if `blkno` is out of range.
+    fn get_block_ptr(&self, blkno: u64) -> *mut u8 {
+        assert!(blkno < self.num_blocks);
+        unsafe {
+            self.blocks_ptr
+                .as_ptr()
+                .byte_offset(blkno as isize * BLOCK_SIZE as isize)
+        }
+        .cast_mut()
+        .cast()
+    }
+
+    /// Allocate one block and return it as a slice of BLOCK_SIZE bytes.
+    ///
+    /// Panics if there are no blocks left.
+    #[allow(clippy::mut_from_ref)]
+    pub(crate) fn alloc_block(&self) -> &mut [MaybeUninit<u8>] {
+        // FIXME: handle OOM
+        let blkno = self.alloc_block_internal();
+        if blkno == INVALID_BLOCK {
+            panic!("out of memory");
+        }
+
+        let ptr: *mut MaybeUninit<u8> = self.get_block_ptr(blkno).cast();
+        unsafe { std::slice::from_raw_parts_mut(ptr, BLOCK_SIZE) }
+    }
+
+    /// Pick a block to hand out and return its number, or INVALID_BLOCK if none is left.
+    /// Released blocks from the free list are preferred over blocks that have never been
+    /// used.
+    fn alloc_block_internal(&self) -> u64 {
+        //  check the free list.
+        {
+            let mut freelist_head = self.freelist_head.lock();
+            if *freelist_head != INVALID_BLOCK {
+                let freelist_block = self.read_freelist_block(*freelist_head);
+
+                // acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
+                let mut g = freelist_block.inner.lock();
+
+                if g.num_free_blocks > 0 {
+                    // Pop the last entry of the stack
+                    g.num_free_blocks -= 1;
+                    let result = g.free_blocks[g.num_free_blocks as usize];
+                    return result;
+                } else {
+                    // consume the freelist block itself
+                    let result = *freelist_head;
+                    *freelist_head = g.next;
+                    // This freelist block is now unlinked and can be repurposed
+                    drop(g);
+                    return result;
+                }
+            }
+        }
+
+        // If there are some blocks left that we've never used, pick next such block
+        // (the compare-exchange loop claims the block number without taking a lock)
+        let mut next_uninitialized = self.num_initialized.load(Ordering::Relaxed);
+        while next_uninitialized < self.num_blocks {
+            match self.num_initialized.compare_exchange(
+                next_uninitialized,
+                next_uninitialized + 1,
+                Ordering::Relaxed,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => {
+                    return next_uninitialized;
+                }
+                Err(old) => {
+                    // Another thread claimed that block; retry with the value it left
+                    next_uninitialized = old;
+                    continue;
+                }
+            }
+        }
+
+        // out of blocks
+        return INVALID_BLOCK;
+    }
+
+    /// Return the block starting at `block_ptr` to the allocator.
+    // TODO: this is currently unused. The slab allocator never releases blocks
+    #[allow(dead_code)]
+    pub(crate) fn release_block(&self, block_ptr: *mut u8) {
+        let blockno = unsafe { block_ptr.byte_offset_from(self.blocks_ptr) / BLOCK_SIZE as isize };
+        self.release_block_internal(blockno as u64);
+    }
+
+    /// Put block `blockno` on the free list: it is recorded in the first free list block if
+    /// that has room, otherwise the released block itself becomes the new first free list
+    /// block.
+    fn release_block_internal(&self, blockno: u64) {
+        let mut freelist_head = self.freelist_head.lock();
+        if *freelist_head != INVALID_BLOCK {
+            let freelist_block = self.read_freelist_block(*freelist_head);
+
+            // acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
+            let mut g = freelist_block.inner.lock();
+
+            let num_free_blocks = g.num_free_blocks;
+            if num_free_blocks < g.free_blocks.len() as u64 {
+                g.free_blocks[num_free_blocks as usize] = blockno;
+                g.num_free_blocks += 1;
+                return;
+            }
+        }
+
+        // Convert the block into a new freelist block
+        let block_ptr: *mut FreeListBlock = self.get_block_ptr(blockno).cast();
+        let init = FreeListBlock {
+            inner: spin::Mutex::new(FreeListBlockInner {
+                next: *freelist_head,
+                num_free_blocks: 0,
+                free_blocks: [INVALID_BLOCK; 100],
+            }),
+        };
+        // NOTE(review): this is an assignment to memory that does not hold a valid
+        // FreeListBlock. That looks fine as long as FreeListBlock has nothing to drop, but
+        // 'block_ptr.write(init)' would state the intent explicitly.
+        unsafe { (*block_ptr) = init };
+        *freelist_head = blockno;
+    }
+
+    // for debugging
+    //
+    // Walks the whole free list to add up the entries of its blocks.
+    // NOTE(review): only the entries are counted, not the free list blocks themselves,
+    // although alloc_block_internal() can hand those out too -- confirm this is intended.
+    pub(crate) fn get_statistics(&self) -> BlockAllocatorStats {
+        let mut num_free_blocks = 0;
+
+        let mut _prev_lock = None;
+        let head_lock = self.freelist_head.lock();
+        let mut next_blk = *head_lock;
+        let mut _head_lock = Some(head_lock);
+        while next_blk != INVALID_BLOCK {
+            let freelist_block = self.read_freelist_block(next_blk);
+            let lock = freelist_block.inner.lock();
+            num_free_blocks += lock.num_free_blocks;
+            next_blk = lock.next;
+            _prev_lock = Some(lock); // hold the lock until we've read the next block
+            _head_lock = None;
+        }
+
+        BlockAllocatorStats {
+            num_blocks: self.num_blocks,
+            num_initialized: self.num_initialized.load(Ordering::Relaxed),
+            num_free_blocks,
+        }
+    }
+}
+
+/// Counters reported by `BlockAllocator::get_statistics`.
+#[derive(Clone, Debug)]
+pub struct BlockAllocatorStats {
+    // Total number of blocks in the area
+    pub num_blocks: u64,
+    // Number of blocks that have been handed out at least once
+    pub num_initialized: u64,
+    // Sum of the 'num_free_blocks' counts of all free list blocks
+    pub num_free_blocks: u64,
+}
--- a/libs/neonart/src/allocator/multislab.rs
+++ b/libs/neonart/src/allocator/multislab.rs
@@ -0,0 +1,33 @@
+use std::alloc::Layout;
+use std::mem::MaybeUninit;
+
+use crate::allocator::block::BlockAllocator;
+use crate::allocator::slab::SlabDesc;
+
+/// A set of `N` slab allocators, one per chunk layout, that all draw their blocks from one
+/// shared [`BlockAllocator`].
+pub struct MultiSlabAllocator<'t, const N: usize> {
+    /// Source of blocks for all the slabs
+    pub(crate) block_allocator: BlockAllocator<'t>,
+
+    /// One slab descriptor per layout, in the same order as the layouts passed to `new()`
+    pub(crate) slab_descs: [SlabDesc; N],
+}
+
+impl<'t, const N: usize> MultiSlabAllocator<'t, N> {
+    /// Build an allocator on top of `area`, with one slab per entry in `layouts`.
+    ///
+    /// Slab `i` hands out chunks of layout `layouts[i]`.
+    pub(crate) fn new(
+        area: &'t mut [MaybeUninit<u8>],
+        layouts: &[Layout; N],
+    ) -> MultiSlabAllocator<'t, N> {
+        let block_allocator = BlockAllocator::new(area);
+        let slab_descs = layouts.each_ref().map(SlabDesc::new);
+
+        Self {
+            block_allocator,
+            slab_descs,
+        }
+    }
+
+    /// Allocate one chunk from slab number `slab_idx`.
+    ///
+    /// Panics if `slab_idx` is not a valid index into the slab array.
+    pub(crate) fn alloc_slab(&self, slab_idx: usize) -> *mut u8 {
+        let desc = &self.slab_descs[slab_idx];
+        desc.alloc_chunk(&self.block_allocator)
+    }
+
+    /// Return the chunk at `ptr` to slab number `slab_idx`.
+    ///
+    /// Panics if `slab_idx` is not a valid index into the slab array.
+    pub(crate) fn dealloc_slab(&self, slab_idx: usize, ptr: *mut u8) {
+        let desc = &self.slab_descs[slab_idx];
+        desc.dealloc_chunk(ptr, &self.block_allocator)
+    }
+}
--- a/libs/neonart/src/allocator/slab.rs
+++ b/libs/neonart/src/allocator/slab.rs
@@ -0,0 +1,432 @@
+//! A slab allocator that carves out fixed-size chunks from larger blocks.
+//!
+//!
+
+use std::alloc::Layout;
+use std::mem::MaybeUninit;
+use std::ops::Deref;
+use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
+
+use spin;
+
+use super::alloc_from_slice;
+use super::block::BlockAllocator;
+
+use crate::allocator::block::BLOCK_SIZE;
+
+/// Descriptor of one slab: an allocator for chunks of a single `Layout`, carved out of blocks
+/// obtained from a `BlockAllocator`.
+pub(crate) struct SlabDesc {
+    /// Layout of the chunks handed out by this slab
+    pub(crate) layout: Layout,
+
+    /// The blocks owned by this slab, split into full and non-full ones
+    block_lists: spin::RwLock<BlockLists>,
+
+    /// Statistics: incremented whenever `alloc_chunk()` allocates a new block
+    pub(crate) num_blocks: AtomicU64,
+    /// Statistics: incremented by `alloc_chunk()` and decremented by `dealloc_chunk()`
+    pub(crate) num_allocated: AtomicU64,
+}
+
+// FIXME: Not sure if SlabDesc is really Sync or Send. It probably is when it's empty, but
+// 'block_lists' contains raw pointers when it's not empty. In the current use as part of the
+// ART tree, SlabDescs are only moved during initialization.
+unsafe impl Sync for SlabDesc {}
+unsafe impl Send for SlabDesc {}
+
+/// The two intrusive lists that together hold all the blocks of a slab.
+#[derive(Default, Debug)]
+struct BlockLists {
+    /// Blocks that `alloc_chunk()` found to have no free chunks
+    full_blocks: BlockList,
+    /// Blocks that `alloc_chunk()` tries to allocate from. A block on this list can have run out
+    /// of free chunks; it is moved to `full_blocks` lazily, when `alloc_chunk()` notices it.
+    nonfull_blocks: BlockList,
+}
+
+impl BlockLists {
+    /// Unlink `elem` from whichever of the two lists it is on.
+    ///
+    /// The owning list only needs to be identified when `elem` is at the tail or the head,
+    /// because only then do the list's own end pointers need updating. An interior node is
+    /// unlinked through its neighbours alone.
+    ///
+    /// # Safety
+    ///
+    /// `elem` must point to a valid block header that is currently linked into either
+    /// `full_blocks` or `nonfull_blocks`.
+    unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
+        let (at_tail, at_head) = unsafe { ((*elem).next.is_null(), (*elem).prev.is_null()) };
+
+        let owner = match (at_tail, at_head) {
+            // Last node of a list: compare against the tail pointers
+            (true, _) if self.full_blocks.tail == elem => Some(&mut self.full_blocks),
+            (true, _) => Some(&mut self.nonfull_blocks),
+            // First node of a list: compare against the head pointers
+            (false, true) if self.full_blocks.head == elem => Some(&mut self.full_blocks),
+            (false, true) => Some(&mut self.nonfull_blocks),
+            // Interior node
+            (false, false) => None,
+        };
+        unsafe { unlink_slab_block(owner, elem) };
+    }
+}
+
+/// Unlink `elem` from the doubly-linked list it is on.
+///
+/// `list` is the list that `elem` belongs to. It is only consulted (and must then be `Some`) when
+/// `elem` is the first or the last node, because the list's `head` / `tail` pointer has to be
+/// updated in that case. The `prev` / `next` fields of `elem` itself are left untouched.
+///
+/// # Safety
+///
+/// `elem` and its neighbours must be valid, correctly linked block headers.
+unsafe fn unlink_slab_block(mut list: Option<&mut BlockList>, elem: *mut SlabBlockHeader) {
+    unsafe {
+        let succ = (*elem).next;
+        let pred = (*elem).prev;
+
+        // Fix up whatever pointed back at us from the successor side
+        if succ.is_null() {
+            let owner = list.as_mut().unwrap();
+            assert_eq!(owner.tail, elem);
+            owner.tail = pred;
+        } else {
+            assert_eq!((*succ).prev, elem);
+            (*succ).prev = pred;
+        }
+
+        // ... and from the predecessor side
+        if pred.is_null() {
+            let owner = list.as_mut().unwrap();
+            assert_eq!(owner.head, elem);
+            owner.head = succ;
+        } else {
+            assert_eq!((*pred).next, elem);
+            (*pred).next = succ;
+        }
+    }
+}
+
+/// Intrusive doubly-linked list of slab blocks, linked through the `prev` / `next` fields of
+/// `SlabBlockHeader`. Both pointers are null when the list is empty.
+#[derive(Debug)]
+struct BlockList {
+    /// First block in the list, or null
+    head: *mut SlabBlockHeader,
+    /// Last block in the list, or null
+    tail: *mut SlabBlockHeader,
+}
+
+impl Default for BlockList {
+    /// An empty list: both ends are null.
+    fn default() -> Self {
+        let null = std::ptr::null_mut();
+        Self {
+            head: null,
+            tail: null,
+        }
+    }
+}
+
+impl BlockList {
+    /// Insert `elem` at the front of the list, overwriting its `prev` / `next` links.
+    ///
+    /// # Safety
+    ///
+    /// `elem` must point to a valid block header that is not currently on any list.
+    unsafe fn push_head(&mut self, elem: *mut SlabBlockHeader) {
+        let old_head = self.head;
+        unsafe {
+            (*elem).next = old_head;
+            if old_head.is_null() {
+                // The list was empty, so the new node is also the last one
+                self.tail = elem;
+            } else {
+                (*old_head).prev = elem;
+            }
+            (*elem).prev = std::ptr::null_mut();
+        }
+        self.head = elem;
+    }
+
+    /// Returns true if there are no blocks on the list.
+    fn is_empty(&self) -> bool {
+        self.head.is_null()
+    }
+
+    /// Remove `elem` from this list.
+    ///
+    /// # Safety
+    ///
+    /// `elem` must point to a valid block header that is currently on this list.
+    unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
+        let this = Some(self);
+        unsafe { unlink_slab_block(this, elem) }
+    }
+
+    /// Print one line per block on the list to stderr.
+    #[cfg(test)]
+    fn dump(&self) {
+        let mut cursor = self.head;
+
+        while let Some(blk) = unsafe { cursor.as_ref() } {
+            eprintln!(
+                "  blk {:?} (free {}/{})",
+                cursor,
+                blk.num_free_chunks.load(Ordering::Relaxed),
+                blk.num_chunks
+            );
+            cursor = blk.next;
+        }
+    }
+}
+
+impl SlabDesc {
+    /// Create an empty slab for chunks of the given layout. No blocks are allocated until the
+    /// first call to `alloc_chunk()`.
+    pub(crate) fn new(layout: &Layout) -> SlabDesc {
+        let block_lists = spin::RwLock::new(BlockLists::default());
+
+        Self {
+            layout: *layout,
+            block_lists,
+            num_blocks: AtomicU64::new(0),
+            num_allocated: AtomicU64::new(0),
+        }
+    }
+}
+
+/// Header stored at the beginning of every block owned by a slab. The chunks follow the header.
+#[derive(Debug)]
+struct SlabBlockHeader {
+    /// Head of the singly-linked list of free chunks in this block, or null if there are none
+    free_chunks_head: spin::Mutex<*mut FreeChunk>,
+    /// Number of chunks on the free list. Updated while holding the `free_chunks_head` lock.
+    num_free_chunks: AtomicU32,
+    num_chunks: u32, // this is really a constant for a given Layout
+
+    // these fields are protected by the lock on the BlockLists
+    prev: *mut SlabBlockHeader,
+    next: *mut SlabBlockHeader,
+}
+
+/// Free-list link that is written into the memory of a chunk while the chunk is not allocated.
+struct FreeChunk {
+    /// Next free chunk in the same block, or null
+    next: *mut FreeChunk,
+}
+
+/// Either a read or a write guard on a `spin::RwLock`. Lets `alloc_chunk()` run the same code
+/// with either lock mode.
+enum ReadOrWriteGuard<'a, T> {
+    Read(spin::RwLockReadGuard<'a, T>),
+    Write(spin::RwLockWriteGuard<'a, T>),
+}
+
+impl<'a, T> Deref for ReadOrWriteGuard<'a, T> {
+    type Target = T;
+
+    /// Shared access to the protected value, regardless of the lock mode.
+    fn deref(&self) -> &<Self as Deref>::Target {
+        match self {
+            Self::Read(guard) => &**guard,
+            Self::Write(guard) => &**guard,
+        }
+    }
+}
+
+impl SlabDesc {
+    /// Allocate one chunk from this slab.
+    ///
+    /// First tries to pop a chunk off the free list of the block at the head of the non-full
+    /// list, holding only a read lock on the block lists. If that block turns out to have no
+    /// free chunks, the block lists are re-locked in write mode so that exhausted blocks can be
+    /// moved to the full list, and the search continues with the next block. If no block has a
+    /// free chunk, a new block is allocated from `block_allocator`.
+    ///
+    /// The returned memory is not initialized.
+    pub fn alloc_chunk(&self, block_allocator: &BlockAllocator) -> *mut u8 {
+        // Are there any free chunks?
+        let mut acquire_write = false;
+        'outer: loop {
+            // The guard is dropped on every 'continue 'outer', so the read lock is released
+            // before the write lock is requested.
+            let mut block_lists_guard = if acquire_write {
+                ReadOrWriteGuard::Write(self.block_lists.write())
+            } else {
+                ReadOrWriteGuard::Read(self.block_lists.read())
+            };
+            'inner: loop {
+                let block_ptr = block_lists_guard.nonfull_blocks.head;
+                if block_ptr.is_null() {
+                    // No candidate blocks left. Fall through to allocating a new block.
+                    break 'outer;
+                }
+                unsafe {
+                    let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
+                    if !(*free_chunks_head).is_null() {
+                        // Pop the first chunk off the block's free list
+                        let result = *free_chunks_head;
+                        (*free_chunks_head) = (*result).next;
+                        let _old = (*block_ptr).num_free_chunks.fetch_sub(1, Ordering::Relaxed);
+
+                        self.num_allocated.fetch_add(1, Ordering::Relaxed);
+                        return result.cast();
+                    }
+                }
+
+                // The block at the head of the list was full. Grab write lock and retry
+                match block_lists_guard {
+                    ReadOrWriteGuard::Read(_) => {
+                        acquire_write = true;
+                        continue 'outer;
+                    }
+                    ReadOrWriteGuard::Write(ref mut g) => {
+                        // move the node to the list of full blocks
+                        unsafe {
+                            g.nonfull_blocks.unlink(block_ptr);
+                            g.full_blocks.push_head(block_ptr);
+                        };
+                        continue 'inner;
+                    }
+                }
+            }
+        }
+
+        // no free chunks. Allocate a new block (and the chunk from that)
+        let (new_block, new_chunk) = self.alloc_block_and_chunk(block_allocator);
+        self.num_blocks.fetch_add(1, Ordering::Relaxed);
+
+        // Add the block to the list in the SlabDesc
+        unsafe {
+            let mut block_lists_guard = self.block_lists.write();
+            block_lists_guard.nonfull_blocks.push_head(new_block);
+        }
+        self.num_allocated.fetch_add(1, Ordering::Relaxed);
+        new_chunk
+    }
+
+    /// Return a chunk to the slab.
+    ///
+    /// The chunk is pushed onto the free list of the block that contains it. If that makes it the
+    /// block's only free chunk, the block is also (re)inserted at the head of the non-full list.
+    /// Blocks are currently never given back to the block allocator, which is why
+    /// `_block_allocator` is unused.
+    ///
+    /// NOTE(review): the owning block is derived purely from the chunk's address, which assumes
+    /// that blocks start at addresses that are multiples of `BLOCK_SIZE` and that `chunk_ptr` was
+    /// returned by `alloc_chunk()` of this slab. Neither is checked here.
+    pub fn dealloc_chunk(&self, chunk_ptr: *mut u8, _block_allocator: &BlockAllocator) {
+        // Find the block it belongs to. You can find the block from the address. (And knowing the
+        // layout, you could calculate the chunk number too.)
+        let block_ptr: *mut SlabBlockHeader = {
+            let block_addr = (chunk_ptr.addr() / BLOCK_SIZE) * BLOCK_SIZE;
+            chunk_ptr.with_addr(block_addr).cast()
+        };
+        let chunk_ptr: *mut FreeChunk = chunk_ptr.cast();
+
+        // Mark the chunk as free in 'freechunks' list
+        let num_chunks;
+        let num_free_chunks;
+        unsafe {
+            let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
+            (*chunk_ptr).next = *free_chunks_head;
+            *free_chunks_head = chunk_ptr;
+
+            // The counter is updated while the free list lock is still held
+            num_free_chunks = (*block_ptr).num_free_chunks.fetch_add(1, Ordering::Relaxed) + 1;
+            num_chunks = (*block_ptr).num_chunks;
+        }
+
+        if num_free_chunks == 1 {
+            // If the block was full previously, add it to the nonfull blocks list. Note that
+            // we're not holding the lock anymore, so it can immediately become full again.
+            // That's harmless, it will be moved back to the full list again when a call
+            // to alloc_chunk() sees it.
+            //
+            // The block can be on either list at this point, because blocks are moved to the
+            // full list lazily. BlockLists::unlink() handles both cases.
+            let mut block_lists = self.block_lists.write();
+            unsafe {
+                block_lists.unlink(block_ptr);
+                block_lists.nonfull_blocks.push_head(block_ptr);
+            };
+        } else if num_free_chunks == num_chunks {
+            // If the block became completely empty, move it to the free list
+            // TODO
+            // FIXME: we're still holding the spinlock. It's not exactly safe to return it to
+            // the free blocks list, is it? Defer it as garbage to wait out concurrent updates?
+            //block_allocator.release_block()
+        }
+
+        // update stats
+        self.num_allocated.fetch_sub(1, Ordering::Relaxed);
+    }
+
+    /// Allocate a new block from `block_allocator`, carve it into chunks, and return the block
+    /// header together with the first chunk.
+    ///
+    /// The returned chunk is already accounted for as allocated: the block's free list starts at
+    /// the second chunk and `num_free_chunks` is `num_chunks - 1`. The header's `prev` / `next`
+    /// links are null; the caller is responsible for linking the block into a `BlockList`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if not even a single chunk fits in the block after the header.
+    fn alloc_block_and_chunk(
+        &self,
+        block_allocator: &BlockAllocator,
+    ) -> (*mut SlabBlockHeader, *mut u8) {
+        // A free chunk is overlaid with a `FreeChunk` list node, so every chunk must be at least
+        // as large and as aligned as one. Padding the size to the alignment makes the stride keep
+        // every chunk in the block aligned, not just the first one.
+        let chunk_layout = Layout::from_size_align(
+            self.layout.size().max(std::mem::size_of::<FreeChunk>()),
+            self.layout.align().max(std::mem::align_of::<FreeChunk>()),
+        )
+        .expect("invalid slab chunk layout")
+        .pad_to_align();
+        let chunk_size = chunk_layout.size();
+
+        // fixme: handle OOM
+        let block_slice: &mut [MaybeUninit<u8>] = block_allocator.alloc_block();
+        let (block_header, remain) = alloc_from_slice::<SlabBlockHeader>(block_slice);
+
+        let padding = remain.as_ptr().align_offset(chunk_layout.align());
+
+        // Without this check, 'num_chunks - 1' below would underflow for a chunk that is too
+        // large for a block, and the initialization loop would scribble over foreign memory.
+        let num_chunks = remain.len().saturating_sub(padding) / chunk_size;
+        assert!(
+            num_chunks > 0,
+            "slab chunk of {chunk_size} bytes does not fit in a block"
+        );
+
+        let first_chunk_ptr: *mut FreeChunk = remain[padding..].as_mut_ptr().cast();
+
+        unsafe {
+            // Thread all the chunks into a singly-linked list, in address order
+            let mut chunk_ptr = first_chunk_ptr;
+            for _ in 0..num_chunks - 1 {
+                let next_chunk_ptr = chunk_ptr.byte_add(chunk_size);
+                (*chunk_ptr).next = next_chunk_ptr;
+                chunk_ptr = next_chunk_ptr;
+            }
+            (*chunk_ptr).next = std::ptr::null_mut();
+
+            // The first chunk is handed out to the caller right away, so the block's free list
+            // starts at the second chunk.
+            let result_chunk = first_chunk_ptr;
+
+            let block_header = block_header.write(SlabBlockHeader {
+                free_chunks_head: spin::Mutex::new((*first_chunk_ptr).next),
+                prev: std::ptr::null_mut(),
+                next: std::ptr::null_mut(),
+                num_chunks: num_chunks as u32,
+                num_free_chunks: AtomicU32::new(num_chunks as u32 - 1),
+            });
+
+            (block_header, result_chunk.cast())
+        }
+    }
+
+    /// Print the slab's counters and the contents of both block lists to stderr. The block
+    /// lists are read-locked while printing.
+    #[cfg(test)]
+    fn dump(&self) {
+        let blocks = self.num_blocks.load(Ordering::Relaxed);
+        let allocated = self.num_allocated.load(Ordering::Relaxed);
+        eprintln!(
+            "slab dump ({} blocks, {} allocated chunks)",
+            blocks, allocated
+        );
+
+        let lists = self.block_lists.read();
+        let sections = [
+            ("nonfull blocks:", &lists.nonfull_blocks),
+            ("full blocks:", &lists.full_blocks),
+        ];
+        for (title, list) in sections {
+            eprintln!("{}", title);
+            list.dump();
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use rand::Rng;
+    use rand_distr::Zipf;
+
+    /// Test payload. The large dummy array limits the number of objects that fit in one block,
+    /// so that the tests exercise several blocks.
+    struct TestObject {
+        val: usize,
+        _dummy: [u8; BLOCK_SIZE / 4],
+    }
+
+    /// A slab for `TestObject`s, bundled with the block allocator that backs it
+    struct TestObjectSlab<'a>(SlabDesc, BlockAllocator<'a>);
+    impl<'a> TestObjectSlab<'a> {
+        fn new(block_allocator: BlockAllocator<'a>) -> TestObjectSlab<'a> {
+            TestObjectSlab(SlabDesc::new(&Layout::new::<TestObject>()), block_allocator)
+        }
+
+        /// Allocate an object and set its `val`. The rest of the object is left uninitialized.
+        fn alloc(&self, val: usize) -> *mut TestObject {
+            let obj: *mut TestObject = self.0.alloc_chunk(&self.1).cast();
+            unsafe { (*obj).val = val };
+            obj
+        }
+
+        fn dealloc(&self, obj: *mut TestObject) {
+            self.0.dealloc_chunk(obj.cast(), &self.1)
+        }
+    }
+
+    #[test]
+    fn test_slab_alloc() {
+        const MEM_SIZE: usize = 100000000;
+        let mut area = Box::new_uninit_slice(MEM_SIZE);
+        let block_allocator = BlockAllocator::new(&mut area);
+
+        let slab = TestObjectSlab::new(block_allocator);
+
+        let mut all: Vec<*mut TestObject> = Vec::new();
+        for i in 0..11 {
+            all.push(slab.alloc(i));
+        }
+        for i in 0..11 {
+            assert!(unsafe { (*all[i]).val == i });
+        }
+
+        // Randomly free and re-allocate the objects. Each slot in 'all' is either a live object
+        // whose 'val' equals the slot's index, or null.
+        let distribution = Zipf::new(10 as f64, 1.1).unwrap();
+        let mut rng = rand::rng();
+        for _ in 0..100000 {
+            let idx = rng.sample(distribution) as usize;
+            let ptr: *mut TestObject = all[idx];
+            if !ptr.is_null() {
+                assert_eq!(unsafe { (*ptr).val }, idx);
+                slab.dealloc(ptr);
+                all[idx] = std::ptr::null_mut();
+            } else {
+                all[idx] = slab.alloc(idx);
+            }
+        }
+
+        // Dump the final state once. Doing this on every iteration floods the captured test
+        // output with hundreds of thousands of lines and slows the test down.
+        slab.0.dump();
+    }
+
+    /// Create a standalone block header on the heap. `i` is stored in `num_chunks`, just to
+    /// tell the headers apart.
+    fn new_test_blk(i: u32) -> *mut SlabBlockHeader {
+        Box::into_raw(Box::new(SlabBlockHeader {
+            free_chunks_head: spin::Mutex::new(std::ptr::null_mut()),
+            num_free_chunks: AtomicU32::new(0),
+            num_chunks: i,
+            prev: std::ptr::null_mut(),
+            next: std::ptr::null_mut(),
+        }))
+    }
+
+    #[test]
+    fn test_block_linked_list() {
+        // note: these are leaked, but that's OK for tests
+        let a = new_test_blk(0);
+        let b = new_test_blk(1);
+
+        let mut list = BlockList::default();
+        assert!(list.is_empty());
+
+        unsafe {
+            list.push_head(a);
+            assert!(!list.is_empty());
+            list.unlink(a);
+        }
+        assert!(list.is_empty());
+
+        unsafe {
+            list.push_head(b);
+            list.push_head(a);
+            assert_eq!(list.head, a);
+            assert_eq!((*a).next, b);
+            assert_eq!((*b).prev, a);
+            assert_eq!(list.tail, b);
+
+            list.unlink(a);
+            list.unlink(b);
+            assert!(list.is_empty());
+        }
+    }
+}
--- a/libs/neonart/src/allocator/static.rs
+++ b/libs/neonart/src/allocator/static.rs
@@ -0,0 +1,44 @@
+use std::mem::MaybeUninit;
+
+/// Carve space for one `T` off the front of `area`.
+///
+/// Skips as many bytes as needed to align the result for `T`. Returns the (uninitialized) slot
+/// for the value and the rest of the area that follows it.
+///
+/// # Panics
+///
+/// Panics with "out of memory" if the padding plus the size of `T` does not fit in `area`.
+pub fn alloc_from_slice<T>(
+    area: &mut [MaybeUninit<u8>],
+) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
+    let layout = std::alloc::Layout::new::<T>();
+    let size = layout.size();
+
+    // number of bytes to skip to satisfy the alignment requirement
+    let padding = area.as_mut_ptr().align_offset(layout.align());
+    if padding + size > area.len() {
+        panic!("out of memory");
+    }
+
+    let (_skipped, aligned) = area.split_at_mut(padding);
+    let (slot_bytes, remain) = aligned.split_at_mut(size);
+
+    let slot_ptr: *mut MaybeUninit<T> = slot_bytes.as_mut_ptr().cast();
+    let slot = unsafe { &mut *slot_ptr };
+
+    (slot, remain)
+}
+
+/// Carve space for an array of `len` values of type `T` off the front of `area`.
+///
+/// Skips as many bytes as needed to align the result for `T`. Returns the (uninitialized) array
+/// and the rest of the area that follows it.
+///
+/// # Panics
+///
+/// Panics with "out of memory" if the padding plus the array does not fit in `area`, including
+/// when the size of the array overflows `usize`.
+pub fn alloc_array_from_slice<T>(
+    area: &mut [MaybeUninit<u8>],
+    len: usize,
+) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
+    let layout = std::alloc::Layout::new::<T>();
+
+    let area_start = area.as_mut_ptr();
+
+    // pad to satisfy alignment requirements
+    let padding = area_start.align_offset(layout.align());
+
+    // Use checked arithmetic: with plain '*' and '+', a huge 'len' wraps around in release
+    // builds and slips past the bounds check.
+    let array_size = match layout.size().checked_mul(len) {
+        Some(size)
+            if size
+                .checked_add(padding)
+                .is_some_and(|end| end <= area.len()) =>
+        {
+            size
+        }
+        _ => panic!("out of memory"),
+    };
+    let area = &mut area[padding..];
+    let (result_area, remain) = area.split_at_mut(array_size);
+
+    let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
+    // Build the slice straight from the raw pointer, which is valid for the whole of
+    // 'result_area'. Going through a reference to the first element would be wrong for an empty
+    // array, and would only cover a single element.
+    //
+    // SAFETY: 'result_ptr' is non-null, aligned for T, and points to 'len * size_of::<T>()'
+    // bytes that are exclusively borrowed for the returned lifetime. MaybeUninit<T> has no
+    // validity requirements on the contents.
+    let result = unsafe { std::slice::from_raw_parts_mut(result_ptr, len) };
+
+    (result, remain)
+}
--- a/libs/neonart/src/epoch.rs
+++ b/libs/neonart/src/epoch.rs
@@ -0,0 +1,147 @@
+//! This is similar to crossbeam_epoch crate, but works in shared memory
+
+use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
+
+use crossbeam_utils::CachePadded;
+use spin;
+
+const NUM_SLOTS: usize = 1000;
+
+/// This is the struct that is stored in shmem
+///
+/// Each participant slot holds a 64-bit value:
+///
+/// bit 0: is it pinned or not?
+/// rest of the bits are the epoch counter.
+///
+/// The global epoch starts at 2 and is advanced in steps of 2, so its bit 0 is always clear.
+pub struct EpochShared {
+    /// The current epoch
+    global_epoch: AtomicU64,
+    /// One slot per potential concurrent pin. A slot is claimed by setting its bit 0.
+    participants: [CachePadded<AtomicU64>; NUM_SLOTS],
+
+    /// Taken (with try_lock) by `broadcast()`, so that only one broadcast runs at a time
+    broadcast_lock: spin::Mutex<()>,
+}
+
+impl EpochShared {
+    /// Create the shared state, with the global epoch and all the slots at epoch 2, unpinned.
+    pub fn new() -> EpochShared {
+        EpochShared {
+            global_epoch: AtomicU64::new(2),
+            participants: [const { CachePadded::new(AtomicU64::new(2)) }; NUM_SLOTS],
+            broadcast_lock: spin::Mutex::new(()),
+        }
+    }
+
+    /// Create a handle that can be used to pin the epoch.
+    pub fn register(&self) -> LocalHandle {
+        LocalHandle {
+            global: self,
+            last_slot: AtomicUsize::new(0), // todo: choose more intelligently
+        }
+    }
+
+    /// Release the pin on `slot`, and bring the slot up to the current global epoch.
+    fn release_pin(&self, slot: usize, _epoch: u64) {
+        let global_epoch = self.global_epoch.load(Ordering::Acquire);
+        // Release ordering, so that everything the pinning thread did while pinned is ordered
+        // before the slot is seen as unpinned. The global epoch is always even, so this also
+        // clears the pin flag.
+        self.participants[slot].store(global_epoch, Ordering::Release);
+    }
+
+    /// Claim a free slot, starting the search at `slot_hint`, by setting its pin flag.
+    ///
+    /// Returns the slot number and the epoch that the slot was at, which becomes the epoch of
+    /// the pin. Spins until a slot becomes free if all `NUM_SLOTS` slots are pinned.
+    fn pin_internal(&self, slot_hint: usize) -> (usize, u64) {
+        // pick a slot
+        let mut slot = slot_hint;
+        let epoch = loop {
+            // Acquire, so that accesses made under the pin cannot be reordered before it, and so
+            // that we see everything that happened before the slot was last advanced.
+            let old = self.participants[slot].fetch_or(1, Ordering::AcqRel);
+            if old & 1 == 0 {
+                // Got this slot
+                break old;
+            }
+
+            // the slot was busy by another thread / process. try a different slot
+            slot += 1;
+            if slot == NUM_SLOTS {
+                slot = 0;
+            }
+        };
+        (slot, epoch)
+    }
+
+    /// Advance the global epoch by one step, and return the new epoch.
+    pub(crate) fn advance(&self) -> u64 {
+        // Advance the global epoch. fetch_add wraps around on overflow, so compute the return
+        // value the same way.
+        let old_epoch = self.global_epoch.fetch_add(2, Ordering::AcqRel);
+
+        // Anyone that releases their pin after this will update their slot.
+        old_epoch.wrapping_add(2)
+    }
+
+    /// Bring all the unpinned slots up to the current global epoch.
+    ///
+    /// Does nothing if another broadcast is already in progress.
+    pub(crate) fn broadcast(&self) {
+        let Some(_guard) = self.broadcast_lock.try_lock() else {
+            return;
+        };
+
+        let epoch = self.global_epoch.load(Ordering::Acquire);
+
+        // Update all free slots that are behind. Don't assume that a free slot is exactly one
+        // step behind: if the epoch was advanced more than once between two broadcasts (e.g. by
+        // concurrent writers, or because a broadcast was skipped), a slot that is further behind
+        // would otherwise never be updated, and would hold back get_oldest() forever.
+        for slot in &self.participants {
+            let seen = slot.load(Ordering::Relaxed);
+            let lag = epoch.wrapping_sub(seen);
+            if seen & 1 != 0 || lag == 0 || lag > u64::MAX / 2 {
+                // pinned, already up-to-date, or already ahead of the epoch that we read
+                continue;
+            }
+            // If this fails, the slot was concurrently pinned or updated. Either is fine.
+            let _ = slot.compare_exchange(seen, epoch, Ordering::AcqRel, Ordering::Relaxed);
+        }
+    }
+
+    /// Return the oldest epoch found in any of the slots, pinned or not.
+    ///
+    /// The returned value is a plain epoch, without the pin flag. Nothing is pinned at an epoch
+    /// older than the returned value, so garbage from strictly older epochs can be recycled.
+    pub(crate) fn get_oldest(&self) -> u64 {
+        // Read all slots.
+        let now = self.global_epoch.load(Ordering::Acquire);
+        let mut oldest = now;
+        for slot in &self.participants {
+            // Strip the pin flag. Otherwise a slot pinned at epoch E would be reported as E + 1,
+            // and garbage from epoch E would be considered older than the pin.
+            let this_epoch = slot.load(Ordering::Acquire) & !1;
+            let delta = now.wrapping_sub(this_epoch);
+            if delta > u64::MAX / 2 {
+                // this is very recent
+                continue;
+            }
+            if delta > now.wrapping_sub(oldest) {
+                oldest = this_epoch;
+            }
+        }
+        oldest
+    }
+
+    /// Return the current global epoch.
+    pub(crate) fn get_current(&self) -> u64 {
+        self.global_epoch.load(Ordering::Acquire)
+    }
+}
+
+/// A pin on the epoch. The pin is released when this is dropped.
+pub(crate) struct EpochPin<'e> {
+    /// The participant slot that this pin occupies
+    slot: usize,
+    /// The epoch that the slot was at when it was pinned
+    pub(crate) epoch: u64,
+
+    handle: &'e LocalHandle<'e>,
+}
+
+impl Drop for EpochPin<'_> {
+    /// Give the slot back, so that it can be pinned again.
+    fn drop(&mut self) {
+        let shared = self.handle.global;
+        shared.release_pin(self.slot, self.epoch);
+    }
+}
+
+/// Handle for pinning the epoch, created by `EpochShared::register()`.
+pub struct LocalHandle<'g> {
+    global: &'g EpochShared,
+
+    /// The slot used by the most recent pin. The next pin starts its search for a free slot here.
+    last_slot: AtomicUsize,
+}
+
+impl<'g> LocalHandle<'g> {
+    /// Pin the epoch. The pin lasts until the returned `EpochPin` is dropped.
+    ///
+    /// The slot that was claimed is remembered, as the starting point for the next call.
+    pub fn pin(&self) -> EpochPin {
+        let hint = self.last_slot.load(Ordering::Relaxed);
+        let (slot, epoch) = self.global.pin_internal(hint);
+        self.last_slot.store(slot, Ordering::Relaxed);
+
+        EpochPin {
+            slot,
+            epoch,
+            handle: self,
+        }
+    }
+}
--- a/libs/neonart/src/lib.rs
+++ b/libs/neonart/src/lib.rs
@@ -0,0 +1,587 @@
+//! Adaptive Radix Tree (ART) implementation, with Optimistic Lock Coupling.
+//!
+//! The data structure is described in these two papers:
+//!
+//! [1] Leis, V. & Kemper, Alfons & Neumann, Thomas. (2013).
+//!     The adaptive radix tree: ARTful indexing for main-memory databases.
+//!     Proceedings - International Conference on Data Engineering. 38-49. 10.1109/ICDE.2013.6544812.
+//!     https://db.in.tum.de/~leis/papers/ART.pdf
+//!
+//! [2] Leis, Viktor & Scheibner, Florian & Kemper, Alfons & Neumann, Thomas. (2016).
+//!     The ART of practical synchronization.
+//!     1-8. 10.1145/2933349.2933352.
+//!     https://db.in.tum.de/~leis/papers/artsync.pdf
+//!
+//! [1] describes the base data structure, and [2] describes the Optimistic Lock Coupling that we
+//! use.
+//!
+//! The papers mention a few different variants. We have made the following choices in this
+//! implementation:
+//!
+//! - All keys have the same length
+//!
+//! - Single-value leaves.
+//!
+//! - For collapsing inner nodes, we use the Pessimistic approach, where each inner node stores a
+//!   variable length "prefix", which stores the keys of all the one-way nodes which have been
+//!   removed. However, similar to the "hybrid" approach described in the paper, each node only has
+//!   space for a constant-size prefix of 8 bytes. If a node would have a longer prefix, then we
+//!   create one-way nodes to store them. (There was no particular reason for this choice,
+//!   the "hybrid" approach described in the paper might be better.)
+//!
+//! - For concurrency, we use Optimistic Lock Coupling. The paper [2] also describes another method,
+//!   ROWEX, which generally performs better when there is contention, but that is not important
+//!   for us, and Optimistic Lock Coupling is simpler to implement.
+//!
+//! ## Requirements
+//!
+//! This data structure is currently used for the integrated LFC, relsize and last-written LSN cache
+//! in the compute communicator, part of the 'neon' Postgres extension. We have some unique
+//! requirements, which is why we had to write our own. Namely:
+//!
+//! - The data structure has to live in fixed-sized shared memory segment. That rules out any
+//!   built-in Rust collections and most crates. (Except possibly with the 'allocator_api' rust
+//!   feature, which is still nightly-only and experimental as of this writing).
+//!
+//! - The data structure is accessed from multiple processes. Only one process updates the data
+//!   structure, but other processes perform reads. That rules out using built-in Rust locking
+//!   primitives like Mutex and RwLock, and most crates too.
+//!
+//! - Within the one process with write-access, multiple threads can perform updates concurrently.
+//!   That rules out using PostgreSQL LWLocks for the locking.
+//!
+//! The implementation is generic, and doesn't depend on any PostgreSQL specifics, but it has been
+//! written with that usage and the above constraints in mind. Some noteworthy assumptions:
+//!
+//! - Contention is assumed to be rare. In the integrated cache in PostgreSQL, there's higher level
+//!   locking in the PostgreSQL buffer manager, which ensures that two backends should not try to
+//!   read / write the same page at the same time. (Prefetching can conflict with actual reads,
+//!   however.)
+//!
+//!  - The keys in the integrated cache are 17 bytes long.
+//!
+//! ## Usage
+//!
+//! Because this is designed to be used as a Postgres shared memory data structure, initialization
+//! happens in the following stages:
+//!
+//! 0. A fixed area of shared memory is allocated at postmaster startup.
+//!
+//! 1. TreeInitStruct::new() is called to initialize it, still in Postmaster process, before any
+//!    other process or thread is running. It returns a TreeInitStruct, which is inherited by all
+//!    the processes through fork().
+//!
+//! 2. One process may have write-access to the struct, by calling
+//!    [TreeInitStruct::attach_writer]. (That process is the communicator process.)
+//!
+//! 3. Other processes get read-access to the struct, by calling [TreeInitStruct::attach_reader]
+//!
+//! "Write access" means that you can insert / update / delete values in the tree.
+//!
+//! NOTE: The Values stored in the tree are sometimes moved, when a leaf node fills up and a new
+//! larger node needs to be allocated. The versioning and epoch-based allocator ensure that the data
+//! structure stays consistent, but if the Value has interior mutability, like atomic fields,
+//! updates to such fields might be lost if the leaf node is concurrently moved! If that becomes a
+//! problem, the version check could be passed up to the caller, so that the caller could detect the
+//! lost updates and retry the operation.
+//!
+//! ## Implementation
+//!
+//! node_ptr: Provides low-level implementations of the four different node types (eight actually,
+//! since there is an Internal and Leaf variant of each)
+//!
+//! lock_and_version.rs: Provides an abstraction for the combined lock and version counter on each
+//! node.
+//!
+//! node_ref.rs: The code in node_ptr.rs deals with raw pointers. node_ref.rs provides more type-safe
+//!   abstractions on top.
+//!
+//! algorithm.rs: Contains the functions to implement lookups and updates in the tree
+//!
+//! allocator.rs: Provides a facility to allocate memory for the tree nodes. (We must provide our
+//!   own abstraction for that because we need the data structure to live in a pre-allocated shared
+//!   memory segment).
+//!
+//! epoch.rs: The data structure requires that when a node is removed from the tree, it is not
+//!   immediately deallocated, but stays around for as long as concurrent readers might still have
+//!   pointers to them. This is enforced by an epoch system. This is similar to
+//!   e.g. crossbeam_epoch, but we couldn't use that either because it has to work across processes
+//!   communicating over the shared memory segment.
+//!
+//! ## See also
+//!
+//! There are some existing Rust ART implementations out there, but none of them filled all
+//! the requirements:
+//!
+//! - https://github.com/XiangpengHao/congee
+//! - https://github.com/declanvk/blart
+//!
+//! ## TODO
+//!
+//! - Removing values has not been implemented
+
+mod algorithm;
+pub mod allocator;
+mod epoch;
+
+use algorithm::RootPtr;
+use algorithm::node_ptr::NodePtr;
+
+use std::collections::VecDeque;
+use std::fmt::Debug;
+use std::marker::PhantomData;
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicBool, Ordering};
+
+use crate::epoch::EpochPin;
+
+#[cfg(test)]
+mod tests;
+
+use allocator::ArtAllocator;
+pub use allocator::ArtMultiSlabAllocator;
+pub use allocator::OutOfMemoryError;
+
+/// Fixed-length key type.
+///
+/// All keys stored in one tree have the same length, `KEY_LEN` bytes.
+pub trait Key: Debug {
+    /// Length of the key, in bytes
+    const KEY_LEN: usize;
+
+    /// The key as a byte string.
+    ///
+    /// NOTE(review): presumably the returned slice must be exactly `KEY_LEN` bytes long;
+    /// confirm against the users in algorithm.rs.
+    fn as_bytes(&self) -> &[u8];
+}
+
+/// Values stored in the tree
+///
+/// This is a marker trait with no methods and no supertraits. In particular, values do not
+/// need to be `Clone`.
+pub trait Value {}
+
+const MAX_GARBAGE: usize = 1024;
+
+/// The root of the tree, plus other tree-wide data. This is stored in the shared memory.
+pub struct Tree<V: Value> {
+    /// For simplicity, so that we never need to grow or shrink the root, the root node is always an
+    /// Internal256 node. Also, it never has a prefix (that's actually a bit wasteful, incurring one
+    /// indirection to every lookup)
+    root: RootPtr<V>,
+
+    /// Set by `TreeInitStruct::attach_writer()`, to catch a second attempt to attach a writer
+    writer_attached: AtomicBool,
+
+    /// Shared state of the epoch system, used to decide when obsolete nodes can be recycled
+    epoch: epoch::EpochShared,
+}
+
+// NOTE(review): `Tree` needs these manual impls presumably because `RootPtr` holds a raw
+// pointer. Their soundness rests on the node locking in algorithm.rs, which is not checked here.
+unsafe impl<V: Value + Sync> Sync for Tree<V> {}
+unsafe impl<V: Value + Send> Send for Tree<V> {}
+
+/// Queue of obsolete nodes, each tagged with an epoch. New entries are pushed to the front and
+/// entries are recycled from the back, i.e. in the order they were added.
+struct GarbageQueue<V>(VecDeque<(NodePtr<V>, u64)>);
+
+// NOTE(review): needed presumably because `NodePtr` wraps a raw pointer; confirm in node_ptr.rs.
+unsafe impl<V: Value + Sync> Sync for GarbageQueue<V> {}
+unsafe impl<V: Value + Send> Send for GarbageQueue<V> {}
+
+impl<V> GarbageQueue<V> {
+    /// Create an empty queue, with room for `MAX_GARBAGE` entries preallocated. (The queue can
+    /// still grow beyond that.)
+    fn new() -> GarbageQueue<V> {
+        Self(VecDeque::with_capacity(MAX_GARBAGE))
+    }
+
+    /// Add a node to the queue, tagged with `epoch`.
+    fn remember_obsolete_node(&mut self, ptr: NodePtr<V>, epoch: u64) {
+        let entry = (ptr, epoch);
+        self.0.push_front(entry);
+    }
+
+    /// Remove and return the longest-queued node, if its epoch is strictly older than
+    /// `cutoff_epoch`. Returns None if the queue is empty or the longest-queued node is not old
+    /// enough yet.
+    fn next_obsolete(&mut self, cutoff_epoch: u64) -> Option<NodePtr<V>> {
+        let expired = self
+            .0
+            .back()
+            .is_some_and(|&(_, epoch)| epoch < cutoff_epoch);
+        if !expired {
+            return None;
+        }
+        self.0.pop_back().map(|(ptr, _)| ptr)
+    }
+}
+
+/// Struct created at postmaster startup
+///
+/// It is turned into a `TreeWriteAccess` or a `TreeReadAccess` by calling `attach_writer()` or
+/// `attach_reader()`.
+pub struct TreeInitStruct<'t, K: Key, V: Value, A: ArtAllocator<V>> {
+    /// The tree header, allocated from `allocator`
+    tree: &'t Tree<V>,
+
+    allocator: &'t A,
+
+    phantom_key: PhantomData<K>,
+}
+
+/// The worker process has a reference to this. The write operations are only safe
+/// from the worker process
+///
+/// Created by `TreeInitStruct::attach_writer()`.
+pub struct TreeWriteAccess<'t, K: Key, V: Value, A: ArtAllocator<V>>
+where
+    K: Key,
+    V: Value,
+{
+    tree: &'t Tree<V>,
+
+    /// Allocator for the tree nodes
+    pub allocator: &'t A,
+
+    /// Used to pin the epoch for the duration of each operation
+    epoch_handle: epoch::LocalHandle<'t>,
+
+    phantom_key: PhantomData<K>,
+
+    /// Obsolete nodes that cannot be recycled until their epoch expires.
+    garbage: spin::Mutex<GarbageQueue<V>>,
+}
+
+/// The backends have a reference to this. It cannot be used to modify the tree
+///
+/// Created by `TreeInitStruct::attach_reader()`.
+pub struct TreeReadAccess<'t, K: Key, V: Value>
+where
+    K: Key,
+    V: Value,
+{
+    tree: &'t Tree<V>,
+
+    /// Used to pin the epoch for the duration of each read
+    epoch_handle: epoch::LocalHandle<'t>,
+
+    phantom_key: PhantomData<K>,
+}
+
+impl<'a, 't: 'a, K: Key, V: Value, A: ArtAllocator<V>> TreeInitStruct<'t, K, V, A> {
+    /// Allocate and initialize an empty tree, using `allocator` for the tree header and the
+    /// root node.
+    ///
+    /// Panics with "out of memory" if either allocation fails.
+    pub fn new(allocator: &'t A) -> TreeInitStruct<'t, K, V, A> {
+        let tree_ptr = NonNull::new(allocator.alloc_tree()).expect("out of memory");
+        let root = algorithm::new_root(allocator).expect("out of memory");
+
+        unsafe {
+            tree_ptr.write(Tree {
+                root,
+                writer_attached: AtomicBool::new(false),
+                epoch: epoch::EpochShared::new(),
+            })
+        };
+        let tree = unsafe { tree_ptr.as_ref() };
+
+        TreeInitStruct {
+            tree,
+            allocator,
+            phantom_key: PhantomData,
+        }
+    }
+
+    /// Turn this into the handle of the (only) writer.
+    ///
+    /// Panics if a writer has already been attached to the tree.
+    pub fn attach_writer(self) -> TreeWriteAccess<'t, K, V, A> {
+        let tree = self.tree;
+        if tree.writer_attached.swap(true, Ordering::Relaxed) {
+            panic!("writer already attached");
+        }
+
+        TreeWriteAccess {
+            tree,
+            allocator: self.allocator,
+            phantom_key: PhantomData,
+            epoch_handle: tree.epoch.register(),
+            garbage: spin::Mutex::new(GarbageQueue::new()),
+        }
+    }
+
+    /// Turn this into a read-only handle.
+    pub fn attach_reader(self) -> TreeReadAccess<'t, K, V> {
+        let tree = self.tree;
+
+        TreeReadAccess {
+            tree,
+            phantom_key: PhantomData,
+            epoch_handle: tree.epoch.register(),
+        }
+    }
+}
+
+impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteAccess<'t, K, V, A> {
+    pub fn start_write<'g>(&'t self) -> TreeWriteGuard<'g, K, V, A>
+    where
+        't: 'g,
+    {
+        TreeWriteGuard {
+            tree_writer: self,
+            epoch_pin: self.epoch_handle.pin(),
+            phantom_key: PhantomData,
+            created_garbage: false,
+        }
+    }
+
+    pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
+        TreeReadGuard {
+            tree: &self.tree,
+            epoch_pin: self.epoch_handle.pin(),
+            phantom_key: PhantomData,
+        }
+    }
+}
+
+impl<'t, K: Key, V: Value> TreeReadAccess<'t, K, V> {
+    pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
+        TreeReadGuard {
+            tree: &self.tree,
+            epoch_pin: self.epoch_handle.pin(),
+            phantom_key: PhantomData,
+        }
+    }
+}
+
+/// Read-only view of the tree; holds the epoch pin that its lookups pass along.
+pub struct TreeReadGuard<'e, K, V>
+where
+    K: Key,
+    V: Value,
+{
+    tree: &'e Tree<V>,
+    epoch_pin: EpochPin<'e>,
+    phantom_key: PhantomData<K>,
+}
+
+impl<'e, K: Key, V: Value> TreeReadGuard<'e, K, V> {
+    pub fn get(&'e self, key: &K) -> Option<&'e V> {
+        algorithm::search(key, self.tree.root, &self.epoch_pin)
+    }
+}
+
+/// Guard for a write operation; created by `TreeWriteAccess::start_write`.
+pub struct TreeWriteGuard<'e, K, V, A>
+where
+    K: Key,
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    tree_writer: &'e TreeWriteAccess<'e, K, V, A>,
+    epoch_pin: EpochPin<'e>,
+    phantom_key: PhantomData<K>,
+    /// Set once this guard has queued an obsolete node for later recycling.
+    created_garbage: bool,
+}
+
+pub enum UpdateAction<V> {
+    Nothing,
+    Insert(V),
+    Remove,
+}
+
+impl<'e, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
+    /// Look up `key` and return a reference to its value, if present.
+    pub fn get(&'e mut self, key: &K) -> Option<&'e V> {
+        algorithm::search(key, self.tree_writer.tree.root, &self.epoch_pin)
+    }
+
+    /// Insert `value` under `key` unless the key is already present.
+    /// Returns `Ok(true)` if inserted, `Ok(false)` if the key already existed.
+    pub fn insert(self, key: &K, value: V) -> Result<bool, OutOfMemoryError> {
+        let mut inserted = None;
+        self.update_with_fn(key, |existing| {
+            let vacant = existing.is_none();
+            inserted = Some(vacant);
+            if vacant {
+                UpdateAction::Insert(value)
+            } else {
+                UpdateAction::Nothing
+            }
+        })?;
+        Ok(inserted.expect("value_fn not called"))
+    }
+
+    /// Remove the value stored under `key`.
+    ///
+    /// Returns true if the key existed. Panics if the underlying update
+    /// reports an out-of-memory error.
+    pub fn remove(self, key: &K) -> bool {
+        // FIXME: It's not clear if OOM is expected while removing. It seems
+        // not nice, but shrinking a node can OOM. Then again, we could opt
+        // to not shrink a node if we cannot allocate, to live a little longer.
+        let mut existed = false;
+        self.update_with_fn(key, |existing| {
+            existed = existing.is_some();
+            existing.map_or(UpdateAction::Nothing, |_| UpdateAction::Remove)
+        })
+        .expect("out of memory while removing");
+        existed
+    }
+
+    /// Remove `key` and return a clone of its old value; an absent key is left untouched.
+    pub fn remove_and_return(self, key: &K) -> Option<V>
+    where
+        V: Clone,
+    {
+        let mut old = None;
+        self.update_with_fn(key, |existing| {
+            old = existing.cloned();
+            existing.map_or(UpdateAction::Nothing, |_| UpdateAction::Remove)
+        })
+        .expect("out of memory while removing");
+        old
+    }
+
+    /// Update key using the given function. All the other modifying operations are based on this.
+    ///
+    /// The function is passed a reference to the existing value, if any, and returns an
+    /// `UpdateAction`: `Nothing` leaves the tree as it is, `Insert(v)` asks for `v` to be
+    /// stored under the key, and `Remove` asks for the key's value to be removed. If the
+    /// update left obsolete nodes behind, a garbage collection pass is run before returning.
+    pub fn update_with_fn<F>(mut self, key: &K, value_fn: F) -> Result<(), OutOfMemoryError>
+    where
+        F: FnOnce(Option<&V>) -> UpdateAction<V>,
+    {
+        let root = self.tree_writer.tree.root;
+        algorithm::update_fn(key, value_fn, root, &mut self)?;
+        if self.created_garbage {
+            self.collect_garbage();
+        }
+        Ok(())
+    }
+
+    /// Queue `ptr` for recycling, tagged with this guard's pinned epoch.
+    fn remember_obsolete_node(&mut self, ptr: NodePtr<V>) {
+        let pinned_epoch = self.epoch_pin.epoch;
+        let garbage = &self.tree_writer.garbage;
+        garbage.lock().remember_obsolete_node(ptr, pinned_epoch);
+        self.created_garbage = true;
+    }
+
+    /// Bump the epoch, then free queued nodes up to the oldest-epoch cutoff; returns the count.
+    fn collect_garbage(&self) -> usize {
+        let epoch = &self.tree_writer.tree.epoch;
+        epoch.advance();
+        epoch.broadcast();
+        let cutoff_epoch = epoch.get_oldest();
+
+        let mut queue = self.tree_writer.garbage.lock();
+        let mut recycled = 0;
+        while let Some(ptr) = queue.next_obsolete(cutoff_epoch) {
+            ptr.deallocate(self.tree_writer.allocator);
+            recycled += 1;
+        }
+        recycled
+    }
+}
+
+/// Key cursor for scanning a tree; a `max_key` of None selects wrap-around mode.
+pub struct TreeIterator<K>
+where
+    K: Key + for<'a> From<&'a [u8]>,
+{
+    done: bool,
+    pub next_key: Vec<u8>,
+    max_key: Option<Vec<u8>>,
+    phantom_key: PhantomData<K>,
+}
+
+impl<K> TreeIterator<K>
+where
+    K: Key + for<'a> From<&'a [u8]>,
+{
+    /// Create an iterator that starts at the all-zeros key and has no upper
+    /// bound; `next` restarts from the beginning when it runs off the end.
+    pub fn new_wrapping() -> TreeIterator<K> {
+        TreeIterator {
+            done: false,
+            next_key: vec![0; K::KEY_LEN],
+            max_key: None,
+            phantom_key: PhantomData,
+        }
+    }
+
+    /// Create an iterator over `range` (end exclusive); both bounds must be `K::KEY_LEN` bytes.
+    pub fn new(range: &std::ops::Range<K>) -> TreeIterator<K> {
+        let [start, end] = [&range.start, &range.end].map(|k| k.as_bytes().to_vec());
+        assert_eq!(start.len(), K::KEY_LEN);
+        assert_eq!(end.len(), K::KEY_LEN);
+        TreeIterator {
+            done: false,
+            next_key: start,
+            max_key: Some(end),
+            phantom_key: PhantomData,
+        }
+    }
+
+    /// Return the next key/value pair from the tree, or None when iteration is over.
+    ///
+    /// A bounded iterator finishes for good at the first key `>= max_key`, or when the tree
+    /// has no further keys. A wrapping iterator restarts once per call from the all-zeros key.
+    pub fn next<'g, V>(&mut self, read_guard: &'g TreeReadGuard<'g, K, V>) -> Option<(K, &'g V)>
+    where
+        V: Value,
+    {
+        if self.done {
+            return None;
+        }
+
+        let mut wrapped_around = false;
+        loop {
+            assert_eq!(self.next_key.len(), K::KEY_LEN);
+            let found = algorithm::iter_next(
+                &mut self.next_key,
+                read_guard.tree.root,
+                &read_guard.epoch_pin,
+            );
+            let Some((k, v)) = found else {
+                // The search came back empty-handed.
+                if self.max_key.is_some() {
+                    self.done = true;
+                    return None;
+                }
+                if wrapped_around {
+                    // The tree is completely empty
+                    // FIXME: perhaps we should remember the starting point instead.
+                    // Currently this will scan some ranges twice.
+                    return None;
+                }
+                // Start from beginning
+                self.next_key[..K::KEY_LEN].fill(0);
+                wrapped_around = true;
+                continue;
+            };
+
+            assert_eq!(k.len(), K::KEY_LEN);
+            assert_eq!(self.next_key.len(), K::KEY_LEN);
+
+            // Check if we reached the end of the range
+            if let Some(max_key) = &self.max_key {
+                if k.as_slice() >= max_key.as_slice() {
+                    self.done = true;
+                    return None;
+                }
+            }
+
+            // increment the key
+            self.next_key = k.clone();
+            increment_key(self.next_key.as_mut_slice());
+            return Some((k.as_slice().into(), v));
+        }
+    }
+}
+
+/// Add one to `key`, treated as a big-endian integer. Returns true on wrap-around.
+fn increment_key(key: &mut [u8]) -> bool {
+    for byte in key.iter_mut().rev() {
+        *byte = byte.wrapping_add(1);
+        if *byte != 0 {
+            return false;
+        }
+    }
+    true
+}
+
+// Debugging functions
+impl<'e, K: Key, V: Value + Debug, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
+    pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
+        algorithm::dump_tree(self.tree_writer.tree.root, &self.epoch_pin, dst)
+    }
+}
+impl<'e, K: Key, V: Value + Debug> TreeReadGuard<'e, K, V> {
+    pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
+        algorithm::dump_tree(self.tree.root, &self.epoch_pin, dst)
+    }
+}
+impl<'e, K: Key, V: Value> TreeWriteAccess<'e, K, V, ArtMultiSlabAllocator<'e, V>> {
+    /// Gather block-allocator, slab-allocator, epoch and garbage-queue statistics.
+    pub fn get_statistics(&self) -> ArtTreeStatistics {
+        ArtTreeStatistics {
+            blocks: self.allocator.inner.block_allocator.get_statistics(),
+            slabs: self.allocator.get_statistics(),
+            epoch: self.tree.epoch.get_current(),
+            oldest_epoch: self.tree.epoch.get_oldest(),
+            num_garbage: self.garbage.lock().0.len() as u64,
+        }
+    }
+}
+
+/// Counters returned by `TreeWriteAccess::get_statistics`.
+#[derive(Clone, Debug)]
+pub struct ArtTreeStatistics {
+    pub blocks: allocator::block::BlockAllocatorStats,
+    pub slabs: allocator::ArtMultiSlabStats,
+    pub epoch: u64,
+    pub oldest_epoch: u64,
+    pub num_garbage: u64,
+}
--- a/libs/neonart/src/tests.rs
+++ b/libs/neonart/src/tests.rs
@@ -0,0 +1,243 @@
+use std::collections::BTreeMap;
+use std::collections::HashSet;
+use std::fmt::{Debug, Formatter};
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use crate::ArtAllocator;
+use crate::ArtMultiSlabAllocator;
+use crate::TreeInitStruct;
+use crate::TreeIterator;
+use crate::TreeWriteAccess;
+use crate::UpdateAction;
+
+use crate::{Key, Value};
+
+use rand::Rng;
+use rand::seq::SliceRandom;
+use rand_distr::Zipf;
+
+const TEST_KEY_LEN: usize = 16;
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+struct TestKey([u8; TEST_KEY_LEN]);
+
+impl TestKey {
+    const MIN: TestKey = TestKey([0; TEST_KEY_LEN]);
+    const MAX: TestKey = TestKey([u8::MAX; TEST_KEY_LEN]);
+}
+
+impl Key for TestKey {
+    const KEY_LEN: usize = TEST_KEY_LEN;
+    fn as_bytes(&self) -> &[u8] {
+        &self.0
+    }
+}
+
+impl From<&TestKey> for u128 {
+    fn from(val: &TestKey) -> u128 {
+        u128::from_be_bytes(val.0)
+    }
+}
+
+impl From<u128> for TestKey {
+    fn from(val: u128) -> TestKey {
+        TestKey(val.to_be_bytes())
+    }
+}
+
+impl<'a> From<&'a [u8]> for TestKey {
+    fn from(bytes: &'a [u8]) -> TestKey {
+        TestKey(bytes.try_into().unwrap())
+    }
+}
+
+impl Value for usize {}
+
+fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
+    const MEM_SIZE: usize = 10000000;
+    let mut area = Box::new_uninit_slice(MEM_SIZE);
+
+    let allocator = ArtMultiSlabAllocator::new(&mut area);
+
+    let init_struct = TreeInitStruct::<TestKey, usize, _>::new(allocator);
+    let tree_writer = init_struct.attach_writer();
+
+    for (idx, k) in keys.iter().enumerate() {
+        let w = tree_writer.start_write();
+        let res = w.insert(&(*k).into(), idx);
+        assert!(res.is_ok());
+    }
+
+    for (idx, k) in keys.iter().enumerate() {
+        let r = tree_writer.start_read();
+        let value = r.get(&(*k).into());
+        assert_eq!(value, Some(idx).as_ref());
+    }
+
+    eprintln!("stats: {:?}", tree_writer.get_statistics());
+}
+
+#[test]
+fn dense() {
+    // This exercises splitting a node with prefix
+    let keys: &[u128] = &[0, 1, 2, 3, 256];
+    test_inserts(keys);
+
+    // Dense keys
+    let mut keys: Vec<u128> = (0..10000).collect();
+    test_inserts(&keys);
+
+    // Do the same in random orders
+    for _ in 1..10 {
+        keys.shuffle(&mut rand::rng());
+        test_inserts(&keys);
+    }
+}
+
+/// Feeds 10000 distinct random 128-bit keys to `test_inserts`, which inserts
+/// them and reads each one back.
+#[test]
+fn sparse() {
+    // sparse keys
+    const NUM_KEYS: usize = 10000;
+    let mut seen = HashSet::new();
+    let mut keys: Vec<TestKey> = Vec::with_capacity(NUM_KEYS);
+    while keys.len() < NUM_KEYS {
+        let candidate = rand::random::<u128>();
+        // `insert` returns false for a duplicate; draw again in that case.
+        if seen.insert(candidate) {
+            keys.push(candidate.into());
+        }
+    }
+
+    test_inserts(&keys);
+}
+
+struct TestValue(AtomicUsize);
+
+impl TestValue {
+    fn new(val: usize) -> TestValue {
+        TestValue(AtomicUsize::new(val))
+    }
+
+    fn load(&self) -> usize {
+        self.0.load(Ordering::Relaxed)
+    }
+}
+
+impl Value for TestValue {}
+
+impl Clone for TestValue {
+    fn clone(&self) -> TestValue {
+        TestValue::new(self.load())
+    }
+}
+
+impl Debug for TestValue {
+    fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
+        write!(fmt, "{:?}", self.load())
+    }
+}
+
+#[derive(Clone, Debug)]
+struct TestOp(TestKey, Option<usize>);
+
+fn apply_op<A: ArtAllocator<TestValue>>(
+    op: &TestOp,
+    tree: &TreeWriteAccess<TestKey, TestValue, A>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    eprintln!("applying op: {op:?}");
+
+    // apply the change to the shadow tree first
+    let shadow_existing = if let Some(v) = op.1 {
+        shadow.insert(op.0, v)
+    } else {
+        shadow.remove(&op.0)
+    };
+
+    // apply to Art tree
+    let w = tree.start_write();
+    w.update_with_fn(&op.0, |existing| {
+        assert_eq!(existing.map(TestValue::load), shadow_existing);
+
+        match (existing, op.1) {
+            (None, None) => UpdateAction::Nothing,
+            (None, Some(new_val)) => UpdateAction::Insert(TestValue::new(new_val)),
+            (Some(_old_val), None) => UpdateAction::Remove,
+            (Some(old_val), Some(new_val)) => {
+                old_val.0.store(new_val, Ordering::Relaxed);
+                UpdateAction::Nothing
+            }
+        }
+    })
+    .expect("out of memory");
+}
+
+fn test_iter<A: ArtAllocator<TestValue>>(
+    tree: &TreeWriteAccess<TestKey, TestValue, A>,
+    shadow: &BTreeMap<TestKey, usize>,
+) {
+    let mut shadow_iter = shadow.iter();
+    let mut iter = TreeIterator::new(&(TestKey::MIN..TestKey::MAX));
+
+    loop {
+        let shadow_item = shadow_iter.next().map(|(k, v)| (k.clone(), v.clone()));
+        let r = tree.start_read();
+        let item = iter.next(&r);
+
+        if shadow_item != item.map(|(k, v)| (k, v.load())) {
+            eprintln!(
+                "FAIL: iterator returned {:?}, expected {:?}",
+                item, shadow_item
+            );
+            tree.start_read().dump(&mut std::io::stderr());
+
+            eprintln!("SHADOW:");
+            let mut si = shadow.iter();
+            while let Some(si) = si.next() {
+                eprintln!("key: {:?}, val: {}", si.0, si.1);
+            }
+            panic!(
+                "FAIL: iterator returned {:?}, expected {:?}",
+                item, shadow_item
+            );
+        }
+        if item.is_none() {
+            break;
+        }
+    }
+}
+
+#[test]
+fn random_ops() {
+    const MEM_SIZE: usize = 10000000;
+    let mut area = Box::new_uninit_slice(MEM_SIZE);
+
+    let allocator = ArtMultiSlabAllocator::new(&mut area);
+
+    let init_struct = TreeInitStruct::<TestKey, TestValue, _>::new(allocator);
+    let tree_writer = init_struct.attach_writer();
+
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+
+    let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
+    let mut rng = rand::rng();
+    for i in 0..100000 {
+        let mut key: TestKey = (rng.sample(distribution) as u128).into();
+
+        if rng.random_bool(0.10) {
+            key = TestKey::from(u128::from(&key) | 0xffffffff);
+        }
+
+        let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
+
+        apply_op(&op, &tree_writer, &mut shadow);
+
+        if i % 1000 == 0 {
+            eprintln!("{i} ops processed");
+            eprintln!("stats: {:?}", tree_writer.get_statistics());
+            test_iter(&tree_writer, &shadow);
+        }
+    }
+}
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -18,7 +18,6 @@ bytes.workspace = true
 byteorder.workspace = true
 utils.workspace = true
 postgres_ffi_types.workspace = true
-postgres_versioninfo.workspace = true
 enum-map.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -76,6 +76,10 @@ pub struct PostHogConfig {
    pub private_api_url: String,
    /// Public API URL
    pub public_api_url: String,
+    /// Refresh interval for the feature flag spec
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(with = "humantime_serde")]
+    pub refresh_interval: Option<Duration>,
 }

 /// `pageserver.toml`
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -11,7 +11,6 @@ use std::time::{Duration, SystemTime};

 #[cfg(feature = "testing")]
 use camino::Utf8PathBuf;
-use postgres_versioninfo::PgMajorVersion;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use serde_with::serde_as;
 pub use utilization::PageserverUtilization;
@@ -399,7 +398,7 @@ pub enum TimelineCreateRequestMode {
        // inherits the ancestor's pg_version. Earlier code wasn't
        // using a flattened enum, so, it was an accepted field, and
        // we continue to accept it by having it here.
-        pg_version: Option<PgMajorVersion>,
+        pg_version: Option<u32>,
        #[serde(default, skip_serializing_if = "std::ops::Not::not")]
        read_only: bool,
    },
@@ -411,7 +410,7 @@ pub enum TimelineCreateRequestMode {
    Bootstrap {
        #[serde(default)]
        existing_initdb_timeline_id: Option<TimelineId>,
-        pg_version: Option<PgMajorVersion>,
+        pg_version: Option<u32>,
    },
 }

@@ -1574,7 +1573,7 @@ pub struct TimelineInfo {
    pub last_received_msg_lsn: Option<Lsn>,
    /// the timestamp (in microseconds) of the last received message
    pub last_received_msg_ts: Option<u128>,
-    pub pg_version: PgMajorVersion,
+    pub pg_version: u32,

    pub state: TimelineState,

--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -19,7 +19,6 @@ serde.workspace = true
 postgres_ffi_types.workspace = true
 utils.workspace = true
 tracing.workspace = true
-postgres_versioninfo.workspace = true

 [dev-dependencies]
 env_logger.workspace = true
--- a/libs/postgres_ffi/benches/waldecoder.rs
+++ b/libs/postgres_ffi/benches/waldecoder.rs
@@ -4,7 +4,6 @@ use criterion::{Bencher, Criterion, criterion_group, criterion_main};
 use postgres_ffi::v17::wal_generator::LogicalMessageGenerator;
 use postgres_ffi::v17::waldecoder_handler::WalStreamDecoderHandler;
 use postgres_ffi::waldecoder::WalStreamDecoder;
-use postgres_versioninfo::PgMajorVersion;
 use pprof::criterion::{Output, PProfProfiler};
 use utils::lsn::Lsn;

@@ -33,7 +32,7 @@ fn bench_complete_record(c: &mut Criterion) {
        let value_size = LogicalMessageGenerator::make_value_size(size, PREFIX);
        let value = vec![1; value_size];

-        let mut decoder = WalStreamDecoder::new(Lsn(0), PgMajorVersion::PG17);
+        let mut decoder = WalStreamDecoder::new(Lsn(0), 170000);
        let msg = LogicalMessageGenerator::new(PREFIX, &value)
            .next()
            .unwrap()
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -14,8 +14,6 @@ use bytes::Bytes;
 use utils::bin_ser::SerializeError;
 use utils::lsn::Lsn;

-pub use postgres_versioninfo::PgMajorVersion;
-
 macro_rules! postgres_ffi {
    ($version:ident) => {
        #[path = "."]
@@ -93,22 +91,21 @@ macro_rules! dispatch_pgversion {
            $version => $code,
            default = $invalid_pgver_handling,
            pgversions = [
-                $crate::PgMajorVersion::PG14 => v14,
-                $crate::PgMajorVersion::PG15 => v15,
-                $crate::PgMajorVersion::PG16 => v16,
-                $crate::PgMajorVersion::PG17 => v17,
+                14 : v14,
+                15 : v15,
+                16 : v16,
+                17 : v17,
            ]
        )
    };
    ($pgversion:expr => $code:expr,
     default = $default:expr,
-     pgversions = [$($sv:pat => $vsv:ident),+ $(,)?]) => {
-        match ($pgversion.clone().into()) {
+     pgversions = [$($sv:literal : $vsv:ident),+ $(,)?]) => {
+        match ($pgversion) {
            $($sv => {
                use $crate::$vsv as pgv;
                $code
            },)+
-            #[allow(unreachable_patterns)]
            _ => {
                $default
            }
@@ -182,9 +179,9 @@ macro_rules! enum_pgversion {
            $($variant ( $crate::$md::$t )),+
        }
        impl self::$name {
-            pub fn pg_version(&self) -> PgMajorVersion {
+            pub fn pg_version(&self) -> u32 {
                enum_pgversion_dispatch!(self, $name, _ign, {
-                    pgv::bindings::MY_PGVERSION
+                    pgv::bindings::PG_MAJORVERSION_NUM
                })
            }
        }
@@ -198,15 +195,15 @@ macro_rules! enum_pgversion {
    };
    {name = $name:ident,
     path = $p:ident,
-     $(typ = $t:ident,)?
+     typ = $t:ident,
     pgversions = [$($variant:ident : $md:ident),+ $(,)?]} => {
        pub enum $name {
-            $($variant $(($crate::$md::$p::$t))?),+
+            $($variant ($crate::$md::$p::$t)),+
        }
        impl $name {
-            pub fn pg_version(&self) -> PgMajorVersion {
+            pub fn pg_version(&self) -> u32 {
                enum_pgversion_dispatch!(self, $name, _ign, {
-                    pgv::bindings::MY_PGVERSION
+                    pgv::bindings::PG_MAJORVERSION_NUM
                })
            }
        }
@@ -252,21 +249,22 @@ pub use v14::xlog_utils::{
    try_from_pg_timestamp,
 };

-pub fn bkpimage_is_compressed(bimg_info: u8, version: PgMajorVersion) -> bool {
+pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> bool {
    dispatch_pgversion!(version, pgv::bindings::bkpimg_is_compressed(bimg_info))
 }

 pub fn generate_wal_segment(
    segno: u64,
    system_id: u64,
-    pg_version: PgMajorVersion,
+    pg_version: u32,
    lsn: Lsn,
 ) -> Result<Bytes, SerializeError> {
    assert_eq!(segno, lsn.segment_number(WAL_SEGMENT_SIZE));

    dispatch_pgversion!(
        pg_version,
-        pgv::xlog_utils::generate_wal_segment(segno, system_id, lsn)
+        pgv::xlog_utils::generate_wal_segment(segno, system_id, lsn),
+        Err(SerializeError::BadInput)
    )
 }

@@ -274,7 +272,7 @@ pub fn generate_pg_control(
    pg_control_bytes: &[u8],
    checkpoint_bytes: &[u8],
    lsn: Lsn,
-    pg_version: PgMajorVersion,
+    pg_version: u32,
 ) -> anyhow::Result<(Bytes, u64, bool)> {
    dispatch_pgversion!(
        pg_version,
@@ -354,7 +352,6 @@ pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber {
 pub mod waldecoder {
    use std::num::NonZeroU32;

-    use crate::PgMajorVersion;
    use bytes::{Buf, Bytes, BytesMut};
    use thiserror::Error;
    use utils::lsn::Lsn;
@@ -372,7 +369,7 @@ pub mod waldecoder {

    pub struct WalStreamDecoder {
        pub lsn: Lsn,
-        pub pg_version: PgMajorVersion,
+        pub pg_version: u32,
        pub inputbuf: BytesMut,
        pub state: State,
    }
@@ -385,7 +382,7 @@ pub mod waldecoder {
    }

    impl WalStreamDecoder {
-        pub fn new(lsn: Lsn, pg_version: PgMajorVersion) -> WalStreamDecoder {
+        pub fn new(lsn: Lsn, pg_version: u32) -> WalStreamDecoder {
            WalStreamDecoder {
                lsn,
                pg_version,
--- a/libs/postgres_ffi/src/pg_constants_v14.rs
+++ b/libs/postgres_ffi/src/pg_constants_v14.rs
@@ -1,7 +1,3 @@
-use crate::PgMajorVersion;
-
-pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG14;
-
 pub const XLOG_DBASE_CREATE: u8 = 0x00;
 pub const XLOG_DBASE_DROP: u8 = 0x10;

--- a/libs/postgres_ffi/src/pg_constants_v15.rs
+++ b/libs/postgres_ffi/src/pg_constants_v15.rs
@@ -1,7 +1,3 @@
-use crate::PgMajorVersion;
-
-pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG15;
-
 pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;

 pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
--- a/libs/postgres_ffi/src/pg_constants_v16.rs
+++ b/libs/postgres_ffi/src/pg_constants_v16.rs
@@ -1,7 +1,3 @@
-use crate::PgMajorVersion;
-
-pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG16;
-
 pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;

 pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
--- a/libs/postgres_ffi/src/pg_constants_v17.rs
+++ b/libs/postgres_ffi/src/pg_constants_v17.rs
@@ -1,7 +1,3 @@
-use crate::PgMajorVersion;
-
-pub const MY_PGVERSION: PgMajorVersion = PgMajorVersion::PG17;
-
 pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;

 pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
--- a/libs/postgres_ffi/src/walrecord.rs
+++ b/libs/postgres_ffi/src/walrecord.rs
@@ -9,8 +9,8 @@ use utils::bin_ser::DeserializeError;
 use utils::lsn::Lsn;

 use crate::{
-    BLCKSZ, BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, PgMajorVersion,
-    RepOriginId, TimestampTz, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants,
+    BLCKSZ, BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, RepOriginId,
+    TimestampTz, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants,
 };

 #[repr(C)]
@@ -199,17 +199,20 @@ impl DecodedWALRecord {
    /// Check if this WAL record represents a legacy "copy" database creation, which populates new relations
    /// by reading other existing relations' data blocks.  This is more complex to apply than new-style database
    /// creations which simply include all the desired blocks in the WAL, so we need a helper function to detect this case.
-    pub fn is_dbase_create_copy(&self, pg_version: PgMajorVersion) -> bool {
+    pub fn is_dbase_create_copy(&self, pg_version: u32) -> bool {
        if self.xl_rmid == pg_constants::RM_DBASE_ID {
            let info = self.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
            match pg_version {
-                PgMajorVersion::PG14 => {
+                14 => {
                    // Postgres 14 database creations are always the legacy kind
                    info == crate::v14::bindings::XLOG_DBASE_CREATE
                }
-                PgMajorVersion::PG15 => info == crate::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY,
-                PgMajorVersion::PG16 => info == crate::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY,
-                PgMajorVersion::PG17 => info == crate::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY,
+                15 => info == crate::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY,
+                16 => info == crate::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY,
+                17 => info == crate::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY,
+                _ => {
+                    panic!("Unsupported postgres version {pg_version}")
+                }
            }
        } else {
            false
@@ -245,7 +248,7 @@ impl DecodedWALRecord {
 pub fn decode_wal_record(
    record: Bytes,
    decoded: &mut DecodedWALRecord,
-    pg_version: PgMajorVersion,
+    pg_version: u32,
 ) -> anyhow::Result<()> {
    let mut rnode_spcnode: u32 = 0;
    let mut rnode_dbnode: u32 = 0;
@@ -1103,9 +1106,9 @@ pub struct XlClogTruncate {
 }

 impl XlClogTruncate {
-    pub fn decode(buf: &mut Bytes, pg_version: PgMajorVersion) -> XlClogTruncate {
+    pub fn decode(buf: &mut Bytes, pg_version: u32) -> XlClogTruncate {
        XlClogTruncate {
-            pageno: if pg_version < PgMajorVersion::PG17 {
+            pageno: if pg_version < 17 {
                buf.get_u32_le()
            } else {
                buf.get_u64_le() as u32
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -11,9 +11,9 @@ use super::super::waldecoder::WalStreamDecoder;
 use super::bindings::{
    CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz,
    XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC,
-    MY_PGVERSION
 };
 use super::wal_generator::LogicalMessageGenerator;
+use super::PG_MAJORVERSION;
 use crate::pg_constants;
 use crate::PG_TLI;
 use crate::{uint32, uint64, Oid};
@@ -233,7 +233,7 @@ pub fn find_end_of_wal(
    let mut result = start_lsn;
    let mut curr_lsn = start_lsn;
    let mut buf = [0u8; XLOG_BLCKSZ];
-    let pg_version = MY_PGVERSION;
+    let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
    debug!("find_end_of_wal PG_VERSION: {}", pg_version);

    let mut decoder = WalStreamDecoder::new(start_lsn, pg_version);
--- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs
+++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs
@@ -4,7 +4,6 @@ use std::str::FromStr;
 use anyhow::*;
 use clap::{Arg, ArgMatches, Command, value_parser};
 use postgres::Client;
-use postgres_ffi::PgMajorVersion;
 use wal_craft::*;

 fn main() -> Result<()> {
@@ -49,7 +48,7 @@ fn main() -> Result<()> {
        Some(("with-initdb", arg_matches)) => {
            let cfg = Conf {
                pg_version: *arg_matches
-                    .get_one::<PgMajorVersion>("pg-version")
+                    .get_one::<u32>("pg-version")
                    .context("'pg-version' is required")?,
                pg_distrib_dir: arg_matches
                    .get_one::<PathBuf>("pg-distrib-dir")
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -9,8 +9,8 @@ use log::*;
 use postgres::Client;
 use postgres::types::PgLsn;
 use postgres_ffi::{
-    PgMajorVersion, WAL_SEGMENT_SIZE, XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_LONG_PHD,
-    XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
+    WAL_SEGMENT_SIZE, XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD,
+    XLOG_SIZE_OF_XLOG_SHORT_PHD,
 };

 macro_rules! xlog_utils_test {
@@ -29,7 +29,7 @@ macro_rules! xlog_utils_test {
 postgres_ffi::for_all_postgres_versions! { xlog_utils_test }

 pub struct Conf {
-    pub pg_version: PgMajorVersion,
+    pub pg_version: u32,
    pub pg_distrib_dir: PathBuf,
    pub datadir: PathBuf,
 }
@@ -52,7 +52,11 @@ impl Conf {
    pub fn pg_distrib_dir(&self) -> anyhow::Result<PathBuf> {
        let path = self.pg_distrib_dir.clone();

-        Ok(path.join(self.pg_version.v_str()))
+        #[allow(clippy::manual_range_patterns)]
+        match self.pg_version {
+            14 | 15 | 16 | 17 => Ok(path.join(format!("v{}", self.pg_version))),
+            _ => bail!("Unsupported postgres version: {}", self.pg_version),
+        }
    }

    fn pg_bin_dir(&self) -> anyhow::Result<PathBuf> {
--- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
+++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
@@ -24,7 +24,7 @@ fn init_logging() {
 fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
    use crate::*;

-    let pg_version = MY_PGVERSION;
+    let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();

    // Craft some WAL
    let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
--- a/libs/postgres_initdb/Cargo.toml
+++ b/libs/postgres_initdb/Cargo.toml
@@ -9,5 +9,4 @@ anyhow.workspace = true
 tokio.workspace = true
 camino.workspace = true
 thiserror.workspace = true
-postgres_versioninfo.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/libs/postgres_initdb/src/lib.rs
+++ b/libs/postgres_initdb/src/lib.rs
@@ -7,13 +7,12 @@
 use std::fmt;

 use camino::Utf8Path;
-use postgres_versioninfo::PgMajorVersion;

 pub struct RunInitdbArgs<'a> {
    pub superuser: &'a str,
    pub locale: &'a str,
    pub initdb_bin: &'a Utf8Path,
-    pub pg_version: PgMajorVersion,
+    pub pg_version: u32,
    pub library_search_path: &'a Utf8Path,
    pub pgdata: &'a Utf8Path,
 }
@@ -80,16 +79,12 @@ pub async fn do_run_initdb(args: RunInitdbArgs<'_>) -> Result<(), Error> {
        .stderr(std::process::Stdio::piped());

    // Before version 14, only the libc provide was available.
-    if pg_version > PgMajorVersion::PG14 {
+    if pg_version > 14 {
        // Version 17 brought with it a builtin locale provider which only provides
        // C and C.UTF-8. While being safer for collation purposes since it is
        // guaranteed to be consistent throughout a major release, it is also more
        // performant.
-        let locale_provider = if pg_version >= PgMajorVersion::PG17 {
-            "builtin"
-        } else {
-            "libc"
-        };
+        let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" };

        initdb_command.args(["--locale-provider", locale_provider]);
    }
--- a/libs/postgres_versioninfo/Cargo.toml
+++ b/libs/postgres_versioninfo/Cargo.toml
@@ -1,12 +0,0 @@
-[package]
-name = "postgres_versioninfo"
-version = "0.1.0"
-edition = "2024"
-license.workspace = true
-
-[dependencies]
-anyhow.workspace = true
-thiserror.workspace = true
-serde.workspace = true
-serde_repr.workspace = true
-workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/libs/postgres_versioninfo/src/lib.rs
+++ b/libs/postgres_versioninfo/src/lib.rs
@@ -1,185 +0,0 @@
-use serde::ser::SerializeTuple;
-use serde::{Deserialize, Deserializer, Serialize, Serializer};
-use serde_repr::{Deserialize_repr, Serialize_repr};
-use std::fmt::{Display, Formatter};
-use std::str::FromStr;
-
-/// An enum with one variant for each major version of PostgreSQL that we support.
-///
-#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq, Deserialize_repr, Serialize_repr)]
-#[repr(u32)]
-pub enum PgMajorVersion {
-    PG14 = 14,
-    PG15 = 15,
-    PG16 = 16,
-    PG17 = 17,
-    // !!! When you add a new PgMajorVersion, don't forget to update PgMajorVersion::ALL
-}
-
-/// A full PostgreSQL version ID, in MMmmbb numerical format (Major/minor/bugfix)
-#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
-#[repr(transparent)]
-pub struct PgVersionId(u32);
-
-impl PgVersionId {
-    pub const UNKNOWN: PgVersionId = PgVersionId(0);
-
-    pub fn from_full_pg_version(version: u32) -> PgVersionId {
-        match version {
-            0 => PgVersionId(version), // unknown version
-            140000..180000 => PgVersionId(version),
-            _ => panic!("Invalid full PostgreSQL version ID {version}"),
-        }
-    }
-}
-
-impl Display for PgVersionId {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        u32::fmt(&self.0, f)
-    }
-}
-
-impl Serialize for PgVersionId {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: Serializer,
-    {
-        u32::serialize(&self.0, serializer)
-    }
-}
-
-impl<'de> Deserialize<'de> for PgVersionId {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: Deserializer<'de>,
-    {
-        u32::deserialize(deserializer).map(PgVersionId)
-    }
-
-    fn deserialize_in_place<D>(deserializer: D, place: &mut Self) -> Result<(), D::Error>
-    where
-        D: Deserializer<'de>,
-    {
-        u32::deserialize_in_place(deserializer, &mut place.0)
-    }
-}
-
-impl PgMajorVersion {
-    /// Get the numerical representation of the represented Major Version
-    pub const fn major_version_num(&self) -> u32 {
-        match self {
-            PgMajorVersion::PG14 => 14,
-            PgMajorVersion::PG15 => 15,
-            PgMajorVersion::PG16 => 16,
-            PgMajorVersion::PG17 => 17,
-        }
-    }
-
-    /// Get the contents of this version's PG_VERSION file.
-    ///
-    /// The PG_VERSION file is used to determine the PostgreSQL version that currently
-    /// owns the data in a PostgreSQL data directory.
-    pub fn versionfile_string(&self) -> String {
-        match self {
-            PgMajorVersion::PG17 => "17\x0A".to_string(),
-            PgMajorVersion::PG16 => "16\x0A".to_string(),
-            PgMajorVersion::PG15 => "15".to_string(),
-            PgMajorVersion::PG14 => "14".to_string(),
-        }
-    }
-
-    /// Get the v{version} string of this major PostgreSQL version.
-    ///
-    /// Because this was hand-coded in various places, this was moved into a shared
-    /// implementation.
-    pub fn v_str(&self) -> String {
-        match self {
-            PgMajorVersion::PG17 => "v17".to_string(),
-            PgMajorVersion::PG16 => "v16".to_string(),
-            PgMajorVersion::PG15 => "v15".to_string(),
-            PgMajorVersion::PG14 => "v14".to_string(),
-        }
-    }
-
-    /// All currently supported major versions of PostgreSQL.
-    pub const ALL: [PgMajorVersion; 4] = [
-        PgMajorVersion::PG14,
-        PgMajorVersion::PG15,
-        PgMajorVersion::PG16,
-        PgMajorVersion::PG17,
-    ];
-}
-
-impl Display for PgMajorVersion {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        match self {
-            PgMajorVersion::PG14 => {
-                write!(f, "PgMajorVersion::PG14")
-            }
-            PgMajorVersion::PG15 => {
-                write!(f, "PgMajorVersion::PG15")
-            }
-            PgMajorVersion::PG16 => {
-                write!(f, "PgMajorVersion::PG16")
-            }
-            PgMajorVersion::PG17 => {
-                write!(f, "PgMajorVersion::PG17")
-            }
-        }
-    }
-}
-
-#[derive(Debug, thiserror::Error)]
-#[allow(dead_code)]
-pub struct InvalidPgVersion(u32);
-
-impl Display for InvalidPgVersion {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        f.serialize_tuple_struct("InvalidPgVersion", 1)?
-            .serialize_element(&self.0)
-    }
-}
-
-impl TryFrom<PgVersionId> for PgMajorVersion {
-    type Error = InvalidPgVersion;
-
-    fn try_from(value: PgVersionId) -> Result<Self, Self::Error> {
-        Ok(match value.0 / 10000 {
-            14 => PgMajorVersion::PG14,
-            15 => PgMajorVersion::PG15,
-            16 => PgMajorVersion::PG16,
-            17 => PgMajorVersion::PG17,
-            _ => return Err(InvalidPgVersion(value.0)),
-        })
-    }
-}
-
-impl From<PgMajorVersion> for PgVersionId {
-    fn from(value: PgMajorVersion) -> Self {
-        PgVersionId((value as u32) * 10000)
-    }
-}
-
-#[derive(Debug, PartialEq, Eq, thiserror::Error)]
-pub struct PgMajorVersionParseError(String);
-
-impl Display for PgMajorVersionParseError {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        f.serialize_tuple_struct("PgMajorVersionParseError", 1)?
-            .serialize_element(&self.0)
-    }
-}
-
-impl FromStr for PgMajorVersion {
-    type Err = PgMajorVersionParseError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "14" => Ok(PgMajorVersion::PG14),
-            "15" => Ok(PgMajorVersion::PG15),
-            "16" => Ok(PgMajorVersion::PG16),
-            "17" => Ok(PgMajorVersion::PG17),
-            _ => Err(PgMajorVersionParseError(s.to_string())),
-        }
-    }
-}
--- a/libs/posthog_client_lite/src/background_loop.rs
+++ b/libs/posthog_client_lite/src/background_loop.rs
@@ -36,7 +36,10 @@ impl FeatureResolverBackgroundLoop {
        // Main loop of updating the feature flags.
        handle.spawn(
            async move {
-                tracing::info!("Starting PostHog feature resolver");
+                tracing::info!(
+                    "Starting PostHog feature resolver with refresh period: {:?}",
+                    refresh_period
+                );
                let mut ticker = tokio::time::interval(refresh_period);
                ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
                loop {
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -10,7 +10,6 @@ const_format.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 postgres_ffi.workspace = true
-postgres_versioninfo.workspace = true
 pq_proto.workspace = true
 tokio.workspace = true
 utils.workspace = true
--- a/libs/safekeeper_api/src/lib.rs
+++ b/libs/safekeeper_api/src/lib.rs
@@ -8,8 +8,6 @@ pub mod membership;
 /// Public API types
 pub mod models;

-pub use postgres_versioninfo::{PgMajorVersion, PgVersionId};
-
 /// Consensus logical timestamp. Note: it is a part of sk control file.
 pub type Term = u64;
 /// With this term timeline is created initially. It
@@ -22,7 +20,7 @@ pub const INITIAL_TERM: Term = 0;
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
 pub struct ServerInfo {
    /// Postgres server version
-    pub pg_version: PgVersionId,
+    pub pg_version: u32,
    pub system_id: SystemId,
    pub wal_seg_size: u32,
 }
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -4,7 +4,6 @@ use std::net::SocketAddr;

 use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::TimestampTz;
-use postgres_versioninfo::PgVersionId;
 use serde::{Deserialize, Serialize};
 use tokio::time::Instant;
 use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
@@ -24,7 +23,8 @@ pub struct TimelineCreateRequest {
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
    pub mconf: Configuration,
-    pub pg_version: PgVersionId,
+    /// In the PG_VERSION_NUM macro format, like 140017.
+    pub pg_version: u32,
    pub system_id: Option<u64>,
    // By default WAL_SEGMENT_SIZE
    pub wal_seg_size: Option<u32>,
--- a/libs/wal_decoder/benches/bench_interpret_wal.rs
+++ b/libs/wal_decoder/benches/bench_interpret_wal.rs
@@ -10,7 +10,7 @@ use futures::StreamExt;
 use futures::stream::FuturesUnordered;
 use pageserver_api::shard::{ShardIdentity, ShardStripeSize};
 use postgres_ffi::waldecoder::WalStreamDecoder;
-use postgres_ffi::{MAX_SEND_SIZE, PgMajorVersion, WAL_SEGMENT_SIZE};
+use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
 use pprof::criterion::{Output, PProfProfiler};
 use remote_storage::{
    DownloadOpts, GenericRemoteStorage, ListingMode, RemoteStorageConfig, RemoteStorageKind,
@@ -115,7 +115,7 @@ struct BenchmarkData {

 #[derive(Deserialize)]
 struct BenchmarkMetadata {
-    pg_version: PgMajorVersion,
+    pg_version: u32,
    start_lsn: Lsn,
 }

--- a/libs/wal_decoder/src/decoder.rs
+++ b/libs/wal_decoder/src/decoder.rs
@@ -7,8 +7,8 @@ use bytes::{Buf, Bytes};
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
+use postgres_ffi::pg_constants;
 use postgres_ffi::walrecord::*;
-use postgres_ffi::{PgMajorVersion, pg_constants};
 use postgres_ffi_types::forknum::VISIBILITYMAP_FORKNUM;
 use utils::lsn::Lsn;

@@ -24,7 +24,7 @@ impl InterpretedWalRecord {
        buf: Bytes,
        shards: &[ShardIdentity],
        next_record_lsn: Lsn,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
    ) -> anyhow::Result<HashMap<ShardIdentity, InterpretedWalRecord>> {
        let mut decoded = DecodedWALRecord::default();
        decode_wal_record(buf, &mut decoded, pg_version)?;
@@ -78,7 +78,7 @@ impl MetadataRecord {
        decoded: &DecodedWALRecord,
        shard_records: &mut HashMap<ShardIdentity, InterpretedWalRecord>,
        next_record_lsn: Lsn,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
    ) -> anyhow::Result<()> {
        // Note: this doesn't actually copy the bytes since
        // the [`Bytes`] type implements it via a level of indirection.
@@ -193,7 +193,7 @@ impl MetadataRecord {
    fn decode_heapam_record(
        buf: &mut Bytes,
        decoded: &DecodedWALRecord,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        // Handle VM bit updates that are implicitly part of heap records.

@@ -205,7 +205,7 @@ impl MetadataRecord {
        let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;

        match pg_version {
-            PgMajorVersion::PG14 => {
+            14 => {
                if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;

@@ -272,7 +272,7 @@ impl MetadataRecord {
                    anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
                }
            }
-            PgMajorVersion::PG15 => {
+            15 => {
                if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;

@@ -339,7 +339,7 @@ impl MetadataRecord {
                    anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
                }
            }
-            PgMajorVersion::PG16 => {
+            16 => {
                if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;

@@ -406,7 +406,7 @@ impl MetadataRecord {
                    anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
                }
            }
-            PgMajorVersion::PG17 => {
+            17 => {
                if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
                    let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;

@@ -473,6 +473,7 @@ impl MetadataRecord {
                    anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
                }
            }
+            _ => {}
        }

        if new_heap_blkno.is_some() || old_heap_blkno.is_some() {
@@ -499,7 +500,7 @@ impl MetadataRecord {
    fn decode_neonmgr_record(
        buf: &mut Bytes,
        decoded: &DecodedWALRecord,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        // Handle VM bit updates that are implicitly part of heap records.

@@ -513,7 +514,7 @@ impl MetadataRecord {
        assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID);

        match pg_version {
-            PgMajorVersion::PG16 | PgMajorVersion::PG17 => {
+            16 | 17 => {
                let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;

                match info {
@@ -573,7 +574,7 @@ impl MetadataRecord {
                    info => anyhow::bail!("Unknown WAL record type for Neon RMGR: {}", info),
                }
            }
-            PgMajorVersion::PG15 | PgMajorVersion::PG14 => anyhow::bail!(
+            _ => anyhow::bail!(
                "Neon RMGR has no known compatibility with PostgreSQL version {}",
                pg_version
            ),
@@ -628,121 +629,116 @@ impl MetadataRecord {
    fn decode_dbase_record(
        buf: &mut Bytes,
        decoded: &DecodedWALRecord,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        // TODO: Refactor this to avoid the duplication between postgres versions.

        let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
        tracing::debug!(%info, %pg_version, "handle RM_DBASE_ID");

-        match pg_version {
-            PgMajorVersion::PG14 => {
-                if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE {
-                    let createdb = XlCreateDatabase::decode(buf);
-                    tracing::debug!("XLOG_DBASE_CREATE v14");
+        if pg_version == 14 {
+            if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE {
+                let createdb = XlCreateDatabase::decode(buf);
+                tracing::debug!("XLOG_DBASE_CREATE v14");

-                    let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
-                        db_id: createdb.db_id,
-                        tablespace_id: createdb.tablespace_id,
-                        src_db_id: createdb.src_db_id,
-                        src_tablespace_id: createdb.src_tablespace_id,
-                    }));
+                let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
+                    db_id: createdb.db_id,
+                    tablespace_id: createdb.tablespace_id,
+                    src_db_id: createdb.src_db_id,
+                    src_tablespace_id: createdb.src_tablespace_id,
+                }));

-                    return Ok(Some(record));
-                } else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP {
-                    let dropdb = XlDropDatabase::decode(buf);
+                return Ok(Some(record));
+            } else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP {
+                let dropdb = XlDropDatabase::decode(buf);

-                    let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
-                        db_id: dropdb.db_id,
-                        tablespace_ids: dropdb.tablespace_ids,
-                    }));
+                let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
+                    db_id: dropdb.db_id,
+                    tablespace_ids: dropdb.tablespace_ids,
+                }));

-                    return Ok(Some(record));
-                }
+                return Ok(Some(record));
            }
-            PgMajorVersion::PG15 => {
-                if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG {
-                    tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
-                } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY {
-                    // The XLOG record was renamed between v14 and v15,
-                    // but the record format is the same.
-                    // So we can reuse XlCreateDatabase here.
-                    tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY");
+        } else if pg_version == 15 {
+            if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG {
+                tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
+            } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY {
+                // The XLOG record was renamed between v14 and v15,
+                // but the record format is the same.
+                // So we can reuse XlCreateDatabase here.
+                tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY");

-                    let createdb = XlCreateDatabase::decode(buf);
-                    let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
-                        db_id: createdb.db_id,
-                        tablespace_id: createdb.tablespace_id,
-                        src_db_id: createdb.src_db_id,
-                        src_tablespace_id: createdb.src_tablespace_id,
-                    }));
+                let createdb = XlCreateDatabase::decode(buf);
+                let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
+                    db_id: createdb.db_id,
+                    tablespace_id: createdb.tablespace_id,
+                    src_db_id: createdb.src_db_id,
+                    src_tablespace_id: createdb.src_tablespace_id,
+                }));

-                    return Ok(Some(record));
-                } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP {
-                    let dropdb = XlDropDatabase::decode(buf);
-                    let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
-                        db_id: dropdb.db_id,
-                        tablespace_ids: dropdb.tablespace_ids,
-                    }));
+                return Ok(Some(record));
+            } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP {
+                let dropdb = XlDropDatabase::decode(buf);
+                let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
+                    db_id: dropdb.db_id,
+                    tablespace_ids: dropdb.tablespace_ids,
+                }));

-                    return Ok(Some(record));
-                }
+                return Ok(Some(record));
            }
-            PgMajorVersion::PG16 => {
-                if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG {
-                    tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
-                } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY {
-                    // The XLOG record was renamed between v14 and v15,
-                    // but the record format is the same.
-                    // So we can reuse XlCreateDatabase here.
-                    tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY");
+        } else if pg_version == 16 {
+            if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG {
+                tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
+            } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY {
+                // The XLOG record was renamed between v14 and v15,
+                // but the record format is the same.
+                // So we can reuse XlCreateDatabase here.
+                tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY");

-                    let createdb = XlCreateDatabase::decode(buf);
-                    let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
-                        db_id: createdb.db_id,
-                        tablespace_id: createdb.tablespace_id,
-                        src_db_id: createdb.src_db_id,
-                        src_tablespace_id: createdb.src_tablespace_id,
-                    }));
+                let createdb = XlCreateDatabase::decode(buf);
+                let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
+                    db_id: createdb.db_id,
+                    tablespace_id: createdb.tablespace_id,
+                    src_db_id: createdb.src_db_id,
+                    src_tablespace_id: createdb.src_tablespace_id,
+                }));

-                    return Ok(Some(record));
-                } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP {
-                    let dropdb = XlDropDatabase::decode(buf);
-                    let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
-                        db_id: dropdb.db_id,
-                        tablespace_ids: dropdb.tablespace_ids,
-                    }));
+                return Ok(Some(record));
+            } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP {
+                let dropdb = XlDropDatabase::decode(buf);
+                let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
+                    db_id: dropdb.db_id,
+                    tablespace_ids: dropdb.tablespace_ids,
+                }));

-                    return Ok(Some(record));
-                }
+                return Ok(Some(record));
            }
-            PgMajorVersion::PG17 => {
-                if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG {
-                    tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
-                } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY {
-                    // The XLOG record was renamed between v14 and v15,
-                    // but the record format is the same.
-                    // So we can reuse XlCreateDatabase here.
-                    tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY");
+        } else if pg_version == 17 {
+            if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG {
+                tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
+            } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY {
+                // The XLOG record was renamed between v14 and v15,
+                // but the record format is the same.
+                // So we can reuse XlCreateDatabase here.
+                tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY");

-                    let createdb = XlCreateDatabase::decode(buf);
-                    let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
-                        db_id: createdb.db_id,
-                        tablespace_id: createdb.tablespace_id,
-                        src_db_id: createdb.src_db_id,
-                        src_tablespace_id: createdb.src_tablespace_id,
-                    }));
+                let createdb = XlCreateDatabase::decode(buf);
+                let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
+                    db_id: createdb.db_id,
+                    tablespace_id: createdb.tablespace_id,
+                    src_db_id: createdb.src_db_id,
+                    src_tablespace_id: createdb.src_tablespace_id,
+                }));

-                    return Ok(Some(record));
-                } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP {
-                    let dropdb = XlDropDatabase::decode(buf);
-                    let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
-                        db_id: dropdb.db_id,
-                        tablespace_ids: dropdb.tablespace_ids,
-                    }));
+                return Ok(Some(record));
+            } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP {
+                let dropdb = XlDropDatabase::decode(buf);
+                let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
+                    db_id: dropdb.db_id,
+                    tablespace_ids: dropdb.tablespace_ids,
+                }));

-                    return Ok(Some(record));
-                }
+                return Ok(Some(record));
            }
        }

@@ -752,12 +748,12 @@ impl MetadataRecord {
    fn decode_clog_record(
        buf: &mut Bytes,
        decoded: &DecodedWALRecord,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;

        if info == pg_constants::CLOG_ZEROPAGE {
-            let pageno = if pg_version < PgMajorVersion::PG17 {
+            let pageno = if pg_version < 17 {
                buf.get_u32_le()
            } else {
                buf.get_u64_le() as u32
@@ -769,7 +765,7 @@ impl MetadataRecord {
                ClogZeroPage { segno, rpageno },
            ))))
        } else {
-            assert_eq!(info, pg_constants::CLOG_TRUNCATE);
+            assert!(info == pg_constants::CLOG_TRUNCATE);
            let xlrec = XlClogTruncate::decode(buf, pg_version);

            Ok(Some(MetadataRecord::Clog(ClogRecord::Truncate(
@@ -842,14 +838,14 @@ impl MetadataRecord {
    fn decode_multixact_record(
        buf: &mut Bytes,
        decoded: &DecodedWALRecord,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
    ) -> anyhow::Result<Option<MetadataRecord>> {
        let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;

        if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE
            || info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE
        {
-            let pageno = if pg_version < PgMajorVersion::PG17 {
+            let pageno = if pg_version < 17 {
                buf.get_u32_le()
            } else {
                buf.get_u64_le() as u32
--- a/libs/wal_decoder/src/serialized_batch.rs
+++ b/libs/wal_decoder/src/serialized_batch.rs
@@ -13,7 +13,7 @@ use pageserver_api::keyspace::KeySpace;
 use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::walrecord::{DecodedBkpBlock, DecodedWALRecord};
-use postgres_ffi::{BLCKSZ, PgMajorVersion, page_is_new, page_set_lsn, pg_constants};
+use postgres_ffi::{BLCKSZ, page_is_new, page_set_lsn, pg_constants};
 use serde::{Deserialize, Serialize};
 use utils::bin_ser::BeSer;
 use utils::lsn::Lsn;
@@ -139,7 +139,7 @@ impl SerializedValueBatch {
        decoded: DecodedWALRecord,
        shard_records: &mut HashMap<ShardIdentity, InterpretedWalRecord>,
        next_record_lsn: Lsn,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
    ) -> anyhow::Result<()> {
        // First determine how big the buffers need to be and allocate it up-front.
        // This duplicates some of the work below, but it's empirically much faster.
@@ -267,7 +267,7 @@ impl SerializedValueBatch {
    fn estimate_buffer_size(
        decoded: &DecodedWALRecord,
        shard: &ShardIdentity,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
    ) -> usize {
        let mut estimate: usize = 0;

@@ -303,11 +303,7 @@ impl SerializedValueBatch {
        estimate
    }

-    fn block_is_image(
-        decoded: &DecodedWALRecord,
-        blk: &DecodedBkpBlock,
-        pg_version: PgMajorVersion,
-    ) -> bool {
+    fn block_is_image(decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, pg_version: u32) -> bool {
        blk.apply_image
            && blk.has_image
            && decoded.xl_rmid == pg_constants::RM_XLOG_ID
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -12,6 +12,9 @@ testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "

 fuzz-read-path = ["testing"]

+# Enables benchmarking-only APIs
+benchmarking = []
+
 [dependencies]
 anyhow.workspace = true
 arc-swap.workspace = true
@@ -51,6 +54,7 @@ pageserver_api.workspace = true
 pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
 pageserver_compaction.workspace = true
 pageserver_page_api.workspace = true
+peekable.workspace = true
 pem.workspace = true
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
@@ -63,6 +67,7 @@ postgres-types.workspace = true
 posthog_client_lite.workspace = true
 pprof.workspace = true
 pq_proto.workspace = true
+prost.workspace = true
 rand.workspace = true
 range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
@@ -127,6 +132,7 @@ harness = false
 [[bench]]
 name = "bench_ingest"
 harness = false
+required-features = ["benchmarking"]

 [[bench]]
 name = "upload_queue"
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -1,22 +1,29 @@
 use std::env;
 use std::num::NonZeroUsize;
+use std::sync::Arc;

 use bytes::Bytes;
 use camino::Utf8PathBuf;
 use criterion::{Criterion, criterion_group, criterion_main};
+use futures::stream::FuturesUnordered;
 use pageserver::config::PageServerConf;
 use pageserver::context::{DownloadBehavior, RequestContext};
+use pageserver::keyspace::KeySpace;
 use pageserver::l0_flush::{L0FlushConfig, L0FlushGlobalState};
 use pageserver::task_mgr::TaskKind;
-use pageserver::tenant::storage_layer::InMemoryLayer;
+use pageserver::tenant::storage_layer::IoConcurrency;
+use pageserver::tenant::storage_layer::{InMemoryLayer, ValuesReconstructState};
 use pageserver::{page_cache, virtual_file};
+use pageserver_api::config::GetVectoredConcurrentIo;
 use pageserver_api::key::Key;
 use pageserver_api::models::virtual_file::IoMode;
 use pageserver_api::shard::TenantShardId;
-use strum::IntoEnumIterator;
+use tokio_stream::StreamExt;
 use tokio_util::sync::CancellationToken;
 use utils::bin_ser::BeSer;
 use utils::id::{TenantId, TimelineId};
+use utils::lsn::Lsn;
+use utils::sync::gate::Gate;
 use wal_decoder::models::value::Value;
 use wal_decoder::serialized_batch::SerializedValueBatch;

@@ -30,7 +37,7 @@ fn murmurhash32(mut h: u32) -> u32 {
    h
 }

-#[derive(serde::Serialize, Clone, Copy, Debug)]
+#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)]
 enum KeyLayout {
    /// Sequential unique keys
    Sequential,
@@ -40,19 +47,30 @@ enum KeyLayout {
    RandomReuse(u32),
 }

-#[derive(serde::Serialize, Clone, Copy, Debug)]
+#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)]
 enum WriteDelta {
    Yes,
    No,
 }

+#[derive(serde::Serialize, Clone, Copy, Debug, PartialEq)]
+enum ConcurrentReads {
+    Yes,
+    No,
+}
+
 async fn ingest(
    conf: &'static PageServerConf,
    put_size: usize,
    put_count: usize,
    key_layout: KeyLayout,
    write_delta: WriteDelta,
+    concurrent_reads: ConcurrentReads,
 ) -> anyhow::Result<()> {
+    if concurrent_reads == ConcurrentReads::Yes {
+        assert_eq!(key_layout, KeyLayout::Sequential);
+    }
+
    let mut lsn = utils::lsn::Lsn(1000);
    let mut key = Key::from_i128(0x0);

@@ -68,16 +86,18 @@ async fn ingest(
    let gate = utils::sync::gate::Gate::default();
    let cancel = CancellationToken::new();

-    let layer = InMemoryLayer::create(
-        conf,
-        timeline_id,
-        tenant_shard_id,
-        lsn,
-        &gate,
-        &cancel,
-        &ctx,
-    )
-    .await?;
+    let layer = Arc::new(
+        InMemoryLayer::create(
+            conf,
+            timeline_id,
+            tenant_shard_id,
+            lsn,
+            &gate,
+            &cancel,
+            &ctx,
+        )
+        .await?,
+    );

    let data = Value::Image(Bytes::from(vec![0u8; put_size]));
    let data_ser_size = data.serialized_size().unwrap() as usize;
@@ -86,6 +106,61 @@ async fn ingest(
        pageserver::context::DownloadBehavior::Download,
    );

+    const READ_BATCH_SIZE: u32 = 32;
+    let (tx, mut rx) = tokio::sync::watch::channel::<Option<Key>>(None);
+    let reader_cancel = CancellationToken::new();
+    let reader_handle = if concurrent_reads == ConcurrentReads::Yes {
+        Some(tokio::task::spawn({
+            let cancel = reader_cancel.clone();
+            let layer = layer.clone();
+            let ctx = ctx.attached_child();
+            async move {
+                let gate = Gate::default();
+                let gate_guard = gate.enter().unwrap();
+                let io_concurrency = IoConcurrency::spawn_from_conf(
+                    GetVectoredConcurrentIo::SidecarTask,
+                    gate_guard,
+                );
+
+                rx.wait_for(|key| key.is_some()).await.unwrap();
+
+                while !cancel.is_cancelled() {
+                    let key = match *rx.borrow() {
+                        Some(some) => some,
+                        None => unreachable!(),
+                    };
+
+                    let mut start_key = key;
+                    start_key.field6 = key.field6.saturating_sub(READ_BATCH_SIZE);
+                    let key_range = start_key..key.next();
+
+                    let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
+
+                    layer
+                        .get_values_reconstruct_data(
+                            KeySpace::single(key_range),
+                            Lsn(1)..Lsn(u64::MAX),
+                            &mut reconstruct_state,
+                            &ctx,
+                        )
+                        .await
+                        .unwrap();
+
+                    let mut collect_futs = std::mem::take(&mut reconstruct_state.keys)
+                        .into_values()
+                        .map(|state| state.sink_pending_ios())
+                        .collect::<FuturesUnordered<_>>();
+                    while collect_futs.next().await.is_some() {}
+                }
+
+                drop(io_concurrency);
+                gate.close().await;
+            }
+        }))
+    } else {
+        None
+    };
+
    const BATCH_SIZE: usize = 16;
    let mut batch = Vec::new();

@@ -113,19 +188,27 @@ async fn ingest(

        batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
        if batch.len() >= BATCH_SIZE {
+            let last_key = Key::from_compact(batch.last().unwrap().0);
+
            let this_batch = std::mem::take(&mut batch);
            let serialized = SerializedValueBatch::from_values(this_batch);
            layer.put_batch(serialized, &ctx).await?;
+
+            tx.send(Some(last_key)).unwrap();
        }
    }
    if !batch.is_empty() {
+        let last_key = Key::from_compact(batch.last().unwrap().0);
+
        let this_batch = std::mem::take(&mut batch);
        let serialized = SerializedValueBatch::from_values(this_batch);
        layer.put_batch(serialized, &ctx).await?;
+
+        tx.send(Some(last_key)).unwrap();
    }
    layer.freeze(lsn + 1).await;

-    if matches!(write_delta, WriteDelta::Yes) {
+    if write_delta == WriteDelta::Yes {
        let l0_flush_state = L0FlushGlobalState::new(L0FlushConfig::Direct {
            max_concurrency: NonZeroUsize::new(1).unwrap(),
        });
@@ -136,6 +219,11 @@ async fn ingest(
        tokio::fs::remove_file(path).await?;
    }

+    reader_cancel.cancel();
+    if let Some(handle) = reader_handle {
+        handle.await.unwrap();
+    }
+
    Ok(())
 }

@@ -147,6 +235,7 @@ fn ingest_main(
    put_count: usize,
    key_layout: KeyLayout,
    write_delta: WriteDelta,
+    concurrent_reads: ConcurrentReads,
 ) {
    pageserver::virtual_file::set_io_mode(io_mode);

@@ -156,7 +245,15 @@ fn ingest_main(
        .unwrap();

    runtime.block_on(async move {
-        let r = ingest(conf, put_size, put_count, key_layout, write_delta).await;
+        let r = ingest(
+            conf,
+            put_size,
+            put_count,
+            key_layout,
+            write_delta,
+            concurrent_reads,
+        )
+        .await;
        if let Err(e) = r {
            panic!("{e:?}");
        }
@@ -195,6 +292,7 @@ fn criterion_benchmark(c: &mut Criterion) {
        key_size: usize,
        key_layout: KeyLayout,
        write_delta: WriteDelta,
+        concurrent_reads: ConcurrentReads,
    }
    #[derive(Clone)]
    struct HandPickedParameters {
@@ -245,7 +343,7 @@ fn criterion_benchmark(c: &mut Criterion) {
    ];
    let exploded_parameters = {
        let mut out = Vec::new();
-        for io_mode in IoMode::iter() {
+        for concurrent_reads in [ConcurrentReads::Yes, ConcurrentReads::No] {
            for param in expect.clone() {
                let HandPickedParameters {
                    volume_mib,
@@ -253,12 +351,18 @@ fn criterion_benchmark(c: &mut Criterion) {
                    key_layout,
                    write_delta,
                } = param;
+
+                if key_layout != KeyLayout::Sequential && concurrent_reads == ConcurrentReads::Yes {
+                    continue;
+                }
+
                out.push(ExplodedParameters {
-                    io_mode,
+                    io_mode: IoMode::DirectRw,
                    volume_mib,
                    key_size,
                    key_layout,
                    write_delta,
+                    concurrent_reads,
                });
            }
        }
@@ -272,9 +376,10 @@ fn criterion_benchmark(c: &mut Criterion) {
                key_size,
                key_layout,
                write_delta,
+                concurrent_reads,
            } = self;
            format!(
-                "io_mode={io_mode:?} volume_mib={volume_mib:?} key_size_bytes={key_size:?} key_layout={key_layout:?} write_delta={write_delta:?}"
+                "io_mode={io_mode:?} volume_mib={volume_mib:?} key_size_bytes={key_size:?} key_layout={key_layout:?} write_delta={write_delta:?} concurrent_reads={concurrent_reads:?}"
            )
        }
    }
@@ -287,12 +392,23 @@ fn criterion_benchmark(c: &mut Criterion) {
            key_size,
            key_layout,
            write_delta,
+            concurrent_reads,
        } = params;
        let put_count = volume_mib * 1024 * 1024 / key_size;
        group.throughput(criterion::Throughput::Bytes((key_size * put_count) as u64));
        group.sample_size(10);
        group.bench_function(id, |b| {
-            b.iter(|| ingest_main(conf, io_mode, key_size, put_count, key_layout, write_delta))
+            b.iter(|| {
+                ingest_main(
+                    conf,
+                    io_mode,
+                    key_size,
+                    put_count,
+                    key_layout,
+                    write_delta,
+                    concurrent_reads,
+                )
+            })
        });
    }
 }
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
--- a/pageserver/client/Cargo.toml
+++ b/pageserver/client/Cargo.toml
@@ -18,7 +18,6 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" }
 tokio-postgres.workspace = true
 tokio-stream.workspace = true
 tokio.workspace = true
-postgres_versioninfo.workspace = true
 futures.workspace = true
 tokio-util.workspace = true
 anyhow.workspace = true
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -7,7 +7,6 @@ use detach_ancestor::AncestorDetached;
 use http_utils::error::HttpErrorBody;
 use pageserver_api::models::*;
 use pageserver_api::shard::TenantShardId;
-use postgres_versioninfo::PgMajorVersion;
 pub use reqwest::Body as ReqwestBody;
 use reqwest::{IntoUrl, Method, StatusCode, Url};
 use utils::id::{TenantId, TimelineId};
@@ -746,11 +745,9 @@ impl Client {
        timeline_id: TimelineId,
        base_lsn: Lsn,
        end_lsn: Lsn,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
        basebackup_tarball: ReqwestBody,
    ) -> Result<()> {
-        let pg_version = pg_version.major_version_num();
-
        let uri = format!(
            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}",
            self.mgmt_api_endpoint,
--- a/pageserver/client_grpc/Cargo.toml
+++ b/pageserver/client_grpc/Cargo.toml
@@ -0,0 +1,30 @@
+[package]
+name = "pageserver_client_grpc"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+bytes.workspace = true
+futures.workspace = true
+http.workspace = true
+thiserror.workspace = true
+tonic.workspace = true
+tracing.workspace = true
+tokio = { version = "1.43.1", features = ["full", "macros", "net", "io-util", "rt", "rt-multi-thread"] }
+uuid = { version = "1", features = ["v4"] }
+tower = {  version = "0.4", features = ["timeout", "util"] }
+rand = "0.8"
+tokio-util = { version = "0.7", features = ["compat"] }
+hyper-util = "0.1.9"
+hyper = "1.6.0"
+metrics.workspace = true
+priority-queue = "2.3.1"
+async-trait = { version = "0.1" }
+tokio-stream = "0.1"
+dashmap = "5"
+chrono = { version = "0.4", features = ["serde"] }
+
+
+pageserver_page_api.workspace = true
+pageserver_api.workspace = true
+utils.workspace = true
--- a/pageserver/client_grpc/examples/load_test.rs
+++ b/pageserver/client_grpc/examples/load_test.rs
@@ -0,0 +1,295 @@
+// examples/load_test.rs, generated by AI
+
+use std::collections::{HashMap, HashSet};
+use std::sync::{
+    Arc,
+    Mutex,
+    atomic::{AtomicU64, AtomicUsize, Ordering},
+};
+use std::time::{Duration, Instant};
+
+use tokio::task;
+use tokio::time::sleep;
+use rand::Rng;
+use tonic::Status;
+
+// Pull in your ConnectionPool and PooledItemFactory from the pageserver_client_grpc crate.
+// Adjust these paths if necessary.
+use pageserver_client_grpc::client_cache::ConnectionPool;
+use pageserver_client_grpc::client_cache::PooledItemFactory;
+
+// --------------------------------------
+// GLOBAL COUNTERS FOR “CREATED” / “DROPPED” MockConnections
+// --------------------------------------
+static CREATED: AtomicU64 = AtomicU64::new(0);
+static DROPPED: AtomicU64 = AtomicU64::new(0);
+
+// --------------------------------------
+// MockConnection + Factory
+// --------------------------------------
+
+#[derive(Debug)]
+pub struct MockConnection {
+    pub id: u64,
+}
+
+impl Clone for MockConnection {
+    fn clone(&self) -> Self {
+        // Every clone is a separate value whose Drop bumps DROPPED, so it must bump CREATED
+        // as well; otherwise the final CREATED == DROPPED leak check in main() could not hold.
+        CREATED.fetch_add(1, Ordering::Relaxed);
+        MockConnection { id: self.id }
+    }
+}
+
+impl Drop for MockConnection {
+    fn drop(&mut self) {
+        // When a MockConnection actually gets dropped, bump the counter.
+        DROPPED.fetch_add(1, Ordering::SeqCst);
+    }
+}
+
+pub struct MockConnectionFactory {
+    counter: AtomicU64,
+}
+
+impl MockConnectionFactory {
+    pub fn new() -> Self {
+        MockConnectionFactory {
+            counter: AtomicU64::new(1),
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl PooledItemFactory<MockConnection> for MockConnectionFactory {
+    /// The trait on ConnectionPool expects:
+    ///   async fn create(&self, timeout: Duration)
+    ///       -> Result<Result<MockConnection, Status>, tokio::time::error::Elapsed>;
+    ///
+    /// On success: Ok(Ok(MockConnection))
+    /// On a simulated “gRPC” failure: Ok(Err(Status::…))
+    /// On a timeout: Err(tokio::time::error::Elapsed) (this mock never returns it)
+    async fn create(
+        &self,
+        _timeout: Duration,
+    ) -> Result<Result<MockConnection, Status>, tokio::time::error::Elapsed> {
+        // Simulate connection creation immediately succeeding.
+        CREATED.fetch_add(1, Ordering::SeqCst);
+        let next_id = self.counter.fetch_add(1, Ordering::Relaxed);
+        Ok(Ok(MockConnection { id: next_id }))
+    }
+}
+
+// --------------------------------------
+// CLIENT WORKER
+// --------------------------------------
+//
+// Each worker repeatedly calls `pool.get_client().await`. When it succeeds, we:
+//  1. Lock the shared Mutex<HashMap<u64, Arc<AtomicUsize>>> to fetch/insert an Arc<AtomicUsize> for this conn_id.
+//  2. Lock the shared Mutex<HashSet<u64>> to record this conn_id as “seen.”
+//  3. Drop both locks, then atomically increment that counter and assert it ≤ max_consumers.
+//  4. Sleep 10–100 ms to simulate “work.”
+//  5. Atomically decrement the counter.
+//  6. Call `pooled.finish(Ok(()))` to return to the pool.
+
+async fn client_worker(
+    pool: Arc<ConnectionPool<MockConnection>>,
+    usage_map: Arc<Mutex<HashMap<u64, Arc<AtomicUsize>>>>,
+    seen_set: Arc<Mutex<HashSet<u64>>>,
+    max_consumers: usize,
+    worker_id: usize,
+) {
+    for iteration in 0..10 {
+        match pool.clone().get_client().await {
+            Ok(pooled) => {
+                let conn: MockConnection = pooled.channel();
+                let conn_id = conn.id;
+
+                // 1. Fetch or insert the Arc<AtomicUsize> for this conn_id:
+                let counter_arc: Arc<AtomicUsize> = {
+                    let mut guard = usage_map.lock().unwrap();
+                    guard
+                        .entry(conn_id)
+                        .or_insert_with(|| Arc::new(AtomicUsize::new(0)))
+                        .clone()
+                    // MutexGuard is dropped here
+                };
+
+                // 2. Record this conn_id in the shared HashSet of “seen” IDs:
+                {
+                    let mut seen_guard = seen_set.lock().unwrap();
+                    seen_guard.insert(conn_id);
+                    // MutexGuard is dropped immediately
+                }
+
+                // 3. Atomically bump the count for this connection ID
+                let prev = counter_arc.fetch_add(1, Ordering::SeqCst);
+                let current = prev + 1;
+                assert!(
+                    current <= max_consumers,
+                    "Connection {} exceeded max_consumers (got {})",
+                    conn_id,
+                    current
+                );
+
+                println!(
+                    "[worker {}][iter {}] got MockConnection id={} ({} concurrent)",
+                    worker_id, iteration, conn_id, current
+                );
+
+                // 4. Simulate some work (10–100 ms)
+                let delay_ms = rand::thread_rng().gen_range(10..100);
+                sleep(Duration::from_millis(delay_ms)).await;
+
+                // 5. Decrement the usage counter
+                let prev2 = counter_arc.fetch_sub(1, Ordering::SeqCst);
+                let after = prev2 - 1;
+                println!(
+                    "[worker {}][iter {}] returning MockConnection id={} (now {} remain)",
+                    worker_id, iteration, conn_id, after
+                );
+
+                // 6. Return to the pool (mark success)
+                pooled.finish(Ok(())).await;
+            }
+            Err(status) => {
+                eprintln!(
+                    "[worker {}][iter {}] failed to get client: {:?}",
+                    worker_id, iteration, status
+                );
+            }
+        }
+
+        // Small random pause before next iteration to spread out load
+        let pause = rand::thread_rng().gen_range(0..20);
+        sleep(Duration::from_millis(pause)).await;
+    }
+}
+
+#[tokio::main(flavor = "multi_thread", worker_threads = 8)]
+async fn main() {
+    // --------------------------------------
+    // 1. Create factory and shared instrumentation
+    // --------------------------------------
+    let factory = Arc::new(MockConnectionFactory::new());
+
+    // Shared map: connection ID → Arc<AtomicUsize>
+    let usage_map: Arc<Mutex<HashMap<u64, Arc<AtomicUsize>>>> =
+        Arc::new(Mutex::new(HashMap::new()));
+
+    // Shared set: record each unique connection ID we actually saw
+    let seen_set: Arc<Mutex<HashSet<u64>>> = Arc::new(Mutex::new(HashSet::new()));
+
+    // --------------------------------------
+    // 2. Pool parameters
+    // --------------------------------------
+    let connect_timeout    = Duration::from_millis(500);
+    let connect_backoff    = Duration::from_millis(100);
+    let max_consumers      = 100;                 // test limit
+    let error_threshold    = 2;                 // mock never fails
+    let max_idle_duration  = Duration::from_secs(2);
+    let max_total_connections  = 3;
+    let aggregate_metrics  = None;
+
+    let pool: Arc<ConnectionPool<MockConnection>> = ConnectionPool::new(
+        factory,
+        connect_timeout,
+        connect_backoff,
+        max_consumers,
+        error_threshold,
+        max_idle_duration,
+        max_total_connections,
+        aggregate_metrics,
+    );
+
+    // --------------------------------------
+    // 3. Spawn worker tasks
+    // --------------------------------------
+    let num_workers = 10000;
+    let mut handles = Vec::with_capacity(num_workers);
+    let start_time = Instant::now();
+
+    for worker_id in 0..num_workers {
+        let pool_clone   = Arc::clone(&pool);
+        let usage_clone  = Arc::clone(&usage_map);
+        let seen_clone   = Arc::clone(&seen_set);
+        let mc           = max_consumers;
+
+        let handle = task::spawn(async move {
+            client_worker(pool_clone, usage_clone, seen_clone, mc, worker_id).await;
+        });
+        handles.push(handle);
+    }
+
+    // --------------------------------------
+    // 4. Wait for workers; a worker panic (e.g. the max_consumers assert) must fail the test
+    // --------------------------------------
+    for handle in handles {
+        handle.await.expect("worker task panicked");
+    }
+    let elapsed = Instant::now().duration_since(start_time);
+    println!(
+        "All {} workers completed in {:?}",
+        num_workers, elapsed
+    );
+
+    // --------------------------------------
+    // 5. Print the total number of unique connections seen so far
+    // --------------------------------------
+    let unique_count = {
+        let seen_guard = seen_set.lock().unwrap();
+        seen_guard.len()
+    };
+    println!("Total unique connections used by workers: {}", unique_count);
+
+    // --------------------------------------
+    // 6. Sleep so the background sweeper can run (max_idle_duration = 2 s)
+    // --------------------------------------
+    sleep(Duration::from_secs(3)).await;
+
+    // --------------------------------------
+    // 7. Shutdown the pool
+    // --------------------------------------
+    let shutdown_pool = Arc::clone(&pool);
+    shutdown_pool.shutdown().await;
+    println!("Pool.shutdown() returned.");
+
+    // --------------------------------------
+    // 8. Verify that no background task still holds an Arc clone of `pool`.
+    //    If any task is still alive (sweeper/create_connection), strong_count > 1.
+    // --------------------------------------
+    sleep(Duration::from_secs(1)).await; // give tasks time to exit
+    let sc = Arc::strong_count(&pool);
+    assert!(
+        sc == 1,
+        "Pool tasks did not all terminate: Arc::strong_count = {} (expected 1)",
+        sc
+    );
+    println!("Verified: all pool tasks have terminated (strong_count == 1).");
+
+    // --------------------------------------
+    // 9. Verify no MockConnection was leaked:
+    //    CREATED must equal DROPPED.
+    // --------------------------------------
+    let created = CREATED.load(Ordering::SeqCst);
+    let dropped = DROPPED.load(Ordering::SeqCst);
+    assert!(
+        created == dropped,
+        "Leaked connections: created={} but dropped={}",
+        created,
+        dropped
+    );
+    println!(
+        "Verified: no connections leaked (created = {}, dropped = {}).",
+        created, dropped
+    );
+
+    // --------------------------------------
+    // 10. Because `client_worker` asserted inside that no connection
+    //     ever exceeded `max_consumers`, reaching this point means that check passed.
+    // --------------------------------------
+    println!("All per-connection usage stayed within max_consumers = {}.", max_consumers);
+
+    println!("Load test complete; exiting cleanly.");
+}
--- a/pageserver/client_grpc/examples/request_tracker_load_test.rs
+++ b/pageserver/client_grpc/examples/request_tracker_load_test.rs
@@ -0,0 +1,145 @@
+// examples/request_tracker_load_test.rs
+
+use std::{sync::Arc, time::Duration};
+use tokio;
+use pageserver_client_grpc::request_tracker::RequestTracker;
+use pageserver_client_grpc::request_tracker::MockStreamFactory;
+use pageserver_client_grpc::request_tracker::StreamReturner;
+use pageserver_client_grpc::client_cache::ConnectionPool;
+use pageserver_client_grpc::client_cache::PooledItemFactory;
+use pageserver_client_grpc::ClientCacheOptions;
+use pageserver_client_grpc::PageserverClientAggregateMetrics;
+use pageserver_client_grpc::AuthInterceptor;
+
+use pageserver_client_grpc::client_cache::ChannelFactory;
+
+use tonic::transport::Channel;
+
+use rand::prelude::*;
+
+use pageserver_api::key::Key;
+
+use utils::lsn::Lsn;
+use utils::shard::ShardIndex;
+
+use futures::stream::FuturesOrdered;
+use futures::StreamExt;
+
+use pageserver_page_api::proto;
+
+/// Load test: pushes NUM_REQUESTS random getpage requests through a RequestTracker fed by a mock stream pool.
+#[tokio::main]
+async fn main() {
+    // Number of getpage requests to issue; the progress messages below print this same value.
+    const NUM_REQUESTS: usize = 500_000;
+
+    // 1) configure the client-pool behavior
+    let client_cache_options = ClientCacheOptions {
+        max_delay_ms:       0,
+        drop_rate:          0.0,
+        hang_rate:          0.0,
+        connect_timeout:    Duration::from_secs(10),
+        connect_backoff:    Duration::from_millis(200),
+        max_consumers:      64,
+        error_threshold:    10,
+        max_idle_duration:  Duration::from_secs(60),
+        max_total_connections: 12,
+    };
+
+    // 2) metrics collector, shared by both pools below
+    let metrics = Arc::new(PageserverClientAggregateMetrics::new());
+    let pool = ConnectionPool::<StreamReturner>::new(
+        Arc::new(MockStreamFactory::new(
+        )),
+        client_cache_options.connect_timeout,
+        client_cache_options.connect_backoff,
+        client_cache_options.max_consumers,
+        client_cache_options.error_threshold,
+        client_cache_options.max_idle_duration,
+        client_cache_options.max_total_connections,
+        Some(Arc::clone(&metrics)),
+    );
+
+    // -----------
+    // There is no mock for the unary connection pool, so for now just
+    // don't use this pool
+    //
+    let channel_fact : Arc<dyn PooledItemFactory<Channel> + Send + Sync> = Arc::new(ChannelFactory::new(
+        "".to_string(),
+        client_cache_options.max_delay_ms,
+        client_cache_options.drop_rate,
+        client_cache_options.hang_rate,
+    ));
+    let unary_pool: Arc<ConnectionPool<Channel>> = ConnectionPool::new(
+        Arc::clone(&channel_fact),
+        client_cache_options.connect_timeout,
+        client_cache_options.connect_backoff,
+        client_cache_options.max_consumers,
+        client_cache_options.error_threshold,
+        client_cache_options.max_idle_duration,
+        client_cache_options.max_total_connections,
+        Some(Arc::clone(&metrics)),
+    );
+
+    // -----------
+    // Dummy auth interceptor. This is not used in this test.
+    let auth_interceptor = AuthInterceptor::new("dummy_tenant_id",
+                                                "dummy_timeline_id",
+                                                None);
+    let tracker = RequestTracker::new(
+        pool,
+        unary_pool,
+        auth_interceptor,
+        ShardIndex::unsharded(),
+    );
+
+    // 3) build the request futures; they are lazy and only run once `handles` is polled below
+    let mut handles = FuturesOrdered::new();
+    for _i in 0..NUM_REQUESTS {
+        let mut rng = rand::thread_rng();
+        let r = 0..=1000000i128;
+        let key: i128 = rng.gen_range(r.clone());
+        let key = Key::from_i128(key);
+        // NOTE(review): nothing above filters keys, so this panics if the key is not a rel-block key - confirm.
+        let (rel_tag, block_no) = key
+            .to_rel_block()
+            .expect("we filter non-rel-block keys out above");
+
+        let req2 = proto::GetPageRequest {
+            request_id: 0,
+            request_class: proto::GetPageClass::Normal as i32,
+            read_lsn: Some(proto::ReadLsn {
+                request_lsn: if rng.gen_bool(0.5) {
+                    u64::from(Lsn::MAX)
+                } else {
+                    10000
+                },
+                not_modified_since_lsn: 10000,
+            }),
+            rel: Some(rel_tag.into()),
+            block_number: vec![block_no],
+        };
+        let req_model = pageserver_page_api::GetPageRequest::try_from(req2.clone());
+
+        // RequestTracker is Clone, so we can share it
+        let mut tr = tracker.clone();
+        let fut = async move {
+            let resp = tr.send_getpage_request(req_model.unwrap()).await.unwrap();
+            // sanity-check: the response must carry a non-zero request_id (we sent 0 above)
+            assert!(resp.request_id > 0);
+        };
+        handles.push_back(fut);
+    }
+
+    // print timestamp
+    println!("Starting {} requests at: {}", NUM_REQUESTS, chrono::Utc::now());
+    // 4) wait for them all
+    for _i in 0..NUM_REQUESTS {
+        handles.next().await.expect("Failed to get next handle");
+    }
+
+    // print timestamp
+    println!("Finished {} requests at: {}", NUM_REQUESTS, chrono::Utc::now());
+
+    println!("✅ All {} requests completed successfully", NUM_REQUESTS);
+}
--- a/pageserver/client_grpc/src/client_cache.rs
+++ b/pageserver/client_grpc/src/client_cache.rs
@@ -0,0 +1,741 @@
+use std::{
+    collections::HashMap,
+    io::{self, Error, ErrorKind},
+    sync::Arc,
+    time::{Duration, Instant},
+};
+
+use priority_queue::PriorityQueue;
+
+use tokio::{
+    io::{AsyncRead, AsyncWrite, ReadBuf},
+    net::TcpStream,
+    sync::{Mutex, OwnedSemaphorePermit, Semaphore},
+    time::sleep,
+};
+use tonic::transport::{Channel, Endpoint};
+
+use uuid;
+
+use std::{
+    pin::Pin,
+    task::{Context, Poll},
+};
+
+use futures::future;
+use rand::{Rng, SeedableRng, rngs::StdRng};
+
+use bytes::BytesMut;
+use http::Uri;
+use hyper_util::rt::TokioIo;
+use tower::service_fn;
+
+use tokio_util::sync::CancellationToken;
+use async_trait::async_trait;
+
+//
+// The "TokioTcp" is flakey TCP network for testing purposes, in order
+// to simulate network errors and delays.
+//
+
+/// Wraps a `TcpStream`, buffers incoming data, and injects a random delay per fresh read/write.
+pub struct TokioTcp {
+    tcp: TcpStream,
+    /// Maximum randomized delay in milliseconds
+    delay_ms: u64,
+
+    /// Next deadline instant for delay
+    deadline: Instant,
+    /// Internal buffer of previously-read data
+    buffer: BytesMut,
+}
+
+impl TokioTcp {
+    /// Wrap `stream`; the first delay is drawn from 0..=`delay_ms` ms, and `delay_ms == 0` means no delay.
+    pub fn new(stream: TcpStream, delay_ms: u64) -> Self {
+        let initial = if delay_ms > 0 {
+            rand::thread_rng().gen_range(0..=delay_ms)
+        } else {
+            0
+        };
+        let deadline = Instant::now() + Duration::from_millis(initial);
+        TokioTcp {
+            tcp: stream,
+            delay_ms,
+            deadline,
+            buffer: BytesMut::new(),
+        }
+    }
+}
+
+impl AsyncRead for TokioTcp {
+    fn poll_read(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &mut ReadBuf<'_>,
+    ) -> Poll<io::Result<()>> {
+        // Safe because TokioTcp is Unpin
+        let this = self.get_mut();
+
+        // 1) Drain any buffered data
+        if !this.buffer.is_empty() {
+            let to_copy = this.buffer.len().min(buf.remaining());
+            buf.put_slice(&this.buffer.split_to(to_copy));
+            return Poll::Ready(Ok(()));
+        }
+
+        // 2) If we're still before the deadline, schedule a wake and return Pending
+        let now = Instant::now();
+        if this.delay_ms > 0 && now < this.deadline {
+            let waker = cx.waker().clone();
+            let wait = this.deadline - now;
+            tokio::spawn(async move {
+                sleep(wait).await;
+                waker.wake_by_ref();
+            });
+            return Poll::Pending;
+        }
+
+        // 3) Past deadline: compute next random deadline
+        if this.delay_ms > 0 {
+            let next_ms = rand::thread_rng().gen_range(0..=this.delay_ms);
+            this.deadline = Instant::now() + Duration::from_millis(next_ms);
+        }
+
+        // 4) Perform actual read into a temporary buffer
+        let mut tmp = [0u8; 4096];
+        let mut rb = ReadBuf::new(&mut tmp);
+        match Pin::new(&mut this.tcp).poll_read(cx, &mut rb) {
+            Poll::Pending => Poll::Pending,
+            Poll::Ready(Ok(())) => {
+                let filled = rb.filled();
+                if filled.is_empty() {
+                    // EOF or zero bytes
+                    Poll::Ready(Ok(()))
+                } else {
+                    this.buffer.extend_from_slice(filled);
+                    let to_copy = this.buffer.len().min(buf.remaining());
+                    buf.put_slice(&this.buffer.split_to(to_copy));
+                    Poll::Ready(Ok(()))
+                }
+            }
+            Poll::Ready(Err(e)) => Poll::Ready(Err(e)),
+        }
+    }
+}
+
+impl AsyncWrite for TokioTcp {
+    fn poll_write(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        data: &[u8],
+    ) -> Poll<io::Result<usize>> {
+        let this = self.get_mut();
+
+        // 1) If before deadline, schedule wake and return Pending
+        let now = Instant::now();
+        if this.delay_ms > 0 && now < this.deadline {
+            let waker = cx.waker().clone();
+            let wait = this.deadline - now;
+            tokio::spawn(async move {
+                sleep(wait).await;
+                waker.wake_by_ref();
+            });
+            return Poll::Pending;
+        }
+
+        // 2) Past deadline: compute next random deadline
+        if this.delay_ms > 0 {
+            let next_ms = rand::thread_rng().gen_range(0..=this.delay_ms);
+            this.deadline = Instant::now() + Duration::from_millis(next_ms);
+        }
+
+        // 3) Actual write
+        Pin::new(&mut this.tcp).poll_write(cx, data)
+    }
+
+    fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
+        let this = self.get_mut();
+        Pin::new(&mut this.tcp).poll_flush(cx)
+    }
+
+    fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
+        let this = self.get_mut();
+        Pin::new(&mut this.tcp).poll_shutdown(cx)
+    }
+}
+
+#[async_trait]
+pub trait PooledItemFactory<T>: Send + Sync + 'static {
+    /// Create a new pooled item.
+    async fn create(&self, connect_timeout: Duration) ->  Result<Result<T, tonic::Status>, tokio::time::error::Elapsed>;
+}
+
+/// Builds tonic `Channel`s to one endpoint, with optional fault injection
+/// (write delays, simulated connect failures and hangs) for testing.
+pub struct ChannelFactory {
+    /// Endpoint URI that every created channel connects to.
+    endpoint: String,
+    /// Passed to `TokioTcp::new` for each new connection; 0 disables delays.
+    max_delay_ms: u64,
+    /// Probability that a connect attempt fails with a simulated error.
+    drop_rate: f64,
+    /// Probability that a connect attempt never completes.
+    hang_rate: f64,
+}
+
+impl ChannelFactory {
+    /// Bundle the endpoint and fault-injection settings into a factory.
+    pub fn new(
+        endpoint: String,
+        max_delay_ms: u64,
+        drop_rate: f64,
+        hang_rate: f64,
+    ) -> Self {
+        Self {
+            endpoint,
+            max_delay_ms,
+            drop_rate,
+            hang_rate,
+        }
+    }
+}
+
+#[async_trait]
+impl PooledItemFactory<Channel> for ChannelFactory {
+    /// Open a new gRPC channel to `self.endpoint`.
+    ///
+    /// Returns:
+    /// - `Ok(Ok(channel))` when the connection was established,
+    /// - `Ok(Err(status))` when the endpoint is invalid or the connect failed,
+    /// - `Err(elapsed)` when the attempt did not finish within `connect_timeout`.
+    ///
+    /// The connection goes through a custom connector that can inject hangs,
+    /// connect errors and write delays according to `hang_rate`, `drop_rate`
+    /// and `max_delay_ms`. With all three at zero it is a plain TCP connect.
+    async fn create(&self, connect_timeout: Duration) -> Result<Result<Channel, tonic::Status>, tokio::time::error::Elapsed> {
+        // Report a malformed endpoint as an error instead of panicking: this
+        // runs inside a spawned task, where a panic would silently kill the
+        // connection attempt and leave waiters blocked.
+        let endpoint = match Endpoint::from_shared(self.endpoint.clone()) {
+            Ok(endpoint) => endpoint,
+            Err(e) => {
+                return Ok(Err(tonic::Status::new(
+                    tonic::Code::InvalidArgument,
+                    format!("invalid endpoint {:?}: {}", self.endpoint, e),
+                )));
+            }
+        };
+
+        let max_delay_ms = self.max_delay_ms;
+        let drop_rate = self.drop_rate;
+        let hang_rate = self.hang_rate;
+
+        // This is a custom connector that inserts delays and errors, for
+        // testing purposes. It would normally be disabled by the config.
+        let connector = service_fn(move |uri: Uri| async move {
+            let mut rng = StdRng::from_entropy();
+
+            // Simulate an indefinite hang. `gen_bool` panics on p > 1.0, so
+            // cap the probability; the `> 0.0` guard already excludes NaN.
+            if hang_rate > 0.0 && rng.gen_bool(hang_rate.min(1.0)) {
+                // never completes, to test timeout
+                return future::pending::<Result<TokioIo<TokioTcp>, std::io::Error>>().await;
+            }
+
+            // Random drop (connect error)
+            if drop_rate > 0.0 && rng.gen_bool(drop_rate.min(1.0)) {
+                return Err(std::io::Error::new(
+                    std::io::ErrorKind::Other,
+                    "simulated connect drop",
+                ));
+            }
+
+            // Otherwise perform a real TCP connect.
+            let Some(host) = uri.host() else {
+                return Err(std::io::Error::new(
+                    std::io::ErrorKind::InvalidInput,
+                    "no host or port",
+                ));
+            };
+            // A socket address always needs a port. When the URI has none,
+            // fall back to the scheme's default instead of passing a bare
+            // host name, which can never resolve.
+            let port = uri.port_u16().unwrap_or_else(|| match uri.scheme_str() {
+                Some("https") => 443,
+                _ => 80,
+            });
+            let addr = format!("{}:{}", host, port);
+
+            let tcp = TcpStream::connect(addr).await?;
+            Ok(TokioIo::new(TokioTcp::new(tcp, max_delay_ms)))
+        });
+
+        let attempt = tokio::time::timeout(
+            connect_timeout,
+            endpoint
+                .timeout(connect_timeout)
+                .connect_with_connector(connector),
+        )
+        .await;
+
+        // Outer error (timeout) is passed through unchanged; a transport
+        // error is converted to an `Unavailable` status.
+        attempt.map(|connected| {
+            connected.map_err(|e| {
+                tonic::Status::new(
+                    tonic::Code::Unavailable,
+                    format!("Failed to connect: {}", e),
+                )
+            })
+        })
+    }
+}
+
+
+/// A pooled gRPC client with capacity tracking and error handling.
+///
+/// Each connection may be shared by up to `max_consumers` concurrent
+/// consumers. Capacity is tracked with `channel_semaphore`: it starts with
+/// zero permits and gains `max_consumers` permits for every connection that
+/// is successfully created.
+pub struct ConnectionPool<T> {
+    /// Mutable pool state, guarded by an async mutex.
+    inner: Mutex<Inner<T>>,
+
+    /// Factory used by `create_connection` to open new connections.
+    fact: Arc<dyn PooledItemFactory<T> + Send + Sync>,
+
+    /// Timeout passed to the factory for each connect attempt.
+    connect_timeout: Duration,
+    /// Upper bound of the random initial backoff used by `create_connection`.
+    connect_backoff: Duration,
+    /// The maximum number of consumers that can use a single connection.
+    max_consumers: usize,
+    /// The number of consecutive errors before a connection is removed from the pool.
+    error_threshold: usize,
+    /// The maximum duration a connection can be idle before being removed.
+    max_idle_duration: Duration,
+    /// Cap on established connections plus connection attempts in progress.
+    max_total_connections: usize,
+
+    /// One permit per free consumer slot across all connections.
+    channel_semaphore: Arc<Semaphore>,
+
+    /// Cancelled by `shutdown`; stops the idle sweeper and connect retries.
+    shutdown_token: CancellationToken,
+    /// Optional counters recording pool events (attempts, failures, sweeps, ...).
+    aggregate_metrics: Option<Arc<crate::PageserverClientAggregateMetrics>>,
+}
+
+/// Mutable state of a `ConnectionPool`, kept behind its mutex.
+struct Inner<T> {
+    /// All live connections, keyed by a random id.
+    entries: HashMap<uuid::Uuid, ConnectionEntry<T>>,
+    /// Connections that can still accept consumers, prioritized by their
+    /// current number of active consumers.
+    pq: PriorityQueue<uuid::Uuid, usize>,
+    // This is updated when a connection is dropped, or we fail
+    // to create a new connection.
+    last_connect_failure: Option<Instant>,
+    /// Number of `get_client` callers currently waiting for a permit.
+    waiters: usize,
+    /// Number of connection attempts that were spawned and have not finished.
+    in_progress: usize,
+}
+/// Bookkeeping for one pooled connection.
+struct ConnectionEntry<T> {
+    /// The pooled item; cloned for every consumer that borrows it.
+    channel: T,
+    /// Number of consumers currently using this connection.
+    active_consumers: usize,
+    /// Failed requests since the last success; compared with `error_threshold`.
+    consecutive_errors: usize,
+    /// Last time the connection was handed out or returned.
+    last_used: Instant,
+}
+
+/// A client borrowed from the pool.
+///
+/// Call `finish` when done so the pool can update its accounting for the
+/// underlying connection.
+pub struct PooledClient<T> {
+    /// Clone of the pooled connection's channel.
+    pub channel: T,
+    /// The pool this client was borrowed from.
+    pool: Arc<ConnectionPool<T>>,
+    /// Outcome of the request; set by `finish`.
+    is_ok: bool,
+    /// Id of the pool entry this client belongs to.
+    id: uuid::Uuid,
+    /// Semaphore permit representing the consumer slot held by this client.
+    permit: OwnedSemaphorePermit,
+}
+
+impl<T: Clone + Send + 'static> ConnectionPool<T> {
+    /// Build a pool and start its background idle-connection sweeper.
+    ///
+    /// The pool starts empty; connections are created on demand by
+    /// `get_client`. The sweeper task runs `sweep_idle_connections` and then
+    /// sleeps for five seconds, repeating until `shutdown` cancels the pool's
+    /// token. Must be called from within a tokio runtime, since it spawns a task.
+    pub fn new(
+        fact: Arc<dyn PooledItemFactory<T> + Send + Sync>,
+        connect_timeout: Duration,
+        connect_backoff: Duration,
+        max_consumers: usize,
+        error_threshold: usize,
+        max_idle_duration: Duration,
+        max_total_connections: usize,
+        aggregate_metrics: Option<Arc<crate::PageserverClientAggregateMetrics>>,
+    ) -> Arc<Self> {
+        let shutdown_token = CancellationToken::new();
+        let sweeper_token = shutdown_token.clone();
+
+        let empty_state = Inner::<T> {
+            entries: HashMap::new(),
+            pq: PriorityQueue::new(),
+            last_connect_failure: None,
+            waiters: 0,
+            in_progress: 0,
+        };
+
+        let pool = Arc::new(Self {
+            inner: Mutex::new(empty_state),
+            fact,
+            connect_timeout,
+            connect_backoff,
+            max_consumers,
+            error_threshold,
+            max_idle_duration,
+            max_total_connections,
+            // No permits until the first connection has been created.
+            channel_semaphore: Arc::new(Semaphore::new(0)),
+            shutdown_token,
+            aggregate_metrics,
+        });
+
+        // Cancelable background task to sweep idle connections
+        let sweeper_pool = Arc::clone(&pool);
+        tokio::spawn(async move {
+            let sweep_forever = async {
+                loop {
+                    sweeper_pool.sweep_idle_connections().await;
+                    sleep(Duration::from_secs(5)).await;
+                }
+            };
+            tokio::select! {
+                _ = sweeper_token.cancelled() => {}
+                _ = sweep_forever => {}
+            }
+        });
+
+        pool
+    }
+
+    /// Shut the pool down: stop background work, wait for all borrowed
+    /// clients to be returned, then drop every connection.
+    ///
+    /// Cancels the shutdown token (which stops the sweeper and connection
+    /// retries and makes `get_client` refuse new requests), then polls every
+    /// 100 ms until no entry has active consumers.
+    pub async fn shutdown(self: Arc<Self>) {
+        self.shutdown_token.cancel();
+
+        loop {
+            {
+                let mut inner = self.inner.lock().await;
+                if inner.entries.values().all(|e| e.active_consumers == 0) {
+                    // Clear both structures under the same lock that verified
+                    // idleness. Leaving ids in `pq` after emptying `entries`
+                    // would trip the "pq and entries got out of sync" check
+                    // in `get_conn_with_permit` for any racing caller.
+                    inner.entries.clear();
+                    inner.pq.clear();
+                    return;
+                }
+            }
+            sleep(Duration::from_millis(100)).await;
+        }
+    }
+
+    /// Remove connections that have no active consumers and have been unused
+    /// for longer than `max_idle_duration`.
+    ///
+    /// Each removed connection is dropped from both `entries` and the
+    /// priority queue, and counted under the `connection_swept` metric label.
+    /// Semaphore permits are not touched here; a permit that no longer maps
+    /// to a connection is forgotten later by `get_conn_with_permit`.
+    async fn sweep_idle_connections(self: &Arc<Self>) {
+        let now = Instant::now();
+        let mut inner = self.inner.lock().await;
+
+        // Pick the victims first, then remove them from both structures.
+        let expired: Vec<uuid::Uuid> = inner
+            .entries
+            .iter()
+            .filter(|(_, entry)| {
+                entry.active_consumers == 0
+                    && now.duration_since(entry.last_used) > self.max_idle_duration
+            })
+            .map(|(id, _)| *id)
+            .collect();
+
+        for id in expired {
+            if let Some(ref metrics) = self.aggregate_metrics {
+                metrics
+                    .retry_counters
+                    .with_label_values(&["connection_swept"])
+                    .inc();
+            }
+            inner.entries.remove(&id);
+            inner.pq.remove(&id);
+        }
+    }
+
+    // Given an already-acquired permit, take a connection out of the heap.
+    //
+    // Returns `None` (after forgetting the permit) when the heap is empty.
+    async fn get_conn_with_permit(
+        self: Arc<Self>,
+        permit: OwnedSemaphorePermit,
+    ) -> Option<PooledClient<T>> {
+        let mut inner = self.inner.lock().await;
+
+        // Pop the connection with the most active consumers. The heap never
+        // holds a connection that is already at max_consumers.
+        let Some((id, _priority)) = inner.pq.pop() else {
+            // Nothing to hand out: the permit belongs to a connection that
+            // was removed (e.g. for too many errors) and whose permits are
+            // still draining. Forget this one so the caller can retry.
+            permit.forget();
+            return None;
+        };
+
+        let entry = inner
+            .entries
+            .get_mut(&id)
+            .expect("pq and entries got out of sync");
+        entry.active_consumers += 1;
+        entry.last_used = Instant::now();
+
+        let consumers_now = entry.active_consumers;
+        let channel = entry.channel.clone();
+
+        // Put the connection back with its new priority, unless it is full.
+        if consumers_now < self.max_consumers {
+            inner.pq.push(id, consumers_now);
+        }
+
+        Some(PooledClient::<T> {
+            channel,
+            pool: Arc::clone(&self),
+            is_ok: true,
+            id,
+            permit,
+        })
+    }
+
+    /// Borrow a client from the pool, creating a connection if needed.
+    ///
+    /// Fast path: take a free permit and the matching connection. Otherwise
+    /// the caller registers as a waiter, possibly triggers creation of a new
+    /// connection, and waits for a permit to become available.
+    ///
+    /// Returns `Status::unavailable` when the pool is shut down, including
+    /// when shutdown happens while this call is waiting for a permit.
+    pub async fn get_client(self: Arc<Self>) -> Result<PooledClient<T>, tonic::Status> {
+        // A loop is necessary because a permit may belong to a connection
+        // that is draining; such a permit is forgotten and we retry.
+        loop {
+            // The pool is shutting down. Don't accept new connections.
+            if self.shutdown_token.is_cancelled() {
+                return Err(tonic::Status::unavailable("Pool is shutting down"));
+            }
+
+            let permit = match Arc::clone(&self.channel_semaphore).try_acquire_owned() {
+                Ok(permit) => permit,
+                Err(_) => {
+                    if let Some(ref metrics) = self.aggregate_metrics {
+                        metrics
+                            .retry_counters
+                            .with_label_values(&["sema_acquire_failed"])
+                            .inc();
+                    }
+
+                    {
+                        //
+                        // This is going to generate enough connections to handle a burst,
+                        // but it may generate up to twice the number of connections needed
+                        // in the worst case. Extra connections will go idle and be cleaned
+                        // up.
+                        //
+                        let mut inner = self.inner.lock().await;
+                        inner.waiters += 1;
+                        let need_more = inner.waiters > inner.in_progress * self.max_consumers;
+                        let below_cap =
+                            inner.entries.len() + inner.in_progress < self.max_total_connections;
+                        if need_more && below_cap {
+                            let pool = Arc::clone(&self);
+                            tokio::task::spawn(async move {
+                                pool.create_connection().await;
+                            });
+                            inner.in_progress += 1;
+                        }
+                    }
+
+                    // Wait for a connection to become available, either because it
+                    // was created or because a connection was returned to the pool
+                    // by another consumer. Also watch for shutdown: connection
+                    // attempts stop on shutdown, so without this a waiter could
+                    // block forever.
+                    let acquired = tokio::select! {
+                        res = Arc::clone(&self.channel_semaphore).acquire_owned() => Some(res),
+                        _ = self.shutdown_token.cancelled() => None,
+                    };
+                    {
+                        let mut inner = self.inner.lock().await;
+                        inner.waiters -= 1;
+                    }
+                    match acquired {
+                        Some(Ok(permit)) => permit,
+                        Some(Err(_)) => {
+                            return Err(tonic::Status::unavailable(
+                                "Connection pool semaphore closed",
+                            ));
+                        }
+                        None => {
+                            return Err(tonic::Status::unavailable("Pool is shutting down"));
+                        }
+                    }
+                }
+            };
+
+            // We got a permit, check the heap for a connection. If none is
+            // found the permit has been forgotten and we go around again.
+            if let Some(client) = Arc::clone(&self).get_conn_with_permit(permit).await {
+                return Ok(client);
+            }
+        }
+    }
+
+    /// Keep trying to open one new connection until it succeeds or the pool
+    /// is shut down.
+    ///
+    /// On success the connection is added to `entries` and the heap,
+    /// `in_progress` is decremented and `max_consumers` permits are added to
+    /// the semaphore. On failure `last_connect_failure` is recorded and the
+    /// attempt is retried after a jittered, growing backoff capped at one minute.
+    async fn create_connection(&self) -> () {
+        // A backoff of zero can never grow by adding jitter proportional to
+        // itself, which would turn repeated failures into a tight retry loop.
+        // Use a 1 ms floor unless backoff was disabled by configuration.
+        let backoff_floor_ms: u64 = if self.connect_backoff.is_zero() { 0 } else { 1 };
+
+        // Generate a random backoff to add some jitter so that connections
+        // don't all retry at the same time.
+        let mut backoff_delay = Duration::from_millis(
+            rand::thread_rng().gen_range(0..=self.connect_backoff.as_millis() as u64),
+        );
+
+        loop {
+            if self.shutdown_token.is_cancelled() {
+                // This attempt is abandoned; release its in-progress slot.
+                let mut inner = self.inner.lock().await;
+                inner.in_progress -= 1;
+                return;
+            }
+
+            // Back off.
+            // Loop because failure can occur while we are sleeping, so wait
+            // until the failure stopped for at least one backoff period. Backoff
+            // period includes some jitter, so that if multiple connections are
+            // failing, they don't all retry at the same time.
+            loop {
+                let remaining = {
+                    let inner = self.inner.lock().await;
+                    inner
+                        .last_connect_failure
+                        // `checked_sub` reads the clock once and yields `None`
+                        // when the backoff has already elapsed; a plain `-`
+                        // on `Duration` panics on underflow.
+                        .and_then(|at| backoff_delay.checked_sub(at.elapsed()))
+                        .filter(|left| !left.is_zero())
+                };
+                match remaining {
+                    Some(delay) => sleep(delay).await,
+                    None => break, // No delay, so we can create a connection
+                }
+            }
+
+            //
+            // Create a new connection.
+            //
+            // The connect timeout is also the timeout for an individual gRPC request
+            // on this connection. (Requests made later on this channel will time out
+            // with the same timeout.)
+            //
+            if let Some(ref metrics) = self.aggregate_metrics {
+                metrics
+                    .retry_counters
+                    .with_label_values(&["connection_attempt"])
+                    .inc();
+            }
+
+            match self.fact.create(self.connect_timeout).await {
+                // Connection succeeded
+                Ok(Ok(channel)) => {
+                    if let Some(ref metrics) = self.aggregate_metrics {
+                        metrics
+                            .retry_counters
+                            .with_label_values(&["connection_success"])
+                            .inc();
+                    }
+                    let mut inner = self.inner.lock().await;
+                    let id = uuid::Uuid::new_v4();
+                    inner.entries.insert(
+                        id,
+                        ConnectionEntry::<T> {
+                            channel,
+                            active_consumers: 0,
+                            consecutive_errors: 0,
+                            last_used: Instant::now(),
+                        },
+                    );
+                    inner.pq.push(id, 0);
+                    inner.in_progress -= 1;
+                    // Publish the new capacity to waiters.
+                    self.channel_semaphore.add_permits(self.max_consumers);
+                    return;
+                }
+                // Connection failed (error or timeout), back off and retry
+                Ok(Err(_)) | Err(_) => {
+                    if let Some(ref metrics) = self.aggregate_metrics {
+                        metrics
+                            .retry_counters
+                            .with_label_values(&["connect_failed"])
+                            .inc();
+                    }
+                    {
+                        let mut inner = self.inner.lock().await;
+                        inner.last_connect_failure = Some(Instant::now());
+                    }
+                    // Grow the backoff by a random amount up to its current
+                    // value, so that every connection doesn't retry at once.
+                    let base_ms = (backoff_delay.as_millis() as u64).max(backoff_floor_ms);
+                    let jitter = rand::thread_rng().gen_range(0..=base_ms);
+                    // Do not backoff longer than one minute
+                    backoff_delay =
+                        Duration::from_millis(base_ms + jitter).min(Duration::from_secs(60));
+                    // continue the loop to retry
+                }
+            }
+        }
+    }
+
+    /// Return client to the pool, indicating success or error.
+    ///
+    /// Updates the connection's consumer count and error streak. When the
+    /// streak reaches `error_threshold` the connection is removed from the
+    /// pool immediately and `permit` is forgotten; otherwise the permit is
+    /// released when it is dropped and the connection's heap priority is
+    /// refreshed. An `id` that is no longer in the pool is ignored.
+    ///
+    /// Panics if the connection has no active consumers.
+    pub async fn return_client(&self, id: uuid::Uuid, success: bool, permit: OwnedSemaphorePermit) {
+        let mut inner = self.inner.lock().await;
+        let Some(entry) = inner.entries.get_mut(&id) else {
+            // The connection was already removed; dropping `permit` here
+            // releases it back to the semaphore.
+            return;
+        };
+
+        entry.last_used = Instant::now();
+        if entry.active_consumers == 0 {
+            panic!("A consumer completed when active_consumers was zero!")
+        }
+        entry.active_consumers -= 1;
+
+        if !success {
+            entry.consecutive_errors += 1;
+            // Count the drop exactly once, when the threshold is first hit.
+            if entry.consecutive_errors == self.error_threshold {
+                if let Some(ref metrics) = self.aggregate_metrics {
+                    metrics
+                        .retry_counters
+                        .with_label_values(&["connection_dropped"])
+                        .inc();
+                }
+            }
+        } else if entry.consecutive_errors < self.error_threshold {
+            // A success clears the streak, unless the threshold was reached.
+            entry.consecutive_errors = 0;
+        }
+
+        let remaining_consumers = entry.active_consumers;
+        let too_many_errors = entry.consecutive_errors >= self.error_threshold;
+
+        if too_many_errors {
+            // Too many errors: take the connection out of the heap and the
+            // table so it is never selected again. Clients that still hold a
+            // clone of the channel keep using it until they finish.
+            inner.pq.remove(&id);
+            inner.entries.remove(&id);
+            inner.last_connect_failure = Some(Instant::now());
+
+            // Permits of a removed connection are drained lazily when a
+            // lookup finds no connection; forgetting this one right away
+            // speeds that up.
+            permit.forget();
+        } else if inner.pq.get_priority(&id).is_some() {
+            // Still in the heap: just refresh its priority.
+            inner.pq.change_priority(&id, remaining_consumers);
+        } else if remaining_consumers < self.max_consumers {
+            // It was full and left the heap; it has room again, so put it back.
+            inner.pq.push(id, remaining_consumers);
+        }
+    }
+}
+
+impl<T: Clone + Send + 'static> PooledClient<T> {
+    /// Returns a clone of the borrowed channel.
+    pub fn channel(&self) -> T {
+        self.channel.clone()
+    }
+
+    /// Give the client back to its pool, reporting whether the request
+    /// made with it succeeded (`Ok`) or failed (`Err`).
+    pub async fn finish(mut self, result: Result<(), tonic::Status>) {
+        let succeeded = result.is_ok();
+        self.is_ok = succeeded;
+
+        let Self { pool, id, permit, .. } = self;
+        pool.return_client(id, succeeded, permit).await;
+    }
+}
--- a/pageserver/client_grpc/src/lib.rs
+++ b/pageserver/client_grpc/src/lib.rs
@@ -0,0 +1,451 @@
+//! Pageserver Data API client
+//!
+//! - Manage connections to pageserver
+//! - Send requests to correct shards
+//!
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::sync::RwLock;
+use std::time::Duration;
+
+use bytes::Bytes;
+use futures::{Stream, StreamExt};
+use thiserror::Error;
+use tonic::metadata::AsciiMetadataValue;
+
+use pageserver_page_api::proto;
+use pageserver_page_api::*;
+
+use pageserver_page_api::proto::PageServiceClient;
+use utils::shard::ShardIndex;
+
+use std::fmt::Debug;
+pub mod client_cache;
+pub mod request_tracker;
+use tonic::transport::Channel;
+
+use metrics::{IntCounterVec, core::Collector};
+
+/// Errors returned by `PageserverClient`.
+#[derive(Error, Debug)]
+pub enum PageserverClientError {
+    /// The transport could not be established.
+    #[error("could not connect to service: {0}")]
+    ConnectError(#[from] tonic::transport::Error),
+    /// The gRPC request failed with the contained status.
+    #[error("could not perform request: {0}")]
+    RequestError(#[from] tonic::Status),
+    /// A message could not be converted to or from its protocol form.
+    #[error("protocol error: {0}")]
+    ProtocolError(#[from] ProtocolError),
+
+    /// A URI could not be parsed.
+    #[error("invalid URI: {0}")]
+    InvalidUri(#[from] http::uri::InvalidUri),
+
+    /// Any other failure, described by the contained message.
+    #[error("could not perform request: {0}")]
+    Other(String),
+}
+
+/// Counters aggregated over all requests made through a `PageserverClient`.
+#[derive(Clone, Debug)]
+pub struct PageserverClientAggregateMetrics {
+    /// Requests, labelled by `request_kind`.
+    pub request_counters: IntCounterVec,
+    /// Retry and connection-pool events, labelled by `request_kind`.
+    pub retry_counters: IntCounterVec,
+}
+impl PageserverClientAggregateMetrics {
+    /// Create both counter vectors. They are not registered anywhere here;
+    /// use `collect` to read them.
+    pub fn new() -> Self {
+        let request_opts = metrics::core::Opts::new(
+            "backend_requests_total",
+            "Number of requests from backends.",
+        );
+        let request_counters = IntCounterVec::new(request_opts, &["request_kind"]).unwrap();
+
+        let retry_opts = metrics::core::Opts::new(
+            "backend_requests_retries_total",
+            "Number of retried requests from backends.",
+        );
+        let retry_counters = IntCounterVec::new(retry_opts, &["request_kind"]).unwrap();
+
+        Self {
+            request_counters,
+            retry_counters,
+        }
+    }
+
+    /// Gather the metric families of both counters, requests first.
+    pub fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
+        let mut families = self.request_counters.collect();
+        families.extend(self.retry_counters.collect());
+        families
+    }
+}
+
+/// Client for the pageserver gRPC page service.
+pub struct PageserverClient {
+    // Kept for reference; the underscore prefix marks them as currently unused.
+    _tenant_id: String,
+    _timeline_id: String,
+
+    _auth_token: Option<String>,
+
+    /// Per-shard string from the caller's shard map (presumably the shard's
+    /// endpoint URL — TODO confirm against `get_client`).
+    shard_map: HashMap<ShardIndex, String>,
+
+    /// Connection pool per shard.
+    channels: RwLock<HashMap<ShardIndex, Arc<client_cache::ConnectionPool<Channel>>>>,
+
+    /// Interceptor built from tenant id, timeline id and auth token;
+    /// specialized per shard with `for_shard` on every request.
+    auth_interceptor: AuthInterceptor,
+
+    /// Settings for the connection pools.
+    client_cache_options: ClientCacheOptions,
+
+    /// Optional request/retry counters.
+    aggregate_metrics: Option<Arc<PageserverClientAggregateMetrics>>,
+}
+/// Tunables for the per-shard connection pools and their fault injection.
+/// The field names match the parameters of `ConnectionPool::new` and
+/// `ChannelFactory::new` in `client_cache`.
+#[derive(Clone)]
+pub struct ClientCacheOptions {
+    /// Maximum number of concurrent consumers per connection.
+    pub max_consumers: usize,
+    /// Consecutive errors after which a connection is removed.
+    pub error_threshold: usize,
+    /// Timeout for establishing a connection.
+    pub connect_timeout: Duration,
+    /// Upper bound of the initial random backoff before connecting.
+    pub connect_backoff: Duration,
+    /// Idle time after which an unused connection is removed.
+    pub max_idle_duration: Duration,
+    /// Maximum number of connections (established plus in progress).
+    pub max_total_connections: usize,
+    /// Testing: maximum injected write delay in milliseconds; 0 disables it.
+    pub max_delay_ms: u64,
+    /// Testing: probability of a simulated connect failure.
+    pub drop_rate: f64,
+    /// Testing: probability of a simulated connect hang.
+    pub hang_rate: f64,
+}
+
+impl PageserverClient {
+    /// Create a client with the default pool settings, no fault injection
+    /// and no metrics.
+    ///
+    /// TODO: this doesn't currently react to changes in the shard map.
+    pub fn new(
+        tenant_id: &str,
+        timeline_id: &str,
+        auth_token: &Option<String>,
+        shard_map: HashMap<ShardIndex, String>,
+    ) -> Self {
+        Self::new_with_config(
+            tenant_id,
+            timeline_id,
+            auth_token,
+            shard_map,
+            ClientCacheOptions {
+                max_consumers: 5000,
+                error_threshold: 5,
+                connect_timeout: Duration::from_secs(5),
+                connect_backoff: Duration::from_secs(1),
+                max_idle_duration: Duration::from_secs(60),
+                max_total_connections: 100000,
+                // Fault injection is off by default.
+                max_delay_ms: 0,
+                drop_rate: 0.0,
+                hang_rate: 0.0,
+            },
+            None,
+        )
+    }
+    /// Create a client with explicit pool options and optional metrics.
+    ///
+    /// No connections are opened here; the per-shard pool map starts empty.
+    pub fn new_with_config(
+        tenant_id: &str,
+        timeline_id: &str,
+        auth_token: &Option<String>,
+        shard_map: HashMap<ShardIndex, String>,
+        options: ClientCacheOptions,
+        metrics: Option<Arc<PageserverClientAggregateMetrics>>,
+    ) -> Self {
+        Self {
+            _tenant_id: tenant_id.to_string(),
+            _timeline_id: timeline_id.to_string(),
+            _auth_token: auth_token.clone(),
+            shard_map,
+            channels: RwLock::new(HashMap::new()),
+            // The interceptor gets the same tenant/timeline/token values.
+            auth_interceptor: AuthInterceptor::new(tenant_id, timeline_id, auth_token.as_deref()),
+            client_cache_options: options,
+            aggregate_metrics: metrics,
+        }
+    }
+    /// Ask the pageserver whether a relation exists.
+    ///
+    /// The pooled connection is returned to the pool with the request's
+    /// outcome before this function returns.
+    pub async fn process_check_rel_exists_request(
+        &self,
+        request: CheckRelExistsRequest,
+    ) -> Result<bool, PageserverClientError> {
+        // Current sharding model assumes that all metadata is present only at shard 0.
+        let shard = ShardIndex::unsharded();
+        let pooled_client = self.get_client(shard).await;
+
+        let mut client = PageServiceClient::with_interceptor(
+            pooled_client.channel(),
+            self.auth_interceptor.for_shard(shard),
+        );
+
+        let proto_request = proto::CheckRelExistsRequest::from(request);
+        match client.check_rel_exists(tonic::Request::new(proto_request)).await {
+            Ok(resp) => {
+                // Report success to the pool.
+                pooled_client.finish(Ok(())).await;
+                Ok(resp.get_ref().exists)
+            }
+            Err(status) => {
+                // Report the failure to the pool.
+                pooled_client.finish(Err(status.clone())).await;
+                Err(PageserverClientError::RequestError(status))
+            }
+        }
+    }
+
+    /// Ask the pageserver for the size of a relation, in blocks.
+    ///
+    /// The pooled connection is returned to the pool with the request's
+    /// outcome before this function returns.
+    pub async fn process_get_rel_size_request(
+        &self,
+        request: GetRelSizeRequest,
+    ) -> Result<u32, PageserverClientError> {
+        // Current sharding model assumes that all metadata is present only at shard 0.
+        let shard = ShardIndex::unsharded();
+        let pooled_client = self.get_client(shard).await;
+
+        let mut client = PageServiceClient::with_interceptor(
+            pooled_client.channel(),
+            self.auth_interceptor.for_shard(shard),
+        );
+
+        let proto_request = proto::GetRelSizeRequest::from(request);
+        match client.get_rel_size(tonic::Request::new(proto_request)).await {
+            Ok(resp) => {
+                // Report success to the pool.
+                pooled_client.finish(Ok(())).await;
+                Ok(resp.get_ref().num_blocks)
+            }
+            Err(status) => {
+                // Report the failure to the pool.
+                pooled_client.finish(Err(status.clone())).await;
+                Err(PageserverClientError::RequestError(status))
+            }
+        }
+    }
+
+    // Request a single batch of pages
+    //
+    // Opens a stream, sends one request and returns the page images of the
+    // first response. The pooled connection is handed back to the pool on
+    // every exit path: `PooledClient` has no cleanup on drop, so skipping
+    // `finish` would leave the connection's active-consumer count too high.
+    //
+    // TODO: This opens a new gRPC stream for every request, which is extremely inefficient
+    pub async fn get_page(
+        &self,
+        request: GetPageRequest,
+    ) -> Result<Vec<Bytes>, PageserverClientError> {
+        // FIXME: calculate the shard number correctly
+        let shard = ShardIndex::unsharded();
+        let pooled_client = self.get_client(shard).await;
+        let chan = pooled_client.channel();
+
+        let mut client =
+            PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
+
+        let request = proto::GetPageRequest::from(request);
+
+        let request_stream = futures::stream::once(std::future::ready(request));
+
+        let mut response_stream = match client
+            .get_pages(tonic::Request::new(request_stream))
+            .await
+        {
+            Ok(resp) => resp.into_inner(),
+            Err(status) => {
+                // Opening the stream failed: report it to the pool.
+                pooled_client.finish(Err(status.clone())).await;
+                return Err(PageserverClientError::RequestError(status));
+            }
+        };
+
+        let Some(response) = response_stream.next().await else {
+            // The stream ended without any response; count it as a failure
+            // of this connection.
+            let message = "no response received for getpage request";
+            pooled_client
+                .finish(Err(tonic::Status::unknown(message)))
+                .await;
+            return Err(PageserverClientError::Other(message.to_string()));
+        };
+
+        if let Some(ref metrics) = self.aggregate_metrics {
+            metrics
+                .request_counters
+                .with_label_values(&["get_page"])
+                .inc();
+        }
+
+        match response {
+            Err(status) => {
+                pooled_client.finish(Err(status.clone())).await; // Pass error to finish
+                Err(PageserverClientError::RequestError(status))
+            }
+            Ok(resp) => {
+                pooled_client.finish(Ok(())).await; // Pass success to finish
+                let response: GetPageResponse = resp.into();
+                Ok(response.page_images.to_vec())
+            }
+        }
+    }
+
+    // Open a stream for requesting pages
+    //
+    // The pooled connection is handed back to the pool as soon as the stream
+    // has been opened (or has failed to open). `PooledClient` has no cleanup
+    // on drop, so dropping it without `finish` would leave the connection's
+    // active-consumer count permanently too high.
+    //
+    // NOTE(review): the returned stream keeps using the underlying channel
+    // after the slot is released, so the pool does not count long-lived
+    // streams as load. Tying the slot to the stream's lifetime would need a
+    // different return type.
+    //
+    // TODO: This is a pretty low level interface, the caller should not need to be concerned
+    // with streams. But 'get_page' is currently very naive and inefficient.
+    pub async fn get_pages(
+        &self,
+        requests: impl Stream<Item = proto::GetPageRequest> + Send + 'static,
+    ) -> std::result::Result<
+        tonic::Response<tonic::codec::Streaming<proto::GetPageResponse>>,
+        PageserverClientError,
+    > {
+        // FIXME: calculate the shard number correctly
+        let shard = ShardIndex::unsharded();
+        let pooled_client = self.get_client(shard).await;
+        let chan = pooled_client.channel();
+
+        let mut client =
+            PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
+
+        match client.get_pages(tonic::Request::new(requests)).await {
+            Err(status) => {
+                pooled_client.finish(Err(status.clone())).await; // Pass error to finish
+                Err(PageserverClientError::RequestError(status))
+            }
+            Ok(resp) => {
+                pooled_client.finish(Ok(())).await; // Pass success to finish
+                Ok(resp)
+            }
+        }
+    }
+
+    /// Process a request to get the size of a database, in bytes.
+    ///
+    /// The pooled connection is returned to the pool with the request's
+    /// outcome before this function returns.
+    pub async fn process_get_dbsize_request(
+        &self,
+        request: GetDbSizeRequest,
+    ) -> Result<u64, PageserverClientError> {
+        // Current sharding model assumes that all metadata is present only at shard 0.
+        let shard = ShardIndex::unsharded();
+        let pooled_client = self.get_client(shard).await;
+
+        let mut client = PageServiceClient::with_interceptor(
+            pooled_client.channel(),
+            self.auth_interceptor.for_shard(shard),
+        );
+
+        let proto_request = proto::GetDbSizeRequest::from(request);
+        match client.get_db_size(tonic::Request::new(proto_request)).await {
+            Ok(resp) => {
+                // Report success to the pool.
+                pooled_client.finish(Ok(())).await;
+                Ok(resp.get_ref().num_bytes)
+            }
+            Err(status) => {
+                // Report the failure to the pool.
+                pooled_client.finish(Err(status.clone())).await;
+                Err(PageserverClientError::RequestError(status))
+            }
+        }
+    }
+    /// Request a base backup and return the stream of response chunks.
+    ///
+    /// When `gzip` is true the client is configured to accept gzip-compressed
+    /// responses. A gRPC failure is reported to the pool and returned as
+    /// `PageserverClientError::RequestError`; on success the pool is notified
+    /// and the streaming response is handed to the caller.
+    pub async fn get_base_backup(
+        &self,
+        request: GetBaseBackupRequest,
+        gzip: bool,
+    ) -> std::result::Result<
+        tonic::Response<tonic::codec::Streaming<proto::GetBaseBackupResponseChunk>>,
+        PageserverClientError,
+    > {
+        // Current sharding model assumes that all metadata is present only at shard 0.
+        let shard = ShardIndex::unsharded();
+        let pooled_client = self.get_client(shard).await;
+
+        let plain_client = PageServiceClient::with_interceptor(
+            pooled_client.channel(),
+            self.auth_interceptor.for_shard(shard),
+        );
+        let mut client = if gzip {
+            plain_client.accept_compressed(tonic::codec::CompressionEncoding::Gzip)
+        } else {
+            plain_client
+        };
+
+        let grpc_request = tonic::Request::new(proto::GetBaseBackupRequest::from(request));
+        match client.get_base_backup(grpc_request).await {
+            Ok(resp) => {
+                pooled_client.finish(Ok(())).await;
+                Ok(resp)
+            }
+            Err(status) => {
+                pooled_client.finish(Err(status.clone())).await;
+                Err(PageserverClientError::RequestError(status))
+            }
+        }
+    }
+    /// Get a client for given shard
+    ///
+    /// Get a client from the pool for this shard, also creating the pool if it doesn't exist.
+    ///
+    /// Pool creation is serialized under the write lock and re-checked there, so
+    /// two callers racing on a missing shard end up sharing one pool instead of
+    /// the later one replacing the earlier one in the map.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a lock is poisoned, if `shard` has no entry in `shard_map`, or
+    /// if the pool fails to hand out a client.
+    async fn get_client(&self, shard: ShardIndex) -> client_cache::PooledClient<Channel> {
+        // Fast path: the pool for this shard already exists.
+        let existing: Option<Arc<client_cache::ConnectionPool<Channel>>> = {
+            let channels = self.channels.read().unwrap();
+            channels.get(&shard).cloned()
+        };
+
+        let usable_pool: Arc<client_cache::ConnectionPool<Channel>> = match existing {
+            Some(pool) => pool,
+            None => {
+                let mut channels = self.channels.write().unwrap();
+                // Another caller may have inserted the pool between our read
+                // lock being released and the write lock being acquired.
+                let raced = channels.get(&shard).cloned();
+                match raced {
+                    Some(pool) => pool,
+                    None => {
+                        // Create a new pool using client_cache_options
+                        let channel_fact = Arc::new(client_cache::ChannelFactory::new(
+                            self.shard_map.get(&shard).unwrap().clone(),
+                            self.client_cache_options.max_delay_ms,
+                            self.client_cache_options.drop_rate,
+                            self.client_cache_options.hang_rate,
+                        ));
+                        let new_pool: Arc<client_cache::ConnectionPool<Channel>> =
+                            client_cache::ConnectionPool::new(
+                                channel_fact,
+                                self.client_cache_options.connect_timeout,
+                                self.client_cache_options.connect_backoff,
+                                self.client_cache_options.max_consumers,
+                                self.client_cache_options.error_threshold,
+                                self.client_cache_options.max_idle_duration,
+                                self.client_cache_options.max_total_connections,
+                                self.aggregate_metrics.clone(),
+                            );
+                        channels.insert(shard, new_pool.clone());
+                        new_pool
+                    }
+                }
+                // The write guard is dropped here, before the await below.
+            }
+        };
+
+        usable_pool.get_client().await.unwrap()
+    }
+}
+
+/// Inject tenant_id, timeline_id and authentication token to all pageserver requests.
+///
+/// Each field holds a pre-parsed metadata value that the `Interceptor`
+/// implementation clones into the outgoing request's metadata.
+#[derive(Clone)]
+pub struct AuthInterceptor {
+    // Sent as the "neon-tenant-id" metadata entry.
+    tenant_id: AsciiMetadataValue,
+    // Sent as "neon-shard-id" when set; `new` leaves it unset and `for_shard` fills it in.
+    shard_id: Option<AsciiMetadataValue>,
+    // Sent as the "neon-timeline-id" metadata entry.
+    timeline_id: AsciiMetadataValue,
+
+    // Sent as the "authorization" metadata entry when an auth token was supplied.
+    auth_header: Option<AsciiMetadataValue>, // including "Bearer " prefix
+}
+
+impl AuthInterceptor {
+    /// Build an interceptor with no shard id set.
+    ///
+    /// When `auth_token` is given it is stored with a "Bearer " prefix.
+    ///
+    /// # Panics
+    ///
+    /// Panics if any of the inputs cannot be parsed into an ASCII metadata value.
+    pub fn new(tenant_id: &str, timeline_id: &str, auth_token: Option<&str>) -> Self {
+        let tenant_id = tenant_id.parse().expect("could not parse tenant id");
+        let timeline_id = timeline_id.parse().expect("could not parse timeline id");
+        let auth_header = auth_token.map(|token| {
+            format!("Bearer {token}")
+                .parse()
+                .expect("could not parse auth token")
+        });
+
+        Self {
+            tenant_id,
+            shard_id: None,
+            timeline_id,
+            auth_header,
+        }
+    }
+
+    /// Return a copy of this interceptor whose shard id is set to `shard_id`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the string form of `shard_id` cannot be parsed into an ASCII
+    /// metadata value.
+    fn for_shard(&self, shard_id: ShardIndex) -> Self {
+        let shard_value = shard_id
+            .to_string()
+            .parse()
+            .expect("could not parse shard id");
+
+        Self {
+            shard_id: Some(shard_value),
+            ..self.clone()
+        }
+    }
+}
+
+impl tonic::service::Interceptor for AuthInterceptor {
+    /// Attach the tenant, shard (if set), timeline and authorization (if set)
+    /// metadata entries to the outgoing request. Never returns an error.
+    fn call(&mut self, mut req: tonic::Request<()>) -> Result<tonic::Request<()>, tonic::Status> {
+        let metadata = req.metadata_mut();
+
+        metadata.insert("neon-tenant-id", self.tenant_id.clone());
+        if let Some(shard_id) = self.shard_id.as_ref() {
+            metadata.insert("neon-shard-id", shard_id.clone());
+        }
+        metadata.insert("neon-timeline-id", self.timeline_id.clone());
+        if let Some(auth_header) = self.auth_header.as_ref() {
+            metadata.insert("authorization", auth_header.clone());
+        }
+
+        Ok(req)
+    }
+}
--- a/pageserver/client_grpc/src/request_tracker.rs
+++ b/pageserver/client_grpc/src/request_tracker.rs
@@ -0,0 +1,590 @@
+
+//
+// API Visible to the spawner, just a function call that is async
+//
+use std::sync::Arc;
+use crate::client_cache;
+use pageserver_page_api::GetPageRequest;
+use pageserver_page_api::GetPageResponse;
+use pageserver_page_api::*;
+use pageserver_page_api::proto;
+use crate::client_cache::ConnectionPool;
+use crate::client_cache::ChannelFactory;
+use crate::AuthInterceptor;
+use tonic::{transport::{Channel}, Request};
+use crate::ClientCacheOptions;
+use crate::PageserverClientAggregateMetrics;
+use tokio::sync::Mutex;
+use std::sync::atomic::AtomicU64;
+
+use utils::shard::ShardIndex;
+
+use tokio_stream::wrappers::ReceiverStream;
+use pageserver_page_api::proto::PageServiceClient;
+
+use tonic::{
+    Status,
+    Code,
+};
+
+use async_trait::async_trait;
+use std::time::Duration;
+
+use client_cache::PooledItemFactory;
+//use tracing::info;
+//
+// A mock stream pool that just returns a sending channel, and whenever a GetPageRequest
+// comes in on that channel, it randomly sleeps before sending a GetPageResponse
+//
+
+/// Handle to one open get-page stream, as handed out by the stream pool.
+///
+/// Cloning is cheap: clones share the same request channel and the same
+/// waiter map.
+#[derive(Clone)]
+pub struct StreamReturner {
+    // Outbound side: requests pushed here are forwarded onto the stream.
+    sender: tokio::sync::mpsc::Sender<proto::GetPageRequest>,
+    // Waiters keyed by request id. The task reading responses looks up the
+    // sender for a response's request id and delivers the result through it.
+    sender_hashmap: Arc<Mutex<std::collections::HashMap<u64, tokio::sync::mpsc::Sender<Result<proto::GetPageResponse, Status>>>>>,
+}
+/// Stateless factory producing mock `StreamReturner`s.
+pub struct MockStreamFactory {}
+
+impl MockStreamFactory {
+    /// Create a new mock factory.
+    pub fn new() -> Self {
+        Self {}
+    }
+}
+#[async_trait]
+impl PooledItemFactory<StreamReturner> for MockStreamFactory {
+    /// Create a mock stream backed by a background task instead of a pageserver.
+    ///
+    /// Every request received on the returned `StreamReturner`'s channel is
+    /// answered after a random delay of less than 100 ms with a default
+    /// `GetPageResponse` carrying the same request id. With probability 0.001
+    /// per request the task stops serving, to simulate a broken stream; all
+    /// waiters still registered at that point receive a "Stream closed" error.
+    ///
+    /// Always returns `Ok(Ok(..))`; `_connect_timeout` is ignored.
+    async fn create(&self, _connect_timeout: Duration) -> Result<Result<StreamReturner, tonic::Status>, tokio::time::error::Elapsed> {
+        let (sender, mut receiver) = tokio::sync::mpsc::channel::<proto::GetPageRequest>(1000);
+        // Create a StreamReturner that will send requests to the receiver channel
+        let stream_returner = StreamReturner {
+            sender,
+            sender_hashmap: Arc::new(Mutex::new(std::collections::HashMap::new())),
+        };
+
+        let map = Arc::clone(&stream_returner.sender_hashmap);
+        tokio::spawn(async move {
+            while let Some(request) = receiver.recv().await {
+                // Break out of the loop with 0.1% chance to simulate a stream failure
+                if rand::random::<f32>() < 0.001 {
+                    break;
+                }
+                let mapclone = Arc::clone(&map);
+                tokio::spawn(async move {
+                    // Simulate some processing time: sleep between 0 and 99 ms
+                    let sleep_ms = rand::random::<u64>() % 100;
+                    tokio::time::sleep(tokio::time::Duration::from_millis(sleep_ms)).await;
+                    let response = proto::GetPageResponse {
+                        request_id: request.request_id,
+                        ..Default::default()
+                    };
+                    // Take the waiter out of the map in one step; the lock is
+                    // released again before the response is delivered.
+                    let waiter = mapclone.lock().await.remove(&request.request_id);
+                    match waiter {
+                        Some(sender) => {
+                            // Send the response to the original request sender
+                            if let Err(e) = sender.send(Ok(response)).await {
+                                eprintln!("Failed to send response: {}", e);
+                            }
+                        }
+                        None => {
+                            eprintln!("No sender found for request ID: {}", request.request_id);
+                        }
+                    }
+                });
+            }
+            // Notify every remaining waiter that the stream is gone, and remove
+            // them from the map so their senders are not kept alive by it.
+            let mut hashmap = map.lock().await;
+            for (_request_id, sender) in hashmap.drain() {
+                let error = Status::new(Code::Unknown, "Stream closed");
+                if let Err(e) = sender.send(Err(error)).await {
+                    eprintln!("Failed to send close response: {}", e);
+                }
+            }
+        });
+
+        Ok(Ok(stream_returner))
+    }
+}
+
+
+/// Factory that opens get-page streams over connections taken from a
+/// connection pool.
+pub struct StreamFactory {
+    // Pool of connections on which new streams are opened.
+    connection_pool: Arc<client_cache::ConnectionPool<Channel>>,
+    // Base interceptor; it is specialized with `shard` for every new stream.
+    auth_interceptor: AuthInterceptor,
+    // Shard whose id is attached to the streams created by this factory.
+    shard: ShardIndex,
+}
+
+impl StreamFactory {
+    /// Bundle the pool, interceptor and shard into a new factory.
+    pub fn new(
+        connection_pool: Arc<ConnectionPool<Channel>>,
+        auth_interceptor: AuthInterceptor,
+        shard: ShardIndex,
+    ) -> Self {
+        Self {
+            connection_pool,
+            auth_interceptor,
+            shard,
+        }
+    }
+}
+
+#[async_trait]
+impl PooledItemFactory<StreamReturner> for StreamFactory {
+    /// Open a new get-page stream and spawn the task that dispatches responses.
+    ///
+    /// Takes a connection from the pool, starts a `get_pages` call on it, and
+    /// returns a `StreamReturner` whose channel feeds that call. A background
+    /// task reads responses and forwards each to the waiter registered under
+    /// the response's request id. When the stream ends, whether cleanly or with
+    /// an error, all waiters still registered receive a "Stream closed" error.
+    ///
+    /// Returns `Ok(Err(status))` if no pooled connection could be obtained or
+    /// if the `get_pages` call fails. `_connect_timeout` is ignored.
+    async fn create(&self, _connect_timeout: Duration) ->
+    Result<Result<StreamReturner, tonic::Status>, tokio::time::error::Elapsed>
+    {
+        // NOTE(review): the pooled client handle is dropped right after the
+        // channel is taken, without a call to `finish` — confirm this is what
+        // the connection pool expects.
+        let channel = match Arc::clone(&self.connection_pool).get_client().await {
+            Ok(pooled_client) => pooled_client.channel(),
+            Err(e) => {
+                // Report the failure to the caller instead of panicking.
+                return Ok(Err(Status::new(
+                    Code::Unavailable,
+                    format!("Failed to get pooled connection to pageserver: {:?}", e),
+                )));
+            }
+        };
+        let mut client =
+            PageServiceClient::with_interceptor(channel, self.auth_interceptor.for_shard(self.shard));
+
+        let (sender, receiver) = tokio::sync::mpsc::channel::<proto::GetPageRequest>(1000);
+        let outbound = ReceiverStream::new(receiver);
+
+        let client_resp = client
+            .get_pages(Request::new(outbound))
+            .await;
+
+        match client_resp {
+            Err(status) => {
+                // TODO: Convert this error correctly
+                Ok(Err(tonic::Status::new(
+                    status.code(),
+                    format!("Failed to connect to pageserver: {}", status.message()),
+                )))
+            }
+            Ok(resp) => {
+                let stream_returner = StreamReturner {
+                    sender,
+                    sender_hashmap: Arc::new(Mutex::new(std::collections::HashMap::new())),
+                };
+                let map = Arc::clone(&stream_returner.sender_hashmap);
+
+                tokio::spawn(async move {
+                    let mut inner = resp.into_inner();
+                    loop {
+                        let response = match inner.message().await {
+                            Ok(Some(response)) => response,
+                            // Ok(None) is a clean end of stream and Err is a
+                            // failed stream; either way stop reading and fall
+                            // through to notifying the remaining waiters.
+                            Ok(None) | Err(_) => break,
+                        };
+
+                        // Take the waiter out of the map in one step; the lock
+                        // is released again before the response is delivered.
+                        let waiter = map.lock().await.remove(&response.request_id);
+                        match waiter {
+                            Some(sender) => {
+                                let request_id = response.request_id;
+                                // Send the response to the original request sender
+                                if let Err(e) = sender.send(Ok(response)).await {
+                                    eprintln!("Failed to send response: {}", e);
+                                }
+                                let _ = request_id;
+                            }
+                            None => {
+                                eprintln!("No sender found for request ID: {}", response.request_id);
+                            }
+                        }
+                    }
+                    // Notify every remaining waiter that the stream is gone, and
+                    // remove them from the map so their senders are not kept
+                    // alive by it.
+                    let mut hashmap = map.lock().await;
+                    for (_request_id, sender) in hashmap.drain() {
+                        let error = Status::new(Code::Unknown, "Stream closed");
+                        if let Err(e) = sender.send(Err(error)).await {
+                            eprintln!("Failed to send close response: {}", e);
+                        }
+                    }
+                });
+
+                Ok(Ok(stream_returner))
+            }
+        }
+    }
+}
+
+/// Per-shard request dispatcher: get-page requests go over pooled streams,
+/// the other requests go over pooled unary connections.
+#[derive(Clone)]
+pub struct RequestTracker {
+    // Counter initialized to 0 by `new`; the code that would assign request ids
+    // from it in `send_getpage_request` is currently commented out.
+    cur_id: Arc<AtomicU64>,
+    // Pool of open get-page streams.
+    stream_pool: Arc<ConnectionPool<StreamReturner>>,
+    // Pool of connections used for unary requests.
+    unary_pool: Arc<ConnectionPool<Channel>>,
+    // Base interceptor; specialized with `shard` for each unary request.
+    auth_interceptor: AuthInterceptor,
+    // Shard this tracker sends its unary requests to.
+    shard: ShardIndex,
+}
+
+impl RequestTracker {
+    /// Build a tracker for one shard from its stream pool, unary pool and
+    /// interceptor. The internal id counter starts at 0.
+    pub fn new(
+        stream_pool: Arc<ConnectionPool<StreamReturner>>,
+        unary_pool: Arc<ConnectionPool<Channel>>,
+        auth_interceptor: AuthInterceptor,
+        shard: ShardIndex,
+    ) -> Self {
+        Self {
+            cur_id: Arc::new(AtomicU64::new(0)),
+            stream_pool,
+            unary_pool,
+            auth_interceptor,
+            shard,
+        }
+    }
+
+    /// Ask the pageserver whether a relation exists.
+    ///
+    /// Each attempt takes a connection from the unary pool, issues
+    /// `check_rel_exists`, and reports the outcome to the pool. A failed attempt
+    /// is retried immediately, so this only ever returns `Ok`.
+    ///
+    /// NOTE(review): the retry loop has no backoff and no attempt limit.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the unary pool fails to hand out a client.
+    pub async fn send_process_check_rel_exists_request(
+        &self,
+        req: CheckRelExistsRequest,
+    ) -> Result<bool, tonic::Status> {
+        loop {
+            let pooled_client = Arc::clone(&self.unary_pool).get_client().await.unwrap();
+            let mut ps_client = PageServiceClient::with_interceptor(
+                pooled_client.channel(),
+                self.auth_interceptor.for_shard(self.shard),
+            );
+
+            let grpc_request = tonic::Request::new(proto::CheckRelExistsRequest::from(req.clone()));
+            match ps_client.check_rel_exists(grpc_request).await {
+                Ok(resp) => {
+                    pooled_client.finish(Ok(())).await;
+                    return Ok(resp.get_ref().exists);
+                }
+                Err(status) => {
+                    // Report the failure, then go around the loop again.
+                    pooled_client.finish(Err(status)).await;
+                }
+            }
+        }
+    }
+
+    /// Ask the pageserver for the size of a relation, in blocks.
+    ///
+    /// Each attempt takes a connection from the unary pool, issues
+    /// `get_rel_size`, and reports the outcome to the pool. A failed attempt is
+    /// retried immediately, so this only ever returns `Ok`.
+    ///
+    /// NOTE(review): the retry loop has no backoff and no attempt limit.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the unary pool fails to hand out a client.
+    pub async fn send_process_get_rel_size_request(
+        &self,
+        req: GetRelSizeRequest,
+    ) -> Result<u32, tonic::Status> {
+        loop {
+            // Current sharding model assumes that all metadata is present only at shard 0.
+            let pooled_client = Arc::clone(&self.unary_pool).get_client().await.unwrap();
+            let mut ps_client = PageServiceClient::with_interceptor(
+                pooled_client.channel(),
+                self.auth_interceptor.for_shard(self.shard),
+            );
+
+            let grpc_request = tonic::Request::new(proto::GetRelSizeRequest::from(req.clone()));
+            match ps_client.get_rel_size(grpc_request).await {
+                Ok(resp) => {
+                    pooled_client.finish(Ok(())).await;
+                    return Ok(resp.get_ref().num_blocks);
+                }
+                Err(status) => {
+                    // Report the failure, then go around the loop again.
+                    pooled_client.finish(Err(status)).await;
+                }
+            }
+        }
+    }
+
+    /// Ask the pageserver for the size of a database, in bytes.
+    ///
+    /// Each attempt takes a connection from the unary pool, issues
+    /// `get_db_size`, and reports the outcome to the pool. A failed attempt is
+    /// retried immediately, so this only ever returns `Ok`.
+    ///
+    /// NOTE(review): the retry loop has no backoff and no attempt limit.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the unary pool fails to hand out a client.
+    pub async fn send_process_get_dbsize_request(
+        &self,
+        req: GetDbSizeRequest,
+    ) -> Result<u64, tonic::Status> {
+        loop {
+            // Current sharding model assumes that all metadata is present only at shard 0.
+            let pooled_client = Arc::clone(&self.unary_pool).get_client().await.unwrap();
+            let mut ps_client = PageServiceClient::with_interceptor(
+                pooled_client.channel(),
+                self.auth_interceptor.for_shard(self.shard),
+            );
+
+            let grpc_request = tonic::Request::new(proto::GetDbSizeRequest::from(req.clone()));
+            match ps_client.get_db_size(grpc_request).await {
+                Ok(resp) => {
+                    pooled_client.finish(Ok(())).await;
+                    return Ok(resp.get_ref().num_bytes);
+                }
+                Err(status) => {
+                    // Report the failure, then go around the loop again.
+                    pooled_client.finish(Err(status)).await;
+                }
+            }
+        }
+    }
+
+    /// Send one get-page request over a pooled stream and wait for its response.
+    ///
+    /// The request's own `request_id` is the key under which the waiter is
+    /// registered in the stream's map. Every failure (no stream available, the
+    /// request could not be queued, the stream reported an error, or the
+    /// response channel closed) causes an immediate retry from the top, so this
+    /// only ever returns `Ok`.
+    ///
+    /// NOTE(review): the retry loop has no backoff and no attempt limit.
+    pub async fn send_getpage_request(
+        &mut self,
+        req: GetPageRequest,
+    ) -> Result<GetPageResponse, tonic::Status> {
+        loop {
+            let request = req.clone();
+            // The id supplied by the caller is used as-is; `cur_id` is not consulted.
+            let request_id = request.request_id;
+
+            // One-slot channel on which the response for this attempt arrives.
+            let (response_sender, mut response_receiver) =
+                tokio::sync::mpsc::channel::<Result<proto::GetPageResponse, Status>>(1);
+
+            // Get a stream from the stream pool; retry if none can be obtained.
+            let stream_returner = match Arc::clone(&self.stream_pool).get_client().await {
+                Ok(stream_ret) => stream_ret,
+                Err(_e) => continue,
+            };
+            let returner = stream_returner.channel();
+            let map = returner.sender_hashmap.clone();
+
+            // The waiter is inserted into the map before the request is sent.
+            map.lock().await.insert(request_id, response_sender);
+
+            let sent = returner
+                .sender
+                .send(proto::GetPageRequest::from(request))
+                .await;
+            if sent.is_err() {
+                // Sending failed: unregister the waiter and report the stream as bad.
+                map.lock().await.remove(&request_id);
+                stream_returner
+                    .finish(Err(Status::new(Code::Unknown, "Failed to send request")))
+                    .await;
+                continue;
+            }
+
+            match response_receiver.recv().await {
+                Some(Ok(resp)) => {
+                    stream_returner.finish(Result::Ok(())).await;
+                    return Ok(resp.into());
+                }
+                Some(Err(_status)) => {
+                    // An error was delivered in place of a response.
+                    stream_returner
+                        .finish(Err(Status::new(Code::Unknown, "Failed to receive response")))
+                        .await;
+                }
+                None => {
+                    // The response channel was closed without any message.
+                    stream_returner
+                        .finish(Err(Status::new(Code::Unknown, "Response channel closed")))
+                        .await;
+                }
+            }
+        }
+    }
+}
+
+/// Mutable state of `ShardedRequestTracker`, kept behind its mutex.
+struct ShardedRequestTrackerInner {
+    // Hashmap of shard index to RequestTracker
+    trackers: std::collections::HashMap<ShardIndex, RequestTracker>,
+}
+/// Front end that routes requests to the `RequestTracker` of a shard.
+pub struct ShardedRequestTracker {
+    // Per-shard trackers; replaced wholesale by `update_shard_map`.
+    inner: Arc<Mutex<ShardedRequestTrackerInner>>,
+    // Options for the channel factory and the connection pools built on it.
+    tcp_client_cache_options: ClientCacheOptions,
+    // Options for the pools of get-page streams.
+    stream_client_cache_options: ClientCacheOptions,
+}
+
+//
+// TODO: Functions in the ShardedRequestTracker should be able to time out and
+// cancel a request. The request should return an error if it is cancelled.
+//
+impl ShardedRequestTracker {
+    /// Create a tracker with an empty shard map and built-in default pool
+    /// options. Call `update_shard_map` before issuing requests.
+    pub fn new() -> Self {
+        //
+        // Default configuration for the client. These could be added to a config file
+        //
+
+        // Options for the TCP connection pools.
+        let connection_defaults = ClientCacheOptions {
+            max_delay_ms: 0,
+            drop_rate: 0.0,
+            hang_rate: 0.0,
+            connect_timeout: Duration::from_secs(1),
+            connect_backoff: Duration::from_millis(100),
+            max_consumers: 8, // Streams per connection
+            error_threshold: 10,
+            max_idle_duration: Duration::from_secs(5),
+            max_total_connections: 8,
+        };
+
+        // Options for the pools of get-page streams.
+        let stream_defaults = ClientCacheOptions {
+            max_delay_ms: 0,
+            drop_rate: 0.0,
+            hang_rate: 0.0,
+            connect_timeout: Duration::from_secs(1),
+            connect_backoff: Duration::from_millis(100),
+            max_consumers: 64, // Requests per stream
+            error_threshold: 10,
+            max_idle_duration: Duration::from_secs(5),
+            max_total_connections: 64, // Total allowable number of streams
+        };
+
+        let empty_state = ShardedRequestTrackerInner {
+            trackers: std::collections::HashMap::new(),
+        };
+
+        Self {
+            inner: Arc::new(Mutex::new(empty_state)),
+            tcp_client_cache_options: connection_defaults,
+            stream_client_cache_options: stream_defaults,
+        }
+    }
+
+    /// Rebuild the per-shard trackers from a new shard map.
+    ///
+    /// For every `(shard, url)` pair this creates a connection pool that backs
+    /// the get-page streams, a pool of streams on top of it, a separate
+    /// connection pool for unary requests, and a `RequestTracker` tying them
+    /// together. The previous set of trackers is then replaced as a whole.
+    ///
+    /// `metrics` is shared by all pools. `tenant_id`, `timeline_id` and
+    /// `auth_str` are used to build the interceptor attached to every request.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `tenant_id`, `timeline_id` or the auth token cannot be parsed
+    /// into metadata values (see `AuthInterceptor::new`).
+    pub async fn update_shard_map(&self,
+                            shard_urls: std::collections::HashMap<ShardIndex, String>,
+                            metrics: Option<Arc<PageserverClientAggregateMetrics>>,
+                            tenant_id: String, timeline_id: String, auth_str: Option<&str>) {
+        let mut trackers = std::collections::HashMap::new();
+        for (shard, endpoint_url) in shard_urls {
+            //
+            // Channel factory shared by both connection pools of this shard
+            //
+            let channel_fact : Arc<dyn PooledItemFactory<Channel> + Send + Sync> = Arc::new(ChannelFactory::new(
+                endpoint_url,
+                self.tcp_client_cache_options.max_delay_ms,
+                self.tcp_client_cache_options.drop_rate,
+                self.tcp_client_cache_options.hang_rate,
+            ));
+
+            //
+            // Create a pool of streams for streaming get_page requests, backed
+            // by its own connection pool
+            //
+            let stream_conn_pool: Arc<ConnectionPool<Channel>> = ConnectionPool::new(
+                Arc::clone(&channel_fact),
+                self.tcp_client_cache_options.connect_timeout,
+                self.tcp_client_cache_options.connect_backoff,
+                self.tcp_client_cache_options.max_consumers,
+                self.tcp_client_cache_options.error_threshold,
+                self.tcp_client_cache_options.max_idle_duration,
+                self.tcp_client_cache_options.max_total_connections,
+                metrics.clone(),
+            );
+
+            let auth_interceptor = AuthInterceptor::new(tenant_id.as_str(),
+                                                        timeline_id.as_str(),
+                                                        auth_str);
+
+            // Streams must carry the id of the shard they are opened for, the
+            // same shard the tracker below uses for its unary requests.
+            let stream_pool = ConnectionPool::<StreamReturner>::new(
+                Arc::new(StreamFactory::new(stream_conn_pool,
+                                            auth_interceptor.clone(), shard)),
+                self.stream_client_cache_options.connect_timeout,
+                self.stream_client_cache_options.connect_backoff,
+                self.stream_client_cache_options.max_consumers,
+                self.stream_client_cache_options.error_threshold,
+                self.stream_client_cache_options.max_idle_duration,
+                self.stream_client_cache_options.max_total_connections,
+                metrics.clone(),
+            );
+
+            //
+            // Create a client pool for unary requests
+            //
+            let unary_pool: Arc<ConnectionPool<Channel>> = ConnectionPool::new(
+                Arc::clone(&channel_fact),
+                self.tcp_client_cache_options.connect_timeout,
+                self.tcp_client_cache_options.connect_backoff,
+                self.tcp_client_cache_options.max_consumers,
+                self.tcp_client_cache_options.error_threshold,
+                self.tcp_client_cache_options.max_idle_duration,
+                self.tcp_client_cache_options.max_total_connections,
+                metrics.clone()
+            );
+
+            //
+            // Create a new RequestTracker for this shard
+            //
+            let new_tracker = RequestTracker::new(stream_pool, unary_pool, auth_interceptor, shard);
+            trackers.insert(shard, new_tracker);
+        }
+
+        // Swap in the complete new set of trackers in one step.
+        let mut inner = self.inner.lock().await;
+        inner.trackers = trackers;
+    }
+
+    /// Fetch a page through the tracker of the (currently always unsharded) shard.
+    ///
+    /// Returns a `not_found` status if no tracker is registered for the shard.
+    /// An error from the tracker is wrapped in an `unknown` status.
+    pub async fn get_page(
+        &self,
+        req: GetPageRequest,
+    ) -> Result<GetPageResponse, tonic::Status> {
+        // Get shard index from the request
+        let shard_index = ShardIndex::unsharded();
+
+        // Clone the tracker out so the lock is not held while the request runs.
+        let found = self.inner.lock().await.trackers.get(&shard_index).cloned();
+        let mut tracker = match found {
+            Some(t) => t,
+            None => {
+                return Err(tonic::Status::not_found(format!("Shard {} not found", shard_index)));
+            }
+        };
+
+        tracker
+            .send_getpage_request(req)
+            .await
+            .map_err(|e| tonic::Status::unknown(format!("Failed to get page: {}", e)))
+    }
+    /// Get the size of a database through the unsharded shard's tracker.
+    ///
+    /// Returns a `not_found` status if no tracker is registered for the shard;
+    /// otherwise the tracker's result is passed through unchanged.
+    pub async fn process_get_dbsize_request(
+        &self,
+        request: GetDbSizeRequest,
+    ) -> Result<u64, tonic::Status> {
+        let shard_index = ShardIndex::unsharded();
+
+        // Clone the tracker out so the lock is not held while the request runs.
+        let found = self.inner.lock().await.trackers.get(&shard_index).cloned();
+        let tracker = match found {
+            Some(t) => t,
+            None => {
+                return Err(tonic::Status::not_found(format!("Shard {} not found", shard_index)));
+            }
+        };
+
+        tracker.send_process_get_dbsize_request(request).await
+    }
+
+    /// Get the size of a relation through the unsharded shard's tracker.
+    ///
+    /// Returns a `not_found` status if no tracker is registered for the shard;
+    /// otherwise the tracker's result is passed through unchanged.
+    pub async fn process_get_rel_size_request(
+        &self,
+        request: GetRelSizeRequest,
+    ) -> Result<u32, tonic::Status> {
+        let shard_index = ShardIndex::unsharded();
+
+        // Clone the tracker out so the lock is not held while the request runs.
+        let found = self.inner.lock().await.trackers.get(&shard_index).cloned();
+        let tracker = match found {
+            Some(t) => t,
+            None => {
+                return Err(tonic::Status::not_found(format!("Shard {} not found", shard_index)));
+            }
+        };
+
+        tracker.send_process_get_rel_size_request(request).await
+    }
+
+    /// Check whether a relation exists through the unsharded shard's tracker.
+    ///
+    /// Returns a `not_found` status if no tracker is registered for the shard;
+    /// otherwise the tracker's result is passed through unchanged.
+    pub async fn process_check_rel_exists_request(
+        &self,
+        request: CheckRelExistsRequest,
+    ) -> Result<bool, tonic::Status> {
+        let shard_index = ShardIndex::unsharded();
+
+        // Clone the tracker out so the lock is not held while the request runs.
+        let found = self.inner.lock().await.trackers.get(&shard_index).cloned();
+        let tracker = match found {
+            Some(t) => t,
+            None => {
+                return Err(tonic::Status::not_found(format!("Shard {} not found", shard_index)));
+            }
+        };
+
+        tracker.send_process_check_rel_exists_request(request).await
+    }
+}
--- a/pageserver/pagebench/Cargo.toml
+++ b/pageserver/pagebench/Cargo.toml
@@ -24,10 +24,14 @@ tracing.workspace = true
 tokio.workspace = true
 tokio-stream.workspace = true
 tokio-util.workspace = true
+axum.workspace = true
+http.workspace = true
+metrics.workspace = true
 tonic.workspace = true
 url.workspace = true

 pageserver_client.workspace = true
+pageserver_client_grpc.workspace = true
 pageserver_api.workspace = true
 pageserver_page_api.workspace = true
 utils = { path = "../../libs/utils/" }
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -1,4 +1,4 @@
-use std::collections::{HashMap, HashSet, VecDeque};
+use std::collections::{HashSet, HashMap, VecDeque};
 use std::future::Future;
 use std::num::NonZeroUsize;
 use std::pin::Pin;
@@ -26,12 +26,28 @@ use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;
 use utils::shard::ShardIndex;

+use axum::Router;
+use axum::body::Body;
+use axum::extract::State;
+use axum::response::Response;
+
+use http::StatusCode;
+use http::header::CONTENT_TYPE;
+
+use metrics;
+use metrics::proto::MetricFamily;
+use metrics::{Encoder, TextEncoder};
+
 use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
 use crate::util::{request_stats, tokio_thread_local_stats};

 /// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
 #[derive(clap::Parser)]
 pub(crate) struct Args {
+    #[clap(long, default_value = "false")]
+    grpc: bool,
+    #[clap(long, default_value = "false")]
+    grpc_stream: bool,
    #[clap(long, default_value = "http://localhost:9898")]
    mgmt_api_endpoint: String,
    /// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
@@ -72,6 +88,9 @@ pub(crate) struct Args {
    #[clap(long)]
    set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,

+    #[clap(long)]
+    only_relnode: Option<u32>,
+
    /// Queue depth generated in each client.
    #[clap(long, default_value = "1")]
    queue_depth: NonZeroUsize,
@@ -86,10 +105,31 @@ pub(crate) struct Args {
    #[clap(long, default_value = "1")]
    batch_size: NonZeroUsize,

-    #[clap(long)]
-    only_relnode: Option<u32>,
-
    targets: Option<Vec<TenantTimelineId>>,
+
+    #[clap(long, default_value = "100")]
+    pool_max_consumers: NonZeroUsize,
+
+    #[clap(long, default_value = "5")]
+    pool_error_threshold: NonZeroUsize,
+
+    #[clap(long, default_value = "5000")]
+    pool_connect_timeout: NonZeroUsize,
+
+    #[clap(long, default_value = "1000")]
+    pool_connect_backoff: NonZeroUsize,
+
+    #[clap(long, default_value = "60000")]
+    pool_max_idle_duration: NonZeroUsize,
+
+    #[clap(long, default_value = "0")]
+    max_delay_ms: usize,
+
+    #[clap(long, default_value = "0")]
+    percent_drops: usize,
+
+    #[clap(long, default_value = "0")]
+    percent_hangs: usize,
 }

 /// State shared by all clients
@@ -146,6 +186,37 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
        main_impl(args, thread_local_stats)
    })
 }
+/// Axum handler for `/metrics`: serves the collected client metrics as encoded by `TextEncoder`.
+async fn get_metrics(
+    State(state): State<Arc<pageserver_client_grpc::PageserverClientAggregateMetrics>>,
+) -> Response {
+    let metrics = state.collect();
+    info!("metrics: {metrics:?}");
+    // When we call TextEncoder::encode() below, it will immediately return an
+    // error if a metric family has no metrics, so we need to preemptively
+    // filter out metric families with no metrics.
+    let metrics = metrics
+        .into_iter()
+        .filter(|m| !m.get_metric().is_empty())
+        .collect::<Vec<MetricFamily>>();
+    let encoder = TextEncoder::new();
+    let mut buffer = vec![];
+
+    if let Err(e) = encoder.encode(&metrics, &mut buffer) {
+        // Plain-text error message; `application/text` is not a registered media type.
+        Response::builder()
+            .status(StatusCode::INTERNAL_SERVER_ERROR)
+            .header(CONTENT_TYPE, "text/plain")
+            .body(Body::from(e.to_string()))
+            .unwrap()
+    } else {
+        Response::builder()
+            .status(StatusCode::OK)
+            .header(CONTENT_TYPE, encoder.format_type())
+            .body(Body::from(buffer))
+            .unwrap()
+    }
+}

 async fn main_impl(
    args: Args,
@@ -153,6 +224,24 @@ async fn main_impl(
 ) -> anyhow::Result<()> {
    let args: &'static Args = Box::leak(Box::new(args));

+    // Aggregate client metrics, served by the `/metrics` route registered below
+    let client_metrics = Arc::new(pageserver_client_grpc::PageserverClientAggregateMetrics::new());
+
+    use axum::routing::get;
+    let app = Router::new()
+        .route("/metrics", get(get_metrics))
+        .with_state(client_metrics.clone());
+
+    // TODO: make configurable. Or listen on unix domain socket?
+    let listener = tokio::net::TcpListener::bind("127.0.0.1:9090")
+        .await
+        .unwrap();
+
+    tokio::spawn(async {
+        tracing::info!("metrics listener spawned");
+        axum::serve(listener, app).await.unwrap()
+    });
+
    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
        reqwest::Client::new(), // TODO: support ssl_ca_file for https APIs in pagebench.
        args.mgmt_api_endpoint.clone(),
@@ -311,6 +400,7 @@ async fn main_impl(
    let rps_period = args
        .per_client_rate
        .map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
+
    let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
        let ss = shared_state.clone();
        let cancel = cancel.clone();
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -20,8 +20,7 @@ use pageserver_api::key::{Key, rel_block_to_key};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants::{PG_HBA, PGDATA_SPECIAL_FILES};
 use postgres_ffi::{
-    BLCKSZ, PG_TLI, PgMajorVersion, RELSEG_SIZE, WAL_SEGMENT_SIZE, XLogFileName,
-    dispatch_pgversion, pg_constants,
+    BLCKSZ, PG_TLI, RELSEG_SIZE, WAL_SEGMENT_SIZE, XLogFileName, dispatch_pgversion, pg_constants,
 };
 use postgres_ffi_types::constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
 use postgres_ffi_types::forknum::{INIT_FORKNUM, MAIN_FORKNUM};
@@ -620,7 +619,10 @@ where
        };

        if spcnode == GLOBALTABLESPACE_OID {
-            let pg_version_str = self.timeline.pg_version.versionfile_string();
+            let pg_version_str = match self.timeline.pg_version {
+                14 | 15 => self.timeline.pg_version.to_string(),
+                ver => format!("{ver}\x0A"),
+            };
            let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
            self.ar
                .append(&header, pg_version_str.as_bytes())
@@ -677,7 +679,10 @@ where
            if let Some(img) = relmap_img {
                let dst_path = format!("base/{}/PG_VERSION", dbnode);

-                let pg_version_str = self.timeline.pg_version.versionfile_string();
+                let pg_version_str = match self.timeline.pg_version {
+                    14 | 15 => self.timeline.pg_version.to_string(),
+                    ver => format!("{ver}\x0A"),
+                };
                let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
                self.ar
                    .append(&header, pg_version_str.as_bytes())
@@ -708,7 +713,7 @@ where
        buf.extend_from_slice(&img[..]);
        let crc = crc32c::crc32c(&img[..]);
        buf.put_u32_le(crc);
-        let path = if self.timeline.pg_version < PgMajorVersion::PG17 {
+        let path = if self.timeline.pg_version < 17 {
            format!("pg_twophase/{:>08X}", xid)
        } else {
            format!("pg_twophase/{:>016X}", xid)
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -11,7 +11,7 @@ use std::num::NonZeroUsize;
 use std::sync::Arc;
 use std::time::Duration;

-use anyhow::{Context, ensure};
+use anyhow::{Context, bail, ensure};
 use camino::{Utf8Path, Utf8PathBuf};
 use once_cell::sync::OnceCell;
 use pageserver_api::config::{
@@ -22,7 +22,6 @@ use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
 use pem::Pem;
 use postgres_backend::AuthType;
-use postgres_ffi::PgMajorVersion;
 use remote_storage::{RemotePath, RemoteStorageConfig};
 use reqwest::Url;
 use storage_broker::Uri;
@@ -339,16 +338,20 @@ impl PageServerConf {
    //
    // Postgres distribution paths
    //
-    pub fn pg_distrib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<Utf8PathBuf> {
+    pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
        let path = self.pg_distrib_dir.clone();

-        Ok(path.join(pg_version.v_str()))
+        #[allow(clippy::manual_range_patterns)]
+        match pg_version {
+            14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))),
+            _ => bail!("Unsupported postgres version: {}", pg_version),
+        }
    }

-    pub fn pg_bin_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<Utf8PathBuf> {
+    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
        Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
    }
-    pub fn pg_lib_dir(&self, pg_version: PgMajorVersion) -> anyhow::Result<Utf8PathBuf> {
+    pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
        Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
    }

--- a/pageserver/src/feature_resolver.rs
+++ b/pageserver/src/feature_resolver.rs
@@ -12,6 +12,8 @@ use utils::id::TenantId;

 use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION};

+const DEFAULT_POSTHOG_REFRESH_INTERVAL: Duration = Duration::from_secs(600);
+
 #[derive(Clone)]
 pub struct FeatureResolver {
    inner: Option<Arc<FeatureResolverBackgroundLoop>>,
@@ -139,10 +141,13 @@ impl FeatureResolver {
                }
                tenants
            };
-            // TODO: make refresh period configurable
-            inner
-                .clone()
-                .spawn(handle, Duration::from_secs(60), fake_tenants);
+            inner.clone().spawn(
+                handle,
+                posthog_config
+                    .refresh_interval
+                    .unwrap_or(DEFAULT_POSTHOG_REFRESH_INTERVAL),
+                fake_tenants,
+            );
            Ok(FeatureResolver {
                inner: Some(inner),
                internal_properties: Some(internal_properties),
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -41,7 +41,6 @@ use pageserver_api::models::{
    TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse,
 };
 use pageserver_api::shard::{ShardCount, TenantShardId};
-use postgres_ffi::PgMajorVersion;
 use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
 use scopeguard::defer;
 use serde_json::json;
@@ -3382,7 +3381,7 @@ async fn put_tenant_timeline_import_basebackup(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?;
    let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?;
-    let pg_version: PgMajorVersion = must_parse_query_param(&request, "pg_version")?;
+    let pg_version: u32 = must_parse_query_param(&request, "pg_version")?;

    check_permission(&request, Some(tenant_id))?;

--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -38,7 +38,6 @@ pub mod walredo;

 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
-use postgres_ffi::PgMajorVersion;
 use tenant::mgr::{BackgroundPurges, TenantManager};
 use tenant::secondary;
 use tracing::{info, info_span};
@@ -52,7 +51,7 @@ use tracing::{info, info_span};
 /// backwards-compatible changes to the metadata format.
 pub const STORAGE_FORMAT_VERSION: u16 = 3;

-pub const DEFAULT_PG_VERSION: PgMajorVersion = PgMajorVersion::PG17;
+pub const DEFAULT_PG_VERSION: u32 = 17;

 // Magic constants used to identify different kinds of files
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -3426,7 +3426,7 @@ impl TimelineMetrics {
    pub fn dec_frozen_layer(&self, layer: &InMemoryLayer) {
        assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. }));
        let labels = self.make_frozen_layer_labels(layer);
-        let size = layer.try_len().expect("frozen layer should have no writer");
+        let size = layer.len();
        TIMELINE_LAYER_COUNT
            .get_metric_with_label_values(&labels)
            .unwrap()
@@ -3441,7 +3441,7 @@ impl TimelineMetrics {
    pub fn inc_frozen_layer(&self, layer: &InMemoryLayer) {
        assert!(matches!(layer.info(), InMemoryLayerInfo::Frozen { .. }));
        let labels = self.make_frozen_layer_labels(layer);
-        let size = layer.try_len().expect("frozen layer should have no writer");
+        let size = layer.len();
        TIMELINE_LAYER_COUNT
            .get_metric_with_label_values(&labels)
            .unwrap()
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -3544,8 +3544,9 @@ impl proto::PageService for GrpcPageServiceHandler {
        &self,
        req: tonic::Request<proto::GetBaseBackupRequest>,
    ) -> Result<tonic::Response<Self::GetBaseBackupStream>, tonic::Status> {
-        // Send 64 KB chunks to avoid large memory allocations.
-        const CHUNK_SIZE: usize = 64 * 1024;
+        // Send chunks of 256 KB to avoid large memory allocations. pagebench basebackup shows this
+        // to be the sweet spot where throughput is saturated.
+        const CHUNK_SIZE: usize = 256 * 1024;

        let timeline = self.get_request_timeline(&req).await?;
        let ctx = self.ctx.with_scope_timeline(&timeline);
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -25,7 +25,7 @@ use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace};
 use pageserver_api::models::RelSizeMigration;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
-use postgres_ffi::{BLCKSZ, PgMajorVersion, TimestampTz, TransactionId};
+use postgres_ffi::{BLCKSZ, TimestampTz, TransactionId};
 use postgres_ffi_types::forknum::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi_types::{Oid, RepOriginId};
 use serde::{Deserialize, Serialize};
@@ -1081,7 +1081,7 @@ impl Timeline {
        // fetch directory entry
        let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;

-        if self.pg_version >= PgMajorVersion::PG17 {
+        if self.pg_version >= 17 {
            Ok(TwoPhaseDirectoryV17::des(&buf)?.xids)
        } else {
            Ok(TwoPhaseDirectory::des(&buf)?
@@ -1613,7 +1613,7 @@ impl DatadirModification<'_> {
            .push((DirectoryKind::Db, MetricsUpdate::Set(0)));
        self.put(DBDIR_KEY, Value::Image(buf.into()));

-        let buf = if self.tline.pg_version >= PgMajorVersion::PG17 {
+        let buf = if self.tline.pg_version >= 17 {
            TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 {
                xids: HashSet::new(),
            })
@@ -1967,7 +1967,7 @@ impl DatadirModification<'_> {
    ) -> Result<(), WalIngestError> {
        // Add it to the directory entry
        let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
-        let newdirbuf = if self.tline.pg_version >= PgMajorVersion::PG17 {
+        let newdirbuf = if self.tline.pg_version >= 17 {
            let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
            if !dir.xids.insert(xid) {
                Err(WalIngestErrorKind::FileAlreadyExists(xid))?;
@@ -2383,7 +2383,7 @@ impl DatadirModification<'_> {
    ) -> Result<(), WalIngestError> {
        // Remove it from the directory entry
        let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
-        let newdirbuf = if self.tline.pg_version >= PgMajorVersion::PG17 {
+        let newdirbuf = if self.tline.pg_version >= 17 {
            let mut dir = TwoPhaseDirectoryV17::des(&buf)?;

            if !dir.xids.remove(&xid) {
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -38,7 +38,6 @@ use pageserver_api::models::{
    WalRedoManagerStatus,
 };
 use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId};
-use postgres_ffi::PgMajorVersion;
 use remote_storage::{DownloadError, GenericRemoteStorage, TimeoutOrCancel};
 use remote_timeline_client::index::GcCompactionState;
 use remote_timeline_client::manifest::{
@@ -498,7 +497,7 @@ impl WalRedoManager {
        lsn: Lsn,
        base_img: Option<(Lsn, bytes::Bytes)>,
        records: Vec<(Lsn, wal_decoder::models::record::NeonWalRecord)>,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
        redo_attempt_type: RedoAttemptType,
    ) -> Result<bytes::Bytes, walredo::Error> {
        match self {
@@ -934,7 +933,7 @@ pub(crate) enum CreateTimelineParams {
 pub(crate) struct CreateTimelineParamsBootstrap {
    pub(crate) new_timeline_id: TimelineId,
    pub(crate) existing_initdb_timeline_id: Option<TimelineId>,
-    pub(crate) pg_version: PgMajorVersion,
+    pub(crate) pg_version: u32,
 }

 /// NB: See comment on [`CreateTimelineIdempotency::Branch`] for why there's no `pg_version` here.
@@ -972,7 +971,7 @@ pub(crate) enum CreateTimelineIdempotency {
    /// NB: special treatment, see comment in [`Self`].
    FailWithConflict,
    Bootstrap {
-        pg_version: PgMajorVersion,
+        pg_version: u32,
    },
    /// NB: branches always have the same `pg_version` as their ancestor.
    /// While [`pageserver_api::models::TimelineCreateRequestMode::Branch::pg_version`]
@@ -2542,7 +2541,7 @@ impl TenantShard {
        self: &Arc<Self>,
        new_timeline_id: TimelineId,
        initdb_lsn: Lsn,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
        ctx: &RequestContext,
    ) -> anyhow::Result<(UninitializedTimeline, RequestContext)> {
        anyhow::ensure!(
@@ -2594,7 +2593,7 @@ impl TenantShard {
        self: &Arc<Self>,
        new_timeline_id: TimelineId,
        initdb_lsn: Lsn,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Timeline>> {
        let (uninit_tl, ctx) = self
@@ -2633,7 +2632,7 @@ impl TenantShard {
        self: &Arc<Self>,
        new_timeline_id: TimelineId,
        initdb_lsn: Lsn,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
        ctx: &RequestContext,
        in_memory_layer_desc: Vec<timeline::InMemoryLayerTestDesc>,
        delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
@@ -2899,7 +2898,7 @@ impl TenantShard {
                    Lsn(0),
                    initdb_lsn,
                    initdb_lsn,
-                    PgMajorVersion::PG15,
+                    15,
                );
                this.prepare_new_timeline(
                    new_timeline_id,
@@ -5091,7 +5090,7 @@ impl TenantShard {
    pub(crate) async fn bootstrap_timeline_test(
        self: &Arc<Self>,
        timeline_id: TimelineId,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
        load_existing_initdb: Option<TimelineId>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Timeline>> {
@@ -5233,7 +5232,7 @@ impl TenantShard {
    async fn bootstrap_timeline(
        self: &Arc<Self>,
        timeline_id: TimelineId,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
        load_existing_initdb: Option<TimelineId>,
        ctx: &RequestContext,
    ) -> Result<CreateTimelineResult, CreateTimelineError> {
@@ -5771,7 +5770,7 @@ impl TenantShard {
 async fn run_initdb(
    conf: &'static PageServerConf,
    initdb_target_dir: &Utf8Path,
-    pg_version: PgMajorVersion,
+    pg_version: u32,
    cancel: &CancellationToken,
 ) -> Result<(), InitdbError> {
    let initdb_bin_path = conf
@@ -6052,7 +6051,7 @@ pub(crate) mod harness {
            lsn: Lsn,
            base_img: Option<(Lsn, Bytes)>,
            records: Vec<(Lsn, NeonWalRecord)>,
-            _pg_version: PgMajorVersion,
+            _pg_version: u32,
            _redo_attempt_type: RedoAttemptType,
        ) -> Result<Bytes, walredo::Error> {
            let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
@@ -6224,7 +6223,7 @@ mod tests {
    async fn randomize_timeline(
        tenant: &Arc<TenantShard>,
        new_timeline_id: TimelineId,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
        spec: TestTimelineSpecification,
        random: &mut rand::rngs::StdRng,
        ctx: &RequestContext,
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -3,7 +3,7 @@

 use std::io;
 use std::sync::Arc;
-use std::sync::atomic::AtomicU64;
+use std::sync::atomic::{AtomicU64, Ordering};

 use camino::Utf8PathBuf;
 use num_traits::Num;
@@ -18,6 +18,7 @@ use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64};
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::page_cache;
+use crate::tenant::storage_layer::inmemory_layer::GlobalResourceUnits;
 use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
 use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
 use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
@@ -30,9 +31,13 @@ pub struct EphemeralFile {
    _tenant_shard_id: TenantShardId,
    _timeline_id: TimelineId,
    page_cache_file_id: page_cache::FileId,
-    bytes_written: u64,
    file: TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter,
-    buffered_writer: BufferedWriter,
+
+    buffered_writer: tokio::sync::RwLock<BufferedWriter>,
+
+    bytes_written: AtomicU64,
+
+    resource_units: std::sync::Mutex<GlobalResourceUnits>,
 }

 type BufferedWriter = owned_buffers_io::write::BufferedWriter<
@@ -94,9 +99,8 @@ impl EphemeralFile {
            _tenant_shard_id: tenant_shard_id,
            _timeline_id: timeline_id,
            page_cache_file_id,
-            bytes_written: 0,
            file: file.clone(),
-            buffered_writer: BufferedWriter::new(
+            buffered_writer: tokio::sync::RwLock::new(BufferedWriter::new(
                file,
                0,
                || IoBufferMut::with_capacity(TAIL_SZ),
@@ -104,7 +108,9 @@ impl EphemeralFile {
                cancel.child_token(),
                ctx,
                info_span!(parent: None, "ephemeral_file_buffered_writer", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), timeline_id=%timeline_id, path = %filename),
-            ),
+            )),
+            bytes_written: AtomicU64::new(0),
+            resource_units: std::sync::Mutex::new(GlobalResourceUnits::new()),
        })
    }
 }
@@ -151,15 +157,17 @@ impl std::ops::Deref for TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter

 #[derive(Debug, thiserror::Error)]
 pub(crate) enum EphemeralFileWriteError {
-    #[error("{0}")]
-    TooLong(String),
    #[error("cancelled")]
    Cancelled,
 }

 impl EphemeralFile {
    pub(crate) fn len(&self) -> u64 {
-        self.bytes_written
+        // TODO(vlad): The value returned here is not always correct if
+        // we have more than one concurrent writer. Writes are always
+        // sequenced, but we could grab the buffered writer lock if we wanted
+        // to.
+        self.bytes_written.load(Ordering::Acquire)
    }

    pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
@@ -186,7 +194,7 @@ impl EphemeralFile {
    /// Panics if the write is short because there's no way we can recover from that.
    /// TODO: make upstack handle this as an error.
    pub(crate) async fn write_raw(
-        &mut self,
+        &self,
        srcbuf: &[u8],
        ctx: &RequestContext,
    ) -> Result<u64, EphemeralFileWriteError> {
@@ -198,22 +206,13 @@ impl EphemeralFile {
    }

    async fn write_raw_controlled(
-        &mut self,
+        &self,
        srcbuf: &[u8],
        ctx: &RequestContext,
    ) -> Result<(u64, Option<owned_buffers_io::write::FlushControl>), EphemeralFileWriteError> {
-        let pos = self.bytes_written;
+        let mut writer = self.buffered_writer.write().await;

-        let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| {
-            EphemeralFileWriteError::TooLong(format!(
-                "write would grow EphemeralFile beyond u64::MAX: len={pos} writen={srcbuf_len}",
-                srcbuf_len = srcbuf.len(),
-            ))
-        })?;
-
-        // Write the payload
-        let (nwritten, control) = self
-            .buffered_writer
+        let (nwritten, control) = writer
            .write_buffered_borrowed_controlled(srcbuf, ctx)
            .await
            .map_err(|e| match e {
@@ -225,43 +224,69 @@ impl EphemeralFile {
            "buffered writer has no short writes"
        );

-        self.bytes_written = new_bytes_written;
+        // There's no realistic risk of overflow here. We won't have exabyte-sized files on disk.
+        let pos = self
+            .bytes_written
+            .fetch_add(srcbuf.len().into_u64(), Ordering::AcqRel);
+
+        let mut resource_units = self.resource_units.lock().unwrap();
+        resource_units.maybe_publish_size(self.bytes_written.load(Ordering::Relaxed));

        Ok((pos, control))
    }
+
+    /// Passes the current `bytes_written` value to `resource_units.publish_size` and returns its result.
+    pub(crate) fn tick(&self) -> Option<u64> {
+        let mut units = self.resource_units.lock().unwrap();
+        units.publish_size(self.bytes_written.load(Ordering::Relaxed))
+    }
 }

 impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile {
    async fn read_exact_at_eof_ok<B: IoBufAlignedMut + Send>(
        &self,
        start: u64,
-        dst: tokio_epoll_uring::Slice<B>,
+        mut dst: tokio_epoll_uring::Slice<B>,
        ctx: &RequestContext,
    ) -> std::io::Result<(tokio_epoll_uring::Slice<B>, usize)> {
-        let submitted_offset = self.buffered_writer.bytes_submitted();
+        // We will fill the slice in back to front. Hence, we need
+        // the slice to be fully initialized.
+        // TODO(vlad): Is there a nicer way of doing this?
+        dst.as_mut_rust_slice_full_zeroed();

-        let mutable = match self.buffered_writer.inspect_mutable() {
-            Some(mutable) => &mutable[0..mutable.pending()],
-            None => {
-                // Timeline::cancel and hence buffered writer flush was cancelled.
-                // Remain read-available while timeline is shutting down.
-                &[]
-            }
-        };
+        let writer = self.buffered_writer.read().await;

-        let maybe_flushed = self.buffered_writer.inspect_maybe_flushed();
+        // Read bytes written while under lock. This is a hack to deal with concurrent
+        // writes updating the number of bytes written. `bytes_written` is not DIO aligned
+        // but we may end the read there.
+        //
+        // TODO(vlad): Feels like there's a nicer path where we align the end if it
+        // shoots over the end of the file.
+        let bytes_written = self.bytes_written.load(Ordering::Acquire);

        let dst_cap = dst.bytes_total().into_u64();
        let end = {
            // saturating_add is correct here because the max file size is u64::MAX, so,
            // if start + dst.len() > u64::MAX, then we know it will be a short read
            let mut end: u64 = start.saturating_add(dst_cap);
-            if end > self.bytes_written {
-                end = self.bytes_written;
+            if end > bytes_written {
+                end = bytes_written;
            }
            end
        };

+        let submitted_offset = writer.bytes_submitted();
+        let maybe_flushed = writer.inspect_maybe_flushed();
+
+        let mutable = match writer.inspect_mutable() {
+            Some(mutable) => &mutable[0..mutable.pending()],
+            None => {
+                // Timeline::cancel and hence buffered writer flush was cancelled.
+                // Remain read-available while timeline is shutting down.
+                &[]
+            }
+        };
+
        // inclusive, exclusive
        #[derive(Debug)]
        struct Range<N>(N, N);
@@ -306,13 +331,33 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral

        let mutable_range = Range(std::cmp::max(start, submitted_offset), end);

-        let dst = if written_range.len() > 0 {
+        // There are three sources from which we might have to read data:
+        // 1. The file itself
+        // 2. The buffer which contains changes currently being flushed
+        // 3. The buffer which contains changes yet to be flushed
+        //
+        // For better concurrency, we do them in reverse order: perform the in-memory
+        // reads while holding the writer lock, drop the writer lock and read from the
+        // file if required.
+
+        let dst = if mutable_range.len() > 0 {
+            let offset_in_buffer = mutable_range
+                .0
+                .checked_sub(submitted_offset)
+                .unwrap()
+                .into_usize();
+            let to_copy =
+                &mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())];
            let bounds = dst.bounds();
-            let slice = self
-                .file
-                .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
-                .await?;
-            Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
+            let mut view = dst.slice({
+                let start =
+                    written_range.len().into_usize() + maybe_flushed_range.len().into_usize();
+                let end = start.checked_add(mutable_range.len().into_usize()).unwrap();
+                start..end
+            });
+            view.as_mut_rust_slice_full_zeroed()
+                .copy_from_slice(to_copy);
+            Slice::from_buf_bounds(Slice::into_inner(view), bounds)
        } else {
            dst
        };
@@ -342,24 +387,15 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
            dst
        };

-        let dst = if mutable_range.len() > 0 {
-            let offset_in_buffer = mutable_range
-                .0
-                .checked_sub(submitted_offset)
-                .unwrap()
-                .into_usize();
-            let to_copy =
-                &mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())];
+        drop(writer);
+
+        let dst = if written_range.len() > 0 {
            let bounds = dst.bounds();
-            let mut view = dst.slice({
-                let start =
-                    written_range.len().into_usize() + maybe_flushed_range.len().into_usize();
-                let end = start.checked_add(mutable_range.len().into_usize()).unwrap();
-                start..end
-            });
-            view.as_mut_rust_slice_full_zeroed()
-                .copy_from_slice(to_copy);
-            Slice::from_buf_bounds(Slice::into_inner(view), bounds)
+            let slice = self
+                .file
+                .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
+                .await?;
+            Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
        } else {
            dst
        };
@@ -460,13 +496,15 @@ mod tests {
        let gate = utils::sync::gate::Gate::default();
        let cancel = CancellationToken::new();

-        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
+        let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
            .await
            .unwrap();

-        let mutable = file.buffered_writer.mutable();
+        let writer = file.buffered_writer.read().await;
+        let mutable = writer.mutable();
        let cap = mutable.capacity();
        let align = mutable.align();
+        drop(writer);

        let write_nbytes = cap * 2 + cap / 2;

@@ -504,10 +542,11 @@ mod tests {
        let file_contents = std::fs::read(file.file.path()).unwrap();
        assert!(file_contents == content[0..cap * 2]);

-        let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap();
+        let writer = file.buffered_writer.read().await;
+        let maybe_flushed_buffer_contents = writer.inspect_maybe_flushed().unwrap();
        assert_eq!(&maybe_flushed_buffer_contents[..], &content[cap..cap * 2]);

-        let mutable_buffer_contents = file.buffered_writer.mutable();
+        let mutable_buffer_contents = writer.mutable();
        assert_eq!(mutable_buffer_contents, &content[cap * 2..write_nbytes]);
    }

@@ -517,12 +556,14 @@ mod tests {

        let gate = utils::sync::gate::Gate::default();
        let cancel = CancellationToken::new();
-        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
+        let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
            .await
            .unwrap();

        // mutable buffer and maybe_flushed buffer each has `cap` bytes.
-        let cap = file.buffered_writer.mutable().capacity();
+        let writer = file.buffered_writer.read().await;
+        let cap = writer.mutable().capacity();
+        drop(writer);

        let content: Vec<u8> = rand::thread_rng()
            .sample_iter(rand::distributions::Standard)
@@ -540,12 +581,13 @@ mod tests {
            2 * cap.into_u64(),
            "buffered writer requires one write to be flushed if we write 2.5x buffer capacity"
        );
+        let writer = file.buffered_writer.read().await;
        assert_eq!(
-            &file.buffered_writer.inspect_maybe_flushed().unwrap()[0..cap],
+            &writer.inspect_maybe_flushed().unwrap()[0..cap],
            &content[cap..cap * 2]
        );
        assert_eq!(
-            &file.buffered_writer.mutable()[0..cap / 2],
+            &writer.mutable()[0..cap / 2],
            &content[cap * 2..cap * 2 + cap / 2]
        );
    }
@@ -563,13 +605,15 @@ mod tests {
        let gate = utils::sync::gate::Gate::default();
        let cancel = CancellationToken::new();

-        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
+        let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &cancel, &ctx)
            .await
            .unwrap();

-        let mutable = file.buffered_writer.mutable();
+        let writer = file.buffered_writer.read().await;
+        let mutable = writer.mutable();
        let cap = mutable.capacity();
        let align = mutable.align();
+        drop(writer);
        let content: Vec<u8> = rand::thread_rng()
            .sample_iter(rand::distributions::Standard)
            .take(cap * 2 + cap / 2)
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -18,7 +18,6 @@
 //! [`IndexPart`]: super::remote_timeline_client::index::IndexPart

 use anyhow::ensure;
-use postgres_ffi::PgMajorVersion;
 use serde::{Deserialize, Serialize};
 use utils::bin_ser::{BeSer, SerializeError};
 use utils::id::TimelineId;
@@ -137,7 +136,7 @@ struct TimelineMetadataBodyV2 {
    latest_gc_cutoff_lsn: Lsn,

    initdb_lsn: Lsn,
-    pg_version: PgMajorVersion,
+    pg_version: u32,
 }

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -168,7 +167,7 @@ impl TimelineMetadata {
        ancestor_lsn: Lsn,
        latest_gc_cutoff_lsn: Lsn,
        initdb_lsn: Lsn,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
    ) -> Self {
        Self {
            hdr: TimelineMetadataHeader {
@@ -216,7 +215,7 @@ impl TimelineMetadata {
            ancestor_lsn: body.ancestor_lsn,
            latest_gc_cutoff_lsn: body.latest_gc_cutoff_lsn,
            initdb_lsn: body.initdb_lsn,
-            pg_version: PgMajorVersion::PG14, // All timelines created before this version had pg_version 14
+            pg_version: 14, // All timelines created before this version had pg_version 14
        };

        hdr.format_version = METADATA_FORMAT_VERSION;
@@ -318,7 +317,7 @@ impl TimelineMetadata {
        self.body.initdb_lsn
    }

-    pub fn pg_version(&self) -> PgMajorVersion {
+    pub fn pg_version(&self) -> u32 {
        self.body.pg_version
    }

@@ -332,7 +331,7 @@ impl TimelineMetadata {
            Lsn::from_hex("00000000").unwrap(),
            Lsn::from_hex("00000000").unwrap(),
            Lsn::from_hex("00000000").unwrap(),
-            PgMajorVersion::PG14,
+            0,
        );
        let bytes = instance.to_bytes().unwrap();
        Self::from_bytes(&bytes).unwrap()
@@ -546,7 +545,7 @@ mod tests {
            Lsn(0),
            Lsn(0),
            Lsn(0),
-            PgMajorVersion::PG14, // All timelines created before this version had pg_version 14
+            14, // All timelines created before this version had pg_version 14
        );

        assert_eq!(
@@ -567,7 +566,7 @@ mod tests {
            Lsn(0),
            // Updating this version to 17 will cause the test to fail at the
            // next assert_eq!().
-            PgMajorVersion::PG16,
+            16,
        );
        let expected_bytes = vec![
            /* TimelineMetadataHeader */
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -427,8 +427,8 @@ impl GcBlocking {

 #[cfg(test)]
 mod tests {
-    use postgres_ffi::PgMajorVersion;
    use std::str::FromStr;
+
    use utils::id::TimelineId;

    use super::*;
@@ -831,7 +831,7 @@ mod tests {
                Lsn::INVALID,
                Lsn::from_str("0/1696070").unwrap(),
                Lsn::from_str("0/1696070").unwrap(),
-                PgMajorVersion::PG14,
+                14,
            ).with_recalculated_checksum().unwrap(),
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            archived_at: None,
@@ -893,7 +893,7 @@ mod tests {
                Lsn::INVALID,
                Lsn::from_str("0/1696070").unwrap(),
                Lsn::from_str("0/1696070").unwrap(),
-                PgMajorVersion::PG14,
+                14,
            ).with_recalculated_checksum().unwrap(),
            deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")),
            archived_at: Some(parse_naive_datetime("2023-04-29T09:00:00.123000000")),
@@ -957,7 +957,7 @@ mod tests {
                Lsn::INVALID,
                Lsn::from_str("0/1696070").unwrap(),
                Lsn::from_str("0/1696070").unwrap(),
-                PgMajorVersion::PG14,
+                14,
            ).with_recalculated_checksum().unwrap(),
            deleted_at: None,
            lineage: Default::default(),
@@ -1033,7 +1033,7 @@ mod tests {
                Lsn::INVALID,
                Lsn::from_str("0/1696070").unwrap(),
                Lsn::from_str("0/1696070").unwrap(),
-                PgMajorVersion::PG14,
+                14,
            ).with_recalculated_checksum().unwrap(),
            deleted_at: None,
            lineage: Default::default(),
@@ -1114,7 +1114,7 @@ mod tests {
                Lsn::INVALID,
                Lsn::from_str("0/1696070").unwrap(),
                Lsn::from_str("0/1696070").unwrap(),
-                PgMajorVersion::PG14,
+                14,
            ).with_recalculated_checksum().unwrap(),
            deleted_at: None,
            lineage: Default::default(),
@@ -1199,7 +1199,7 @@ mod tests {
                Lsn::INVALID,
                Lsn::from_str("0/1696070").unwrap(),
                Lsn::from_str("0/1696070").unwrap(),
-                PgMajorVersion::PG14,
+                14,
            ).with_recalculated_checksum().unwrap(),
            deleted_at: None,
            lineage: Default::default(),
@@ -1287,7 +1287,7 @@ mod tests {
                Lsn::INVALID,
                Lsn::from_str("0/1696070").unwrap(),
                Lsn::from_str("0/1696070").unwrap(),
-                PgMajorVersion::PG14,
+                14,
            ).with_recalculated_checksum().unwrap(),
            deleted_at: None,
            lineage: Default::default(),
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -109,7 +109,7 @@ pub(crate) enum OnDiskValue {

 /// Reconstruct data accumulated for a single key during a vectored get
 #[derive(Debug, Default)]
-pub(crate) struct VectoredValueReconstructState {
+pub struct VectoredValueReconstructState {
    pub(crate) on_disk_values: Vec<(Lsn, OnDiskValueIoWaiter)>,

    pub(crate) situation: ValueReconstructSituation,
@@ -244,13 +244,60 @@ impl VectoredValueReconstructState {

        res
    }
+
+    /// Benchmarking utility: waits for all pending IOs of this value and discards the data.
+    /// Returns the first IO error, if any. Panics if a wait fails before any IO error was seen.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// Technically fine to stop polling this future, but the IOs will still be executed to
+    /// completion by the sidecar task and hold on to / consume resources; better not do it.
+    #[cfg(feature = "benchmarking")]
+    pub async fn sink_pending_ios(self) -> Result<(), std::io::Error> {
+        let mut res = Ok(());
+
+        // We should try hard not to bail early, so that by the time we return from this
+        // function, all IO for this value is done. It's not required -- we could totally
+        // stop polling the IO futures in the sidecar task, they need to support that,
+        // but ceasing to poll doesn't reduce the IO load on the disk. It's easier
+        // to reason about the system if we just wait for all IO to complete, even if
+        // we're no longer interested in the result.
+        //
+        // Revisit this when IO futures are replaced with a more sophisticated IO system
+        // and an IO scheduler, where we know which IOs were submitted and which ones
+        // are just queued. Cf. the comment on IoConcurrency::spawn_io.
+        for (_lsn, waiter) in self.on_disk_values {
+            let value_recv_res = waiter
+                .wait_completion()
+                // we rely on the caller to poll us to completion, so this is not a bail point
+                .await;
+
+            match (&mut res, value_recv_res) {
+                (Err(_), _) => {
+                    // Already failed: keep the first error, but keep awaiting the remaining IOs.
+                }
+                (Ok(_), Err(_wait_err)) => {
+                    // This shouldn't happen - likely the sidecar task panicked.
+                    unreachable!();
+                }
+                (Ok(_), Ok(Err(err))) => {
+                    let err: std::io::Error = err; // annotation only: pins the error type
+                    res = Err(err);
+                }
+                (Ok(_ok), Ok(Ok(OnDiskValue::RawImage(_img)))) => {}
+                (Ok(_ok), Ok(Ok(OnDiskValue::WalRecordOrImage(_buf)))) => {}
+            }
+        }
+
+        res
+    }
 }

 /// Bag of data accumulated during a vectored get..
-pub(crate) struct ValuesReconstructState {
+pub struct ValuesReconstructState {
    /// The keys will be removed after `get_vectored` completes. The caller outside `Timeline`
    /// should not expect to get anything from this hashmap.
-    pub(crate) keys: HashMap<Key, VectoredValueReconstructState>,
+    pub keys: HashMap<Key, VectoredValueReconstructState>,
    /// The keys which are already retrieved
    keys_done: KeySpaceRandomAccum,

@@ -272,7 +319,7 @@ pub(crate) struct ValuesReconstructState {
 /// The desired end state is that we always do parallel IO.
 /// This struct and the dispatching in the impl will be removed once
 /// we've built enough confidence.
-pub(crate) enum IoConcurrency {
+pub enum IoConcurrency {
    Sequential,
    SidecarTask {
        task_id: usize,
@@ -317,10 +364,7 @@ impl IoConcurrency {
        Self::spawn(SelectedIoConcurrency::Sequential)
    }

-    pub(crate) fn spawn_from_conf(
-        conf: GetVectoredConcurrentIo,
-        gate_guard: GateGuard,
-    ) -> IoConcurrency {
+    pub fn spawn_from_conf(conf: GetVectoredConcurrentIo, gate_guard: GateGuard) -> IoConcurrency {
        let selected = match conf {
            GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential,
            GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard),
@@ -425,16 +469,6 @@ impl IoConcurrency {
        }
    }

-    pub(crate) fn clone(&self) -> Self {
-        match self {
-            IoConcurrency::Sequential => IoConcurrency::Sequential,
-            IoConcurrency::SidecarTask { task_id, ios_tx } => IoConcurrency::SidecarTask {
-                task_id: *task_id,
-                ios_tx: ios_tx.clone(),
-            },
-        }
-    }
-
    /// Submit an IO to be executed in the background. DEADLOCK RISK, read the full doc string.
    ///
    /// The IO is represented as an opaque future.
@@ -573,6 +607,18 @@ impl IoConcurrency {
    }
 }

+impl Clone for IoConcurrency {
+    fn clone(&self) -> Self {
+        match self {
+            IoConcurrency::Sequential => IoConcurrency::Sequential,
+            IoConcurrency::SidecarTask { task_id, ios_tx } => IoConcurrency::SidecarTask {
+                task_id: *task_id, // copied by value; the clone reports the same task_id
+                ios_tx: ios_tx.clone(), // presumably a cheap handle clone -- TODO confirm
+            },
+        }
+    }
+}
+
 /// Make noise in case the [`ValuesReconstructState`] gets dropped while
 /// there are still IOs in flight.
 /// Refer to `collect_pending_ios` for why we prefer not to do that.
@@ -603,7 +649,7 @@ impl Drop for ValuesReconstructState {
 }

 impl ValuesReconstructState {
-    pub(crate) fn new(io_concurrency: IoConcurrency) -> Self {
+    pub fn new(io_concurrency: IoConcurrency) -> Self {
        Self {
            keys: HashMap::new(),
            keys_done: KeySpaceRandomAccum::new(),
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -1622,6 +1622,11 @@ impl DeltaLayerIterator<'_> {
 pub(crate) mod test {
    use std::collections::BTreeMap;

+    use bytes::Bytes;
+    use itertools::MinMaxResult;
+    use rand::prelude::{SeedableRng, SliceRandom, StdRng};
+    use rand::{Rng, RngCore};
+
    use super::*;
    use crate::DEFAULT_PG_VERSION;
    use crate::context::DownloadBehavior;
@@ -1631,11 +1636,6 @@ pub(crate) mod test {
    use crate::tenant::storage_layer::{Layer, ResidentLayer};
    use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
    use crate::tenant::{TenantShard, Timeline};
-    use bytes::Bytes;
-    use itertools::MinMaxResult;
-    use postgres_ffi::PgMajorVersion;
-    use rand::prelude::{SeedableRng, SliceRandom, StdRng};
-    use rand::{Rng, RngCore};

    /// Construct an index for a fictional delta layer and and then
    /// traverse in order to plan vectored reads for a query. Finally,
@@ -1995,7 +1995,7 @@ pub(crate) mod test {
        let (tenant, ctx) = h.load().await;
        let ctx = &ctx;
        let timeline = tenant
-            .create_test_timeline(TimelineId::generate(), Lsn(0x10), PgMajorVersion::PG14, ctx)
+            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx)
            .await
            .unwrap();
        let ctx = &ctx.with_scope_timeline(&timeline);
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -70,23 +70,15 @@ pub struct InMemoryLayer {
    /// We use a separate lock for the index to reduce the critical section
    /// during which reads cannot be planned.
    ///
-    /// If you need access to both the index and the underlying file at the same time,
-    /// respect the following locking order to avoid deadlocks:
-    /// 1. [`InMemoryLayer::inner`]
-    /// 2. [`InMemoryLayer::index`]
-    ///
-    /// Note that the file backing [`InMemoryLayer::inner`] is append-only,
-    /// so it is not necessary to hold simultaneous locks on index.
-    /// This avoids holding index locks across IO, and is crucial for avoiding read tail latency.
+    /// Note that the file backing [`InMemoryLayer::file`] is append-only,
+    /// so the index lock need not be held while reading from or writing to the file.
    /// In particular:
-    /// 1. It is safe to read and release [`InMemoryLayer::index`] before locking and reading from [`InMemoryLayer::inner`].
-    /// 2. It is safe to write and release [`InMemoryLayer::inner`] before locking and updating [`InMemoryLayer::index`].
+    /// 1. It is safe to read and release [`InMemoryLayer::index`] before reading from [`InMemoryLayer::file`].
+    /// 2. It is safe to write to [`InMemoryLayer::file`] before locking and updating [`InMemoryLayer::index`].
    index: RwLock<BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>>,

-    /// The above fields never change, except for `end_lsn`, which is only set once,
-    /// and `index` (see rationale there).
-    /// All other changing parts are in `inner`, and protected by a mutex.
-    inner: RwLock<InMemoryLayerInner>,
+    /// Wrapper for the actual on-disk file. Uses interior mutability for concurrent reads/writes.
+    file: EphemeralFile,

    estimated_in_mem_size: AtomicU64,
 }
@@ -96,20 +88,10 @@ impl std::fmt::Debug for InMemoryLayer {
        f.debug_struct("InMemoryLayer")
            .field("start_lsn", &self.start_lsn)
            .field("end_lsn", &self.end_lsn)
-            .field("inner", &self.inner)
            .finish()
    }
 }

-pub struct InMemoryLayerInner {
-    /// The values are stored in a serialized format in this file.
-    /// Each serialized Value is preceded by a 'u32' length field.
-    /// PerSeg::page_versions map stores offsets into this file.
-    file: EphemeralFile,
-
-    resource_units: GlobalResourceUnits,
-}
-
 /// Support the same max blob length as blob_io, because ultimately
 /// all the InMemoryLayer contents end up being written into a delta layer,
 /// using the [`crate::tenant::blob_io`].
@@ -258,12 +240,6 @@ struct IndexEntryUnpacked {
    pos: u64,
 }

-impl std::fmt::Debug for InMemoryLayerInner {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("InMemoryLayerInner").finish()
-    }
-}
-
 /// State shared by all in-memory (ephemeral) layers.  Updated infrequently during background ticks in Timeline,
 /// to minimize contention.
 ///
@@ -280,7 +256,7 @@ pub(crate) struct GlobalResources {
 }

 // Per-timeline RAII struct for its contribution to [`GlobalResources`]
-struct GlobalResourceUnits {
+pub(crate) struct GlobalResourceUnits {
    // How many dirty bytes have I added to the global dirty_bytes: this guard object is responsible
    // for decrementing the global counter by this many bytes when dropped.
    dirty_bytes: u64,
@@ -292,7 +268,7 @@ impl GlobalResourceUnits {
    // updated when the Timeline "ticks" in the background.
    const MAX_SIZE_DRIFT: u64 = 10 * 1024 * 1024;

-    fn new() -> Self {
+    pub(crate) fn new() -> Self {
        GLOBAL_RESOURCES
            .dirty_layers
            .fetch_add(1, AtomicOrdering::Relaxed);
@@ -304,7 +280,7 @@ impl GlobalResourceUnits {
    ///
    /// Returns the effective layer size limit that should be applied, if any, to keep
    /// the total number of dirty bytes below the configured maximum.
-    fn publish_size(&mut self, size: u64) -> Option<u64> {
+    pub(crate) fn publish_size(&mut self, size: u64) -> Option<u64> {
        let new_global_dirty_bytes = match size.cmp(&self.dirty_bytes) {
            Ordering::Equal => GLOBAL_RESOURCES.dirty_bytes.load(AtomicOrdering::Relaxed),
            Ordering::Greater => {
@@ -349,7 +325,7 @@ impl GlobalResourceUnits {

    // Call publish_size if the input size differs from last published size by more than
    // the drift limit
-    fn maybe_publish_size(&mut self, size: u64) {
+    pub(crate) fn maybe_publish_size(&mut self, size: u64) {
        let publish = match size.cmp(&self.dirty_bytes) {
            Ordering::Equal => false,
            Ordering::Greater => size - self.dirty_bytes > Self::MAX_SIZE_DRIFT,
@@ -398,8 +374,8 @@ impl InMemoryLayer {
        }
    }

-    pub(crate) fn try_len(&self) -> Option<u64> {
-        self.inner.try_read().map(|i| i.file.len()).ok()
+    pub(crate) fn len(&self) -> u64 {
+        self.file.len()
    }

    pub(crate) fn assert_writable(&self) {
@@ -430,7 +406,7 @@ impl InMemoryLayer {

    // Look up the keys in the provided keyspace and update
    // the reconstruct state with whatever is found.
-    pub(crate) async fn get_values_reconstruct_data(
+    pub async fn get_values_reconstruct_data(
        self: &Arc<InMemoryLayer>,
        keyspace: KeySpace,
        lsn_range: Range<Lsn>,
@@ -479,14 +455,13 @@ impl InMemoryLayer {
                }
            }
        }
-        drop(index); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
+        drop(index); // release the lock before we spawn the IO
        let read_from = Arc::clone(self);
        let read_ctx = ctx.attached_child();
        reconstruct_state
            .spawn_io(async move {
-                let inner = read_from.inner.read().await;
                let f = vectored_dio_read::execute(
-                    &inner.file,
+                    &read_from.file,
                    reads
                        .iter()
                        .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
@@ -518,7 +493,6 @@ impl InMemoryLayer {
                // This is kinda forced for InMemoryLayer because we need to inner.read() anyway,
                // but it's less obvious for DeltaLayer and ImageLayer. So, keep this explicit
                // drop for consistency among all three layer types.
-                drop(inner);
                drop(read_from);
            })
            .await;
@@ -549,12 +523,6 @@ impl std::fmt::Display for InMemoryLayer {
 }

 impl InMemoryLayer {
-    /// Get layer size.
-    pub async fn size(&self) -> Result<u64> {
-        let inner = self.inner.read().await;
-        Ok(inner.file.len())
-    }
-
    pub fn estimated_in_mem_size(&self) -> u64 {
        self.estimated_in_mem_size.load(AtomicOrdering::Relaxed)
    }
@@ -587,10 +555,7 @@ impl InMemoryLayer {
            end_lsn: OnceLock::new(),
            opened_at: Instant::now(),
            index: RwLock::new(BTreeMap::new()),
-            inner: RwLock::new(InMemoryLayerInner {
-                file,
-                resource_units: GlobalResourceUnits::new(),
-            }),
+            file,
            estimated_in_mem_size: AtomicU64::new(0),
        })
    }
@@ -599,41 +564,37 @@ impl InMemoryLayer {
    ///
    /// Errors are not retryable, the [`InMemoryLayer`] must be discarded, and not be read from.
    /// The reason why it's not retryable is that the [`EphemeralFile`] writes are not retryable.
+    ///
+    /// This method shall not be called concurrently. We enforce this property via [`crate::tenant::Timeline::write_lock`].
+    ///
    /// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors.
    pub async fn put_batch(
        &self,
        serialized_batch: SerializedValueBatch,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        let (base_offset, metadata) = {
-            let mut inner = self.inner.write().await;
-            self.assert_writable();
+        self.assert_writable();

-            let base_offset = inner.file.len();
+        let base_offset = self.file.len();

-            let SerializedValueBatch {
-                raw,
-                metadata,
-                max_lsn: _,
-                len: _,
-            } = serialized_batch;
+        let SerializedValueBatch {
+            raw,
+            metadata,
+            max_lsn: _,
+            len: _,
+        } = serialized_batch;

-            // Write the batch to the file
-            inner.file.write_raw(&raw, ctx).await?;
-            let new_size = inner.file.len();
+        // Write the batch to the file
+        self.file.write_raw(&raw, ctx).await?;
+        let new_size = self.file.len();

-            let expected_new_len = base_offset
-                .checked_add(raw.len().into_u64())
-                // write_raw would error if we were to overflow u64.
-                // also IndexEntry and higher levels in
-                //the code don't allow the file to grow that large
-                .unwrap();
-            assert_eq!(new_size, expected_new_len);
-
-            inner.resource_units.maybe_publish_size(new_size);
-
-            (base_offset, metadata)
-        };
+        let expected_new_len = base_offset
+            .checked_add(raw.len().into_u64())
+            // write_raw would error if we were to overflow u64.
+            // also IndexEntry and higher levels in
+            //the code don't allow the file to grow that large
+            .unwrap();
+        assert_eq!(new_size, expected_new_len);

        // Update the index with the new entries
        let mut index = self.index.write().await;
@@ -686,10 +647,8 @@ impl InMemoryLayer {
        self.opened_at
    }

-    pub(crate) async fn tick(&self) -> Option<u64> {
-        let mut inner = self.inner.write().await;
-        let size = inner.file.len();
-        inner.resource_units.publish_size(size)
+    pub(crate) fn tick(&self) -> Option<u64> {
+        self.file.tick()
    }

    pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
@@ -753,12 +712,6 @@ impl InMemoryLayer {
        gate: &utils::sync::gate::Gate,
        cancel: CancellationToken,
    ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
-        // Grab the lock in read-mode. We hold it over the I/O, but because this
-        // layer is not writeable anymore, no one should be trying to acquire the
-        // write lock on it, so we shouldn't block anyone. See the comment on
-        // [`InMemoryLayer::freeze`] to understand how locking between the append path
-        // and layer flushing works.
-        let inner = self.inner.read().await;
        let index = self.index.read().await;

        use l0_flush::Inner;
@@ -793,7 +746,7 @@ impl InMemoryLayer {

        match l0_flush_global_state {
            l0_flush::Inner::Direct { .. } => {
-                let file_contents = inner.file.load_to_io_buf(ctx).await?;
+                let file_contents = self.file.load_to_io_buf(ctx).await?;
                let file_contents = file_contents.freeze();

                for (key, vec_map) in index.iter() {
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -1,7 +1,6 @@
 use std::time::UNIX_EPOCH;

 use pageserver_api::key::{CONTROLFILE_KEY, Key};
-use postgres_ffi::PgMajorVersion;
 use tokio::task::JoinSet;
 use utils::completion::{self, Completion};
 use utils::id::TimelineId;
@@ -46,7 +45,7 @@ async fn smoke_test() {
        .create_test_timeline_with_layers(
            TimelineId::generate(),
            Lsn(0x10),
-            PgMajorVersion::PG14,
+            14,
            &ctx,
            Default::default(), // in-memory layers
            Default::default(),
@@ -257,12 +256,7 @@ async fn evict_and_wait_on_wanted_deleted() {
    let (tenant, ctx) = h.load().await;

    let timeline = tenant
-        .create_test_timeline(
-            TimelineId::generate(),
-            Lsn(0x10),
-            PgMajorVersion::PG14,
-            &ctx,
-        )
+        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
        .await
        .unwrap();

@@ -347,12 +341,7 @@ fn read_wins_pending_eviction() {
        let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));

        let timeline = tenant
-            .create_test_timeline(
-                TimelineId::generate(),
-                Lsn(0x10),
-                PgMajorVersion::PG14,
-                &ctx,
-            )
+            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
            .await
            .unwrap();
        let ctx = ctx.with_scope_timeline(&timeline);
@@ -485,12 +474,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
        let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));

        let timeline = tenant
-            .create_test_timeline(
-                TimelineId::generate(),
-                Lsn(0x10),
-                PgMajorVersion::PG14,
-                &ctx,
-            )
+            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
            .await
            .unwrap();
        let ctx = ctx.with_scope_timeline(&timeline);
@@ -660,12 +644,7 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
    let (tenant, ctx) = h.load().await;

    let timeline = tenant
-        .create_test_timeline(
-            TimelineId::generate(),
-            Lsn(0x10),
-            PgMajorVersion::PG14,
-            &ctx,
-        )
+        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
        .await
        .unwrap();
    let ctx = ctx.with_scope_timeline(&timeline);
@@ -751,12 +730,7 @@ async fn evict_and_wait_does_not_wait_for_download() {
    let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));

    let timeline = tenant
-        .create_test_timeline(
-            TimelineId::generate(),
-            Lsn(0x10),
-            PgMajorVersion::PG14,
-            &ctx,
-        )
+        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
        .await
        .unwrap();
    let ctx = ctx.with_scope_timeline(&timeline);
@@ -862,12 +836,7 @@ async fn eviction_cancellation_on_drop() {
    let (tenant, ctx) = h.load().await;

    let timeline = tenant
-        .create_test_timeline(
-            TimelineId::generate(),
-            Lsn(0x10),
-            PgMajorVersion::PG14,
-            &ctx,
-        )
+        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
        .await
        .unwrap();

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -58,7 +58,7 @@ use pageserver_api::reltag::{BlockNumber, RelTag};
 use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId};
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::v14::xlog_utils;
-use postgres_ffi::{PgMajorVersion, WAL_SEGMENT_SIZE, to_pg_timestamp};
+use postgres_ffi::{WAL_SEGMENT_SIZE, to_pg_timestamp};
 use rand::Rng;
 use remote_storage::DownloadError;
 use serde_with::serde_as;
@@ -225,7 +225,7 @@ pub struct Timeline {
    /// to shards, and is constant through the lifetime of this Timeline.
    shard_identity: ShardIdentity,

-    pub pg_version: PgMajorVersion,
+    pub pg_version: u32,

    /// The tuple has two elements.
    /// 1. `LayerFileManager` keeps track of the various physical representations of the layer files (inmem, local, remote).
@@ -816,7 +816,7 @@ impl From<layer_manager::Shutdown> for FlushLayerError {
 }

 #[derive(thiserror::Error, Debug)]
-pub(crate) enum GetVectoredError {
+pub enum GetVectoredError {
    #[error("timeline shutting down")]
    Cancelled,

@@ -849,7 +849,7 @@ impl From<GetReadyAncestorError> for GetVectoredError {
 }

 #[derive(thiserror::Error, Debug)]
-pub(crate) enum GetReadyAncestorError {
+pub enum GetReadyAncestorError {
    #[error("ancestor LSN wait error")]
    AncestorLsnTimeout(#[from] WaitLsnError),

@@ -939,7 +939,7 @@ impl std::fmt::Debug for Timeline {
 }

 #[derive(thiserror::Error, Debug, Clone)]
-pub(crate) enum WaitLsnError {
+pub enum WaitLsnError {
    // Called on a timeline which is shutting down
    #[error("Shutdown")]
    Shutdown,
@@ -1902,16 +1902,11 @@ impl Timeline {
            return;
        };

-        let Some(current_size) = open_layer.try_len() else {
-            // Unexpected: since we hold the write guard, nobody else should be writing to this layer, so
-            // read lock to get size should always succeed.
-            tracing::warn!("Lock conflict while reading size of open layer");
-            return;
-        };
+        let current_size = open_layer.len();

        let current_lsn = self.get_last_record_lsn();

-        let checkpoint_distance_override = open_layer.tick().await;
+        let checkpoint_distance_override = open_layer.tick();

        if let Some(size_override) = checkpoint_distance_override {
            if current_size > size_override {
@@ -2918,7 +2913,7 @@ impl Timeline {
        shard_identity: ShardIdentity,
        walredo_mgr: Option<Arc<super::WalRedoManager>>,
        resources: TimelineResources,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
        state: TimelineState,
        attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
        create_idempotency: crate::tenant::CreateTimelineIdempotency,
@@ -7372,7 +7367,7 @@ impl TimelineWriter<'_> {
            .tl
            .get_layer_for_write(at, &self.write_guard, ctx)
            .await?;
-        let initial_size = layer.size().await?;
+        let initial_size = layer.len();

        let last_freeze_at = self.last_freeze_at.load();
        self.write_guard.replace(TimelineWriterState::new(
@@ -7600,7 +7595,6 @@ mod tests {
    use std::sync::Arc;

    use pageserver_api::key::Key;
-    use postgres_ffi::PgMajorVersion;
    use std::iter::Iterator;
    use tracing::Instrument;
    use utils::id::TimelineId;
@@ -7675,7 +7669,7 @@ mod tests {
            .create_test_timeline_with_layers(
                TimelineId::generate(),
                Lsn(0x10),
-                PgMajorVersion::PG14,
+                14,
                &ctx,
                Vec::new(), // in-memory layers
                delta_layers,
@@ -7811,7 +7805,7 @@ mod tests {
            .create_test_timeline_with_layers(
                TimelineId::generate(),
                Lsn(0x10),
-                PgMajorVersion::PG14,
+                14,
                &ctx,
                Vec::new(), // in-memory layers
                delta_layers,
@@ -7871,12 +7865,7 @@ mod tests {

        let (tenant, ctx) = harness.load().await;
        let timeline = tenant
-            .create_test_timeline(
-                TimelineId::generate(),
-                Lsn(0x10),
-                PgMajorVersion::PG14,
-                &ctx,
-            )
+            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
            .await
            .unwrap();

--- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
@@ -3,7 +3,7 @@ use std::sync::Arc;

 use anyhow::Context;
 use bytes::Bytes;
-use postgres_ffi::{ControlFileData, PgMajorVersion};
+use postgres_ffi::ControlFileData;
 use remote_storage::{
    Download, DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, Listing,
    ListingObject, RemotePath, RemoteStorageConfig,
@@ -264,7 +264,7 @@ impl ControlFile {
    pub(crate) fn base_lsn(&self) -> Lsn {
        Lsn(self.control_file_data.checkPoint).align()
    }
-    pub(crate) fn pg_version(&self) -> PgMajorVersion {
+    pub(crate) fn pg_version(&self) -> u32 {
        self.try_pg_version()
            .expect("prepare() checks that try_pg_version doesn't error")
    }
@@ -274,14 +274,13 @@ impl ControlFile {
    pub(crate) fn control_file_buf(&self) -> &Bytes {
        &self.control_file_buf
    }
-
-    fn try_pg_version(&self) -> anyhow::Result<PgMajorVersion> {
+    fn try_pg_version(&self) -> anyhow::Result<u32> {
        Ok(match self.control_file_data.catalog_version_no {
            // these are from catversion.h
-            202107181 => PgMajorVersion::PG14,
-            202209061 => PgMajorVersion::PG15,
-            202307071 => PgMajorVersion::PG16,
-            202406281 => PgMajorVersion::PG17,
+            202107181 => 14,
+            202209061 => 15,
+            202307071 => 16,
+            202406281 => 17,
            catversion => {
                anyhow::bail!("unrecognized catalog version {catversion}")
            }
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -32,8 +32,8 @@ use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::walrecord::*;
 use postgres_ffi::{
-    PgMajorVersion, TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion,
-    enum_pgversion_dispatch, fsm_logical_to_physical, pg_constants,
+    TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch,
+    fsm_logical_to_physical, pg_constants,
 };
 use postgres_ffi_types::forknum::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
 use tracing::*;
@@ -781,7 +781,7 @@ impl WalIngest {
    ) -> Result<(), WalIngestError> {
        let (xact_common, is_commit, is_prepared) = match record {
            XactRecord::Prepare(XactPrepare { xl_xid, data }) => {
-                let xid: u64 = if modification.tline.pg_version >= PgMajorVersion::PG17 {
+                let xid: u64 = if modification.tline.pg_version >= 17 {
                    self.adjust_to_full_transaction_id(xl_xid)?
                } else {
                    xl_xid as u64
@@ -886,7 +886,7 @@ impl WalIngest {
                xl_xid, parsed.xid, lsn,
            );

-            let xid: u64 = if modification.tline.pg_version >= PgMajorVersion::PG17 {
+            let xid: u64 = if modification.tline.pg_version >= 17 {
                self.adjust_to_full_transaction_id(parsed.xid)?
            } else {
                parsed.xid as u64
@@ -1241,7 +1241,7 @@ impl WalIngest {
                if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID
                    && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
                {
-                    let oldest_active_xid = if pg_version >= PgMajorVersion::PG17 {
+                    let oldest_active_xid = if pg_version >= 17 {
                        let mut oldest_active_full_xid = cp.nextXid.value;
                        for xid in modification.tline.list_twophase_files(lsn, ctx).await? {
                            if xid < oldest_active_full_xid {
@@ -1475,11 +1475,10 @@ impl WalIngest {

                    const fn rate_limiter(
                        &self,
-                        pg_version: PgMajorVersion,
+                        pg_version: u32,
                    ) -> Option<&Lazy<Mutex<RateLimit>>> {
-                        const MIN_PG_VERSION: u32 = PgMajorVersion::PG14.major_version_num();
-                        const MAX_PG_VERSION: u32 = PgMajorVersion::PG17.major_version_num();
-                        let pg_version = pg_version.major_version_num();
+                        const MIN_PG_VERSION: u32 = 14;
+                        const MAX_PG_VERSION: u32 = 17;

                        if pg_version < MIN_PG_VERSION || pg_version > MAX_PG_VERSION {
                            return None;
@@ -1604,7 +1603,6 @@ async fn get_relsize(
 #[cfg(test)]
 mod tests {
    use anyhow::Result;
-    use postgres_ffi::PgMajorVersion;
    use postgres_ffi::RELSEG_SIZE;

    use super::*;
@@ -1627,7 +1625,7 @@ mod tests {

    #[tokio::test]
    async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> {
-        for i in PgMajorVersion::ALL {
+        for i in 14..=16 {
            dispatch_pgversion!(i, {
                pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?;
            });
@@ -2337,7 +2335,7 @@ mod tests {
        // 5. Grep sk logs for "restart decoder" to get startpoint
        // 6. Run just the decoder from this test to get the endpoint.
        //    It's the last LSN the decoder will output.
-        let pg_version = PgMajorVersion::PG15; // The test data was generated by pg15
+        let pg_version = 15; // The test data was generated by pg15
        let path = "test_data/sk_wal_segment_from_pgbench";
        let wal_segment_path = format!("{path}/000000010000000000000001.zst");
        let source_initdb_path = format!("{path}/{INITDB_PATH}");
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -33,7 +33,6 @@ use bytes::{Bytes, BytesMut};
 use pageserver_api::key::Key;
 use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
 use pageserver_api::shard::TenantShardId;
-use postgres_ffi::PgMajorVersion;
 use tracing::*;
 use utils::lsn::Lsn;
 use utils::sync::gate::GateError;
@@ -166,7 +165,7 @@ impl PostgresRedoManager {
        lsn: Lsn,
        base_img: Option<(Lsn, Bytes)>,
        records: Vec<(Lsn, NeonWalRecord)>,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
        redo_attempt_type: RedoAttemptType,
    ) -> Result<Bytes, Error> {
        if records.is_empty() {
@@ -233,7 +232,7 @@ impl PostgresRedoManager {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
-    pub async fn ping(&self, pg_version: PgMajorVersion) -> Result<(), Error> {
+    pub async fn ping(&self, pg_version: u32) -> Result<(), Error> {
        self.do_with_walredo_process(pg_version, |proc| async move {
            proc.ping(Duration::from_secs(1))
                .await
@@ -343,7 +342,7 @@ impl PostgresRedoManager {
        O,
    >(
        &self,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
        closure: F,
    ) -> Result<O, Error> {
        let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
@@ -443,7 +442,7 @@ impl PostgresRedoManager {
        base_img_lsn: Lsn,
        records: &[(Lsn, NeonWalRecord)],
        wal_redo_timeout: Duration,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
        max_retry_attempts: u32,
    ) -> Result<Bytes, Error> {
        *(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
@@ -573,7 +572,6 @@ mod tests {
    use bytes::Bytes;
    use pageserver_api::key::Key;
    use pageserver_api::shard::TenantShardId;
-    use postgres_ffi::PgMajorVersion;
    use tracing::Instrument;
    use utils::id::TenantId;
    use utils::lsn::Lsn;
@@ -588,7 +586,7 @@ mod tests {
        let h = RedoHarness::new().unwrap();

        h.manager
-            .ping(PgMajorVersion::PG14)
+            .ping(14)
            .instrument(h.span())
            .await
            .expect("ping should work");
@@ -614,7 +612,7 @@ mod tests {
                Lsn::from_str("0/16E2408").unwrap(),
                None,
                short_records(),
-                PgMajorVersion::PG14,
+                14,
                RedoAttemptType::ReadPage,
            )
            .instrument(h.span())
@@ -643,7 +641,7 @@ mod tests {
                Lsn::from_str("0/16E2408").unwrap(),
                None,
                short_records(),
-                PgMajorVersion::PG14,
+                14,
                RedoAttemptType::ReadPage,
            )
            .instrument(h.span())
@@ -665,7 +663,7 @@ mod tests {
                Lsn::INVALID,
                None,
                short_records(),
-                PgMajorVersion::PG16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
+                16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
                RedoAttemptType::ReadPage,
            )
            .instrument(h.span())
--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -12,7 +12,7 @@ use anyhow::Context;
 use bytes::Bytes;
 use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::TenantShardId;
-use postgres_ffi::{BLCKSZ, PgMajorVersion};
+use postgres_ffi::BLCKSZ;
 use tokio::io::{AsyncReadExt, AsyncWriteExt};
 use tracing::{Instrument, debug, error, instrument};
 use utils::lsn::Lsn;
@@ -54,11 +54,11 @@ impl WalRedoProcess {
    //
    // Start postgres binary in special WAL redo mode.
    //
-    #[instrument(skip_all,fields(pg_version=pg_version.major_version_num()))]
+    #[instrument(skip_all,fields(pg_version=pg_version))]
    pub(crate) fn launch(
        conf: &'static PageServerConf,
        tenant_shard_id: TenantShardId,
-        pg_version: PgMajorVersion,
+        pg_version: u32,
    ) -> anyhow::Result<Self> {
        crate::span::debug_assert_current_span_has_tenant_id();

--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -1,10 +1,10 @@
 # pgxn/neon/Makefile

-
 MODULE_big = neon
 OBJS = \
 	$(WIN32RES) \
 	communicator.o \
+	communicator_new.o \
 	extension_server.o \
 	file_cache.o \
 	hll.o \
@@ -22,12 +22,18 @@ OBJS = \
 	walproposer.o \
 	walproposer_pg.o \
 	neon_ddl_handler.o \
-	walsender_hooks.o
+	walsender_hooks.o \
+	$(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a

 PG_CPPFLAGS = -I$(libpq_srcdir)
 SHLIB_LINK_INTERNAL = $(libpq)
 SHLIB_LINK = -lcurl

+UNAME_S := $(shell uname -s)
+ifeq ($(UNAME_S), Darwin)
+    SHLIB_LINK += -framework Security -framework CoreFoundation -framework SystemConfiguration
+endif
+
 EXTENSION = neon
 DATA = \
 	neon--1.0.sql \
@@ -54,6 +60,17 @@ WALPROP_OBJS = \
 	neon_utils.o \
 	walproposer_compat.o

+# libcommunicator.a is built by cargo from the Rust sources under the
+# communicator/ subdirectory. `cargo build` also generates communicator_bindings.h.
+neon.o: communicator/communicator_bindings.h
+
+$(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h &:
+	(cd $(srcdir)/communicator && cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE))
+
+# Force `cargo build` every time. Some of the Rust sources might have
+# changed.
+.PHONY: $(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h
+
 .PHONY: walproposer-lib
 walproposer-lib: CPPFLAGS += -DWALPROPOSER_LIB
 walproposer-lib: libwalproposer.a;
--- a/pgxn/neon/communicator/Cargo.lock
+++ b/pgxn/neon/communicator/Cargo.lock
@@ -0,0 +1,372 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "addr2line"
+version = "0.24.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1"
+dependencies = [
+ "gimli",
+]
+
+[[package]]
+name = "adler2"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
+
+[[package]]
+name = "backtrace"
+version = "0.3.74"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a"
+dependencies = [
+ "addr2line",
+ "cfg-if",
+ "libc",
+ "miniz_oxide",
+ "object",
+ "rustc-demangle",
+ "windows-targets",
+]
+
+[[package]]
+name = "base64"
+version = "0.22.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
+
+[[package]]
+name = "bytes"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "communicator"
+version = "0.1.0"
+dependencies = [
+ "tonic",
+]
+
+[[package]]
+name = "fnv"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+
+[[package]]
+name = "futures-core"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
+
+[[package]]
+name = "gimli"
+version = "0.31.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
+
+[[package]]
+name = "http"
+version = "1.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565"
+dependencies = [
+ "bytes",
+ "fnv",
+ "itoa",
+]
+
+[[package]]
+name = "http-body"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
+dependencies = [
+ "bytes",
+ "http",
+]
+
+[[package]]
+name = "http-body-util"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "http",
+ "http-body",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
+
+[[package]]
+name = "libc"
+version = "0.2.171"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6"
+
+[[package]]
+name = "memchr"
+version = "2.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
+
+[[package]]
+name = "miniz_oxide"
+version = "0.8.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff70ce3e48ae43fa075863cef62e8b43b71a4f2382229920e0df362592919430"
+dependencies = [
+ "adler2",
+]
+
+[[package]]
+name = "object"
+version = "0.36.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.21.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
+
+[[package]]
+name = "percent-encoding"
+version = "2.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
+
+[[package]]
+name = "pin-project"
+version = "1.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a"
+dependencies = [
+ "pin-project-internal",
+]
+
+[[package]]
+name = "pin-project-internal"
+version = "1.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "pin-project-lite"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.94"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "rustc-demangle"
+version = "0.1.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
+
+[[package]]
+name = "syn"
+version = "2.0.100"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "tokio"
+version = "1.44.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6b88822cbe49de4185e3a4cbf8321dd487cf5fe0c5c65695fef6346371e9c48"
+dependencies = [
+ "backtrace",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "tokio-stream"
+version = "0.1.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047"
+dependencies = [
+ "futures-core",
+ "pin-project-lite",
+ "tokio",
+]
+
+[[package]]
+name = "tonic"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85839f0b32fd242bb3209262371d07feda6d780d16ee9d2bc88581b89da1549b"
+dependencies = [
+ "base64",
+ "bytes",
+ "http",
+ "http-body",
+ "http-body-util",
+ "percent-encoding",
+ "pin-project",
+ "tokio-stream",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "tower-layer"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
+
+[[package]]
+name = "tower-service"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
+
+[[package]]
+name = "tracing"
+version = "0.1.41"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
+dependencies = [
+ "pin-project-lite",
+ "tracing-attributes",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-attributes"
+version = "0.1.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tracing-core"
+version = "0.1.33"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
+dependencies = [
+ "once_cell",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
--- a/pgxn/neon/communicator/Cargo.toml
+++ b/pgxn/neon/communicator/Cargo.toml
@@ -0,0 +1,39 @@
+[package]
+name = "communicator"
+version = "0.1.0"
+edition = "2024"
+
+[features]
+testing = []
+
+[lib]
+crate-type = ["staticlib"]
+
+[dependencies]
+axum.workspace = true
+bytes.workspace = true
+clashmap.workspace = true
+http.workspace = true
+libc.workspace = true
+nix.workspace = true
+atomic_enum = "0.3.0"
+prometheus.workspace = true
+prost.workspace = true
+tonic = { version = "0.12.0", default-features = false, features=["codegen", "prost", "transport"] }
+tokio = { version = "1.43.1", features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] }
+tokio-pipe = { version = "0.2.12" }
+thiserror.workspace = true
+tracing.workspace = true
+tracing-subscriber.workspace = true
+
+metrics.workspace = true
+uring-common = { workspace = true, features = ["bytes"] }
+
+pageserver_client_grpc.workspace = true
+pageserver_page_api.workspace = true
+
+neon-shmem.workspace = true
+utils.workspace = true
+
+[build-dependencies]
+cbindgen.workspace = true
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
David Freifeld	86fb7b966a	Update `integrated_cache.rs` to use new hashmap API	2025-07-02 12:18:37 -07:00
David Freifeld	0c099b0944	Merge branch 'quantumish/lfc-resizable-map' into quantumish/comm-lfc-integration	2025-07-02 12:05:24 -07:00
David Freifeld	2fe27f510d	Make neon-shmem tests thread-safe and report errno in panics	2025-07-02 11:57:49 -07:00
David Freifeld	19b5618578	Switch to neon_shmem::sync lock_api and integrate into hashmap	2025-07-02 11:44:38 -07:00
David Freifeld	9d3e07ef2c	Add initial prototype of shmem sync primitives	2025-06-30 17:07:07 -07:00
David Freifeld	74330920ee	Simplify API, squash bugs, and expand hashmap test suite	2025-06-27 17:11:22 -07:00
David Freifeld	c3c136ef3a	Remove statistics utilities from neon_shmem crate	2025-06-27 17:10:52 -07:00
David Freifeld	78b6da270b	Sketchily integrate hashmap rewrite with `integrated_cache`	2025-06-26 16:45:48 -07:00
David Freifeld	47664e40d4	Initial work in visualizing properties of hashmap	2025-06-26 16:00:33 -07:00
David Freifeld	b1e3161d4e	Satisfy `cargo clippy` lints, simplify shrinking API	2025-06-26 14:32:32 -07:00
David Freifeld	4713715c59	Merge branch 'communicator-rewrite' of github.com:neondatabase/neon into communicator-rewrite	2025-06-26 10:26:41 -07:00
David Freifeld	1e74b52f7e	Merge branch 'quantumish/lfc-resizable-map' into communicator-rewrite	2025-06-26 10:26:22 -07:00
Erik Grinaker	e3ecdfbecc	pgxn/neon: actually use UNAME_S	2025-06-26 12:38:44 +02:00
Erik Grinaker	d08e553835	pgxn/neon: fix `callback_get_request_lsn_unsafe` return type	2025-06-26 12:33:59 +02:00
Erik Grinaker	7fffb5b4df	pgxn/neon: fix macOS build	2025-06-26 12:33:39 +02:00
David Freifeld	1fb3639170	Properly change type of HashMapInit in .with_hasher()	2025-06-25 03:03:19 -07:00
David Freifeld	00dfaa2eb4	Add Criterion microbenchmarks for rehashing and insertions	2025-06-24 16:30:59 -07:00
David Freifeld	ae740ca1bb	Document hashmap implementation, fix `get_bucket_for_value` Previously, `get_bucket_for_value` incorrectly divided by the size of `V` to get the bucket index. Now it divides by the size of `Bucket<K,V>`.	2025-06-24 16:27:17 -07:00
David Freifeld	24e6c68772	Remove prev entry tracking, refactor HashMapInit into proper builder	2025-06-24 13:34:22 -07:00
David Freifeld	93a45708ff	Change `finish_shrink` to remap entries in shrunk space	2025-06-23 16:15:43 -07:00
Heikki Linnakangas	46b5c0be0b	Remove duplicated migration script I messed this up during the merge I guess?	2025-06-23 19:46:32 +03:00
Heikki Linnakangas	2d913ff125	fix some mismerges	2025-06-23 18:21:16 +03:00
Heikki Linnakangas	e90be06d46	silence a few compiler warnings about unnecessary 'mut's and 'use's	2025-06-23 18:16:54 +03:00
Heikki Linnakangas	356ba67607	Merge remote-tracking branch 'origin/main' into HEAD I also included build script changes from https://github.com/neondatabase/neon/pull/12266, which is not yet merged but will be soon.	2025-06-23 17:46:30 +03:00
Alex Chi Z.	5e2c444525	fix(pageserver): reduce default feature flag refresh interval (#12246) ## Problem Part of #11813 ## Summary of changes The current interval is 30s and it costs a lot of $$$. This patch reduced it to 600s refresh interval (which means that it takes 10min for feature flags to propagate from UI to the pageserver). In the future we can let storcon retrieve the feature flags and push it to pageservers. We can consider creating a new release or we can postpone this to the week after the next week. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2025-06-23 13:51:21 +00:00
Heikki Linnakangas	8d711229c1	ci: Fix bogus skipping of 'make all' step in CI (#12318) The 'make all' step must run always. PR #12311 accidentally left the condition in there to skip it if there were no changes in postgres v14 sources. That condition belonged to a whole different step that was removed altogether in PR #12311, and the condition should've been removed too. Per CI failure: https://github.com/neondatabase/neon/actions/runs/15820148967/job/44587394469	2025-06-23 13:23:33 +00:00
Vlad Lazar	0e490f3be7	pageserver: allow concurrent rw IO on in-mem layer (#12151) ## Problem Previously, we couldn't read from an in-memory layer while a batch was being written to it. Vice-versa, we couldn't write to it while there was an on-going read. ## Summary of Changes The goal of this change is to improve concurrency. Writes happened through a &mut self method so the enforcement was at the type system level. We attempt to improve by: 1. Adding interior mutability to EphemeralLayer. This involves wrapping the buffered writer in a read-write lock. 2. Minimise the time that the read lock is held for. Only hold the read lock while reading from the buffers (recently flushed or pending flush). If we need to read from the file, drop the lock and allow IO to be concurrent. The new benchmark variants with concurrent reads improve between 70 to 200 percent (against main). Benchmark results are in this [commit](`891f094ce6`). ## Future Changes We can push the interior mutability into the buffered writer. The mutable tail goes under a read lock, the flushed part goes into an ArcSwap and then we can read from anything that is flushed _without_ any locking.	2025-06-23 13:17:30 +00:00
Erik Grinaker	7e41ef1bec	pageserver: set gRPC basebackup chunk size to 256 KB (#12314) gRPC base backups send a stream of fixed-size 64KB chunks. pagebench basebackup with compression enabled shows this to reduce throughput: * 64 KB: 55 RPS * 128 KB: 69 RPS * 256 KB: 73 RPS * 1024 KB: 73 RPS This patch sets the base backup chunk size to 256 KB.	2025-06-23 12:41:11 +00:00
David Freifeld	610ea22c46	Generalize map to allow arbitrary hash fns, add clear() helper method	2025-06-20 11:46:02 -07:00
Heikki Linnakangas	1847f4de54	Add missing #include. Got a warning on macos without this	2025-06-18 17:26:20 +03:00
David Freifeld	477648b8cd	Clean up hashmap implementation, add bucket tests	2025-06-17 11:23:10 -07:00
Heikki Linnakangas	e8af3a2811	remove unused struct in example code, to silence compiler warning	2025-06-17 02:09:21 +03:00
Heikki Linnakangas	b603e3dddb	Silence compiler warnings in example code	2025-06-17 02:07:33 +03:00
Heikki Linnakangas	83007782fd	fix compilation of example	2025-06-17 02:07:15 +03:00
David Freifeld	bb1e359872	Add testing utilities for hash map, freelist bugfixes	2025-06-16 16:02:39 -07:00
David Freifeld	ac87544e79	Implement shrinking, add basic tests for core operations	2025-06-16 13:13:38 -07:00
David Freifeld	b6b122e07b	nw: add shrinking and deletion skeletons	2025-06-16 10:20:30 -07:00
Erik Grinaker	782062014e	Fix `test_normal_work` endpoint restart	2025-06-16 10:16:27 +02:00
Erik Grinaker	d0b3629412	Tweak base backups	2025-06-13 13:47:26 -07:00
Heikki Linnakangas	16d6898e44	git add missing file	2025-06-12 02:37:59 +03:00
Erik Grinaker	f4d51c0f5c	Use gRPC for `test_normal_work`	2025-06-09 22:51:15 +02:00
Erik Grinaker	ec17ae0658	Handle gRPC basebackups in compute_ctl	2025-06-09 22:50:57 +02:00
Erik Grinaker	9ecce60ded	Plumb gRPC addr through storage-controller	2025-06-09 20:24:18 +02:00
Erik Grinaker	e74a957045	test_runner: initial gRPC protocol support	2025-06-06 16:56:33 +02:00
Erik Grinaker	396a16a3b2	test_runner: enable gRPC Pageserver	2025-06-06 14:55:29 +02:00
Elizabeth Murray	7140a50225	Minor changes to get integration tests to run for communicator.	2025-06-06 04:32:51 +02:00
Elizabeth Murray	68f18ccacf	Request Tracker Prototype Does not include splitting requests across shards.	2025-06-05 13:32:18 -07:00
Heikki Linnakangas	786888d93f	Instead of a fixed TCP port for metrics, listen on a unix domain socket That avoids clashes if you run two computes at the same time. More secure too. We might want to have a TCP port in the long run, but this is less trouble for now. To see the metrics with curl you can use: curl --unix-socket .neon/endpoints/ep-main/pgdata/.metrics.socket http://localhost/metrics	2025-06-05 21:28:11 +03:00
Heikki Linnakangas	255537dda1	avoid hitting assertion failure in MarkPostmasterChildWalSender()	2025-06-05 20:08:32 +03:00
Erik Grinaker	8b494f6a24	Ignore communicator_bindings.h	2025-06-05 17:52:50 +02:00
Erik Grinaker	28a61741b3	Mangle gRPC connstrings to use port 51051	2025-06-05 17:46:58 +02:00
Heikki Linnakangas	10b936bf03	Use a custom Rust implementation to replace the LFC hash table The new implementation lives in a separately allocated shared memory area, which could be resized. Resizing it isn't actually implemented yet, though. It would require some co-operation from the LFC code.	2025-06-05 18:31:29 +03:00
Erik Grinaker	2fb6164bf8	Misc build fixes	2025-06-05 17:22:11 +02:00
Erik Grinaker	328f28dfe5	impl Default for SlabBlockHeader	2025-06-05 17:18:28 +02:00
Erik Grinaker	95838056da	Fix `RelTag` fields	2025-06-05 17:13:51 +02:00
Heikki Linnakangas	6145cfd1c2	Move neon-shmem facility to separate module within the crate	2025-06-05 18:13:03 +03:00
Erik Grinaker	6d451654f1	Remove generated communicator_bindings.h	2025-06-05 17:12:13 +02:00
Heikki Linnakangas	96b4de1de6	Make LFC chunk size a compile-time constant A runtime setting is nicer, but the next commit will replace the hash table with a different implementation that requires the value size to be a compile-time constant.	2025-06-05 18:08:40 +03:00
Heikki Linnakangas	9fdf5fbb7e	Use a separate freelist to track LFC "holes" When the LFC is shrunk, we punch holes in the underlying file to release the disk space to the OS. We tracked it in the same hash table as the in-use entries, because that was convenient. However, I'm working on being able to shrink the hash table too, and once we do that, we'll need some other place to track the holes. Implement a simple scheme of an in-memory array and a chain of on-disk blocks for that.	2025-06-05 18:08:35 +03:00
Erik Grinaker	37c58522a2	Merge branch 'main' into communicator-rewrite	2025-06-05 15:08:05 +02:00
Erik Grinaker	4b6f02e47d	Merge branch 'main' into communicator-rewrite	2025-06-04 10:23:29 +02:00
Erik Grinaker	8202c6172f	Merge branch 'main' into communicator-rewrite	2025-06-03 16:04:31 +02:00
Erik Grinaker	69a47d789d	pageserver: remove gRPC compute service prototype	2025-06-03 13:47:21 +02:00
Erik Grinaker	b36f880710	Fix Linux build failures	2025-06-03 13:37:56 +02:00
Erik Grinaker	745b750f33	Merge branch 'main' into communicator-rewrite	2025-06-03 13:29:45 +02:00
Heikki Linnakangas	f06bb2bbd8	Implement growing the hash table. Fix unit tests.	2025-05-29 15:54:55 +03:00
Heikki Linnakangas	b3c25418a6	Add metrics to track memory usage of the rust communicator	2025-05-29 02:14:01 +03:00
Heikki Linnakangas	33549bad1d	use separate hash tables for relsize cache and block mappings	2025-05-28 23:57:55 +03:00
Heikki Linnakangas	009168d711	Add placeholder shmem hashmap implementation Use that instead of the half-baked Adaptive Radix Tree implementation. ART would probably be better in the long run, but more complicated to implement.	2025-05-28 11:08:35 +03:00
Elizabeth Murray	7c9bd542a6	Fix compile warnings, minor cleanup.	2025-05-26 06:30:48 -07:00
Elizabeth Murray	014823b305	Add a new iteration of a new client pool with some updates.	2025-05-26 05:29:32 -07:00
Elizabeth Murray	af9379ccf6	Use a semaphore to gate access to connections. Add metrics for testing.	2025-05-26 05:28:50 -07:00
Heikki Linnakangas	bb28109ffa	Merge remote-tracking branch 'origin/main' into communicator-rewrite-with-integrated-cache There were conflicts because of the differences in the page_api protocol that was merged to main vs what was on the branch. I adapted the code for the protocol in main.	2025-05-26 11:52:32 +03:00
Elizabeth Murray	60a0bec1c0	Set default max consumers per connection to a high number.	2025-05-19 07:00:39 -07:00
Elizabeth Murray	31fa7a545d	Remove unnecessary info include now that the info message is gone.	2025-05-19 06:52:07 -07:00
Elizabeth Murray	ac464c5f2c	Return info message that was used for debugging.	2025-05-19 06:39:16 -07:00
Elizabeth Murray	0dddb1e373	Add back whitespace that was removed.	2025-05-19 06:34:52 -07:00
Elizabeth Murray	3acb263e62	Add first iteration of simulating a flakey network with a custom TCP.	2025-05-19 06:33:30 -07:00
Elizabeth Murray	1e83398cdd	Correct out-of-date comment.	2025-05-14 07:31:52 -07:00
Elizabeth Murray	be8ed81532	Connection pool: update error accounting, sweep idle connections, add config options.	2025-05-14 07:31:52 -07:00
Heikki Linnakangas	12b08c4b82	Fix shutdown	2025-05-14 01:49:55 +03:00
Heikki Linnakangas	827358dd03	Handle OOMs a little more gracefully	2025-05-12 23:33:22 +03:00
Heikki Linnakangas	d367273000	minor cleanup	2025-05-12 23:11:55 +03:00
Heikki Linnakangas	e2bad5d9e9	Add debugging HTTP endpoint for dumping the cache tree	2025-05-12 22:54:03 +03:00
Heikki Linnakangas	5623e4665b	bunch of fixes	2025-05-12 18:40:54 +03:00
Heikki Linnakangas	8abb4dab6d	implement shrinking nodes	2025-05-12 03:57:10 +03:00
Heikki Linnakangas	731667ac37	better metrics of the art tree	2025-05-12 02:08:51 +03:00
Heikki Linnakangas	6a1374d106	Pack tree node structs more tightly, avoiding alignment padding	2025-05-12 01:01:58 +03:00
Heikki Linnakangas	f7c908f2f0	more metrics	2025-05-12 01:01:50 +03:00
Heikki Linnakangas	86671e3a0b	Add a bunch of metric counters	2025-05-11 20:11:13 +03:00
Heikki Linnakangas	319cd74f73	Fix eviction	2025-05-11 19:34:50 +03:00
Heikki Linnakangas	0efefbf77c	Add a few metrics, fix page eviction	2025-05-10 03:13:28 +03:00
Heikki Linnakangas	e6a4171fa1	fix concurrency issues with the LFC - Add another locking hash table to track which cached pages are currently being modified, by smgrwrite() or smgrread() or by prefetch. - Use single-value Leaf pages in the art tree. That seems simpler after all, and it eliminates some corner cases where a Value needed to be cloned, which made it tricky to use atomics or other interior mutability on the Values	2025-05-10 02:36:48 +03:00
Heikki Linnakangas	0c25ea9e31	reduce LOG noise	2025-05-09 18:27:36 +03:00
Heikki Linnakangas	6692321026	Remove dependency on io_uring, use plain std::fs ops instead io_uring is a great idea in the long term, but for now, let's make it easier to develop locally on macos, where io_uring is not available.	2025-05-06 17:46:21 +03:00
Heikki Linnakangas	791df28755	Linked list fix and add unit test	2025-05-06 16:46:54 +03:00
Heikki Linnakangas	d20da994f4	git add missing file	2025-05-06 15:36:48 +03:00
Heikki Linnakangas	6dbbdaae73	run 'cargo fmt'	2025-05-06 15:35:56 +03:00
Heikki Linnakangas	977bc09d2a	Bunch of fixes, smarter iterator, metrics exporter	2025-05-06 15:28:50 +03:00
Heikki Linnakangas	44269fcd5e	Implement simple eviction and free block tracking	2025-05-06 15:28:15 +03:00
Heikki Linnakangas	44cc648dc8	Implement iterator over keys the implementation is not very optimized, but probably good enough for an MVP	2025-05-06 15:27:38 +03:00
Heikki Linnakangas	884e028a4a	implement deletion in art tree	2025-05-06 15:27:38 +03:00
Heikki Linnakangas	42df3e5453	debugging stats	2025-05-06 15:27:38 +03:00
Heikki Linnakangas	fc743e284f	more work on allocators	2025-05-06 15:27:38 +03:00
Heikki Linnakangas	d02f9a2139	Collect garbage, handle OOMs	2025-05-06 15:27:38 +03:00
Heikki Linnakangas	083118e98e	Implement epoch system	2025-05-06 15:27:38 +03:00
Heikki Linnakangas	54cd2272f1	more memory allocation stuff	2025-05-06 15:27:38 +03:00
Heikki Linnakangas	e40193e3c8	simple block-based allocator	2025-05-06 15:27:38 +03:00
Heikki Linnakangas	ce9f7bacc1	Fix communicator client for recent changes in protocol and client code	2025-05-06 15:26:51 +03:00
Heikki Linnakangas	b7891f8fe8	Include 'neon-shard-id' header in client requests	2025-05-06 15:23:30 +03:00
Elizabeth Murray	5f2adaa9ad	Remove some additional debug info messages.	2025-05-02 10:50:53 -07:00
Elizabeth Murray	3e5e396c8d	Remove some debug info messages.	2025-05-02 10:24:18 -07:00
Elizabeth Murray	9d781c6fda	Add a connection pool module to the grpc client.	2025-05-02 10:22:33 -07:00
Erik Grinaker	cf5d038472	service documentation	2025-05-02 15:20:12 +02:00
Erik Grinaker	d785100c02	page_api: add `GetPageRequest::class`	2025-05-02 10:48:32 +02:00
Erik Grinaker	2c0d930e3d	page_api: add `GetPageResponse::status`	2025-04-30 16:48:45 +02:00
Erik Grinaker	66171a117b	page_api: add `GetPageRequestBatch`	2025-04-30 15:31:11 +02:00
Erik Grinaker	df2806e7a0	page_api: add `GetPageRequest::id`	2025-04-30 15:00:16 +02:00
Erik Grinaker	07631692db	page_api: protobuf comments	2025-04-30 12:36:11 +02:00
Erik Grinaker	4c77397943	Add `neon-shard-id` header	2025-04-30 11:18:06 +02:00
Erik Grinaker	7bb58be546	Use `authorization` header instead of `neon-auth-token`	2025-04-30 10:38:44 +02:00
Erik Grinaker	b5373de208	page_api: add `get_slru_segment()`	2025-04-29 17:59:27 +02:00
Erik Grinaker	b86c610f42	page_api: tweaks	2025-04-29 17:23:51 +02:00
Erik Grinaker	0f520d79ab	pageserver: rename `data_api` to `page_api`	2025-04-29 15:58:52 +02:00
Heikki Linnakangas	93eb7bb6b8	include lots of changes that went missing by accident	2025-04-29 15:32:27 +03:00
Heikki Linnakangas	e58d0fece1	New communicator, with "integrated" cache accessible from all processes	2025-04-29 11:52:44 +03:00