Document problems and pitfalls with fine-grained hashmap impl

Fix concurrency bugs in resizing (WIP)
Add initial work in implementing incremental resizing (WIP)
2026-02-01 01:30:38 +00:00 · 2025-08-12 13:42:48 -07:00 · 2025-08-12 10:34:29 -07:00 · 2025-07-14 09:02:56 -07:00 · 2025-07-11 12:47:27 -07:00 · 2025-07-10 17:34:30 -07:00
229 changed files with 14770 additions and 8866 deletions
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -33,7 +33,6 @@ workspace-members = [
    "compute_api",
    "consumption_metrics",
    "desim",
-    "json",
    "metrics",
    "pageserver_api",
    "postgres_backend",
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -7,7 +7,6 @@ self-hosted-runner:
    - small-metal
    - small-arm64
    - unit-perf
-    - unit-perf-aws-arm
    - us-east-2
 config-variables:
  - AWS_ECR_REGION
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -87,24 +87,6 @@ jobs:
    uses: ./.github/workflows/build-build-tools-image.yml
    secrets: inherit

-  lint-openapi-spec:
-    runs-on: ubuntu-22.04
-    needs: [ meta, check-permissions ]
-    # We do need to run this in `.*-rc-pr` because of hotfixes.
-    if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
-    steps:
-      - name: Harden the runner (Audit all outbound calls)
-        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
-        with:
-          egress-policy: audit
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-      - run: make lint-openapi-spec
-
  check-codestyle-python:
    needs: [ meta, check-permissions, build-build-tools-image ]
    # No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
@@ -324,14 +306,14 @@ jobs:
      statuses: write
      contents: write
      pull-requests: write
-    runs-on: [ self-hosted, unit-perf-aws-arm ]
+    runs-on: [ self-hosted, unit-perf ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
      # for changed limits, see comments on `options:` earlier in this file
-      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 --ulimit nofile=65536:65536 --security-opt seccomp=unconfined
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
    strategy:
      fail-fast: false
      matrix:
@@ -1004,7 +986,6 @@ jobs:
      - name: Verify docker-compose example and test extensions
        timeout-minutes: 60
        env:
-          PARALLEL_COMPUTES: 3
          TAG: >-
            ${{
              needs.meta.outputs.run-kind == 'compute-rc-pr'
--- a/.github/workflows/periodic_pagebench.yml
+++ b/.github/workflows/periodic_pagebench.yml
@@ -1,4 +1,4 @@
-name: Periodic pagebench performance test on unit-perf-aws-arm runners
+name: Periodic pagebench performance test on unit-perf hetzner runner

 on:
  schedule:
@@ -40,7 +40,7 @@ jobs:
      statuses: write
      contents: write
      pull-requests: write
-    runs-on: [ self-hosted, unit-perf-aws-arm ]
+    runs-on: [ self-hosted, unit-perf ]
    container:
      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
      credentials:
--- a/.github/workflows/proxy-benchmark.yml
+++ b/.github/workflows/proxy-benchmark.yml
@@ -1,4 +1,4 @@
-name: Periodic proxy performance test on unit-perf-aws-arm runners
+name: Periodic proxy performance test on unit-perf hetzner runner

 on:
  push: # TODO: remove after testing
@@ -32,7 +32,7 @@ jobs:
      statuses: write
      contents: write
      pull-requests: write
-    runs-on: [self-hosted, unit-perf-aws-arm]
+    runs-on: [self-hosted, unit-perf]
    timeout-minutes: 60  # 1h timeout
    container:
      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
--- a/.gitignore
+++ b/.gitignore
@@ -15,7 +15,7 @@ neon.iml
 /.neon
 /integration_tests/.neon
 compaction-suite-results.*
-docker-compose/docker-compose-parallel.yml
+pgxn/neon/communicator/communicator_bindings.h

 # Coverage
 *.profraw
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -35,6 +35,7 @@ members = [
    "libs/pq_proto",
    "libs/tenant_size_model",
    "libs/metrics",
+    "libs/neonart",
    "libs/postgres_connection",
    "libs/remote_storage",
    "libs/tracing-utils",
@@ -43,7 +44,6 @@ members = [
    "libs/walproposer",
    "libs/wal_decoder",
    "libs/postgres_initdb",
-    "libs/proxy/json",
    "libs/proxy/postgres-protocol2",
    "libs/proxy/postgres-types2",
    "libs/proxy/tokio-postgres2",
@@ -92,6 +92,7 @@ clap = { version = "4.0", features = ["derive", "env"] }
 clashmap = { version = "1.0", features = ["raw-api"] }
 comfy-table = "7.1"
 const_format = "0.2"
+crossbeam-utils = "0.8.21"
 crc32c = "0.6"
 diatomic-waker = { version = "0.2.3" }
 either = "1.8"
@@ -130,7 +131,6 @@ jemalloc_pprof = { version = "0.7", features = ["symbolize", "flamegraph"] }
 jsonwebtoken = "9"
 lasso = "0.7"
 libc = "0.2"
-libproc = "0.14"
 md5 = "0.7.0"
 measured = { version = "0.0.22", features=["lasso"] }
 measured-process = { version = "0.0.22" }
@@ -151,6 +151,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
 parquet_derive = "53"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pem = "3.0.3"
+peekable = "0.3.0"
 pin-project-lite = "0.2"
 pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
 procfs = "0.16"
@@ -187,6 +188,7 @@ smallvec = "1.11"
 smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
 spki = "0.7.3"
+spin = "0.9.8"
 strum = "0.26"
 strum_macros = "0.26"
 "subtle"  = "2.5.0"
@@ -198,7 +200,6 @@ thiserror = "1.0"
 tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
 tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
 tokio = { version = "1.43.1", features = ["macros"] }
-tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.12.0"
 tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
@@ -240,6 +241,9 @@ x509-cert = { version = "0.2.5" }
 env_logger = "0.11"
 log = "0.4"

+tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
+uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
+
 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
 postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
 postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
@@ -279,7 +283,6 @@ safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
 safekeeper_client = { path = "./safekeeper/client" }
 storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
 storage_controller_client = { path = "./storage_controller/client" }
-tempfile = "3"
 tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
 tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
--- a/2
+++ b/2
@@ -109,8 +109,6 @@ RUN set -e \
        libreadline-dev \
        libseccomp-dev \
        ca-certificates \
-        bpfcc-tools \
-        sudo \
        openssl \
        unzip \
        curl \
--- a/9
+++ b/9
@@ -220,15 +220,6 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
 setup-pre-commit-hook:
 	ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit

-.PHONY: lint-openapi-spec
-lint-openapi-spec:
-	# operation-2xx-response: pageserver timeline delete returns 404 on success
-	find . -iname "openapi_spec.y*ml" -exec\
-		docker run --rm -v ${PWD}:/spec ghcr.io/redocly/cli:1.34.4\
-			--skip-rule=operation-operationId --skip-rule=operation-summary --extends=minimal\
-			--skip-rule=no-server-example.com --skip-rule=operation-2xx-response\
-			lint {} \+
-
 # Targets for building PostgreSQL are defined in postgres.mk.
 #
 # But if the caller has indicated that PostgreSQL is already
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -61,9 +61,6 @@ RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
        libpq5 \
        libpq-dev \
        libzstd-dev \
-        linux-perf \
-        bpfcc-tools \
-        linux-headers-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac) \
        postgresql-16 \
        postgresql-server-dev-16 \
        postgresql-common  \
@@ -108,21 +105,15 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
 #
 # 'gdb' is included so that we get backtraces of core dumps produced in
 # regression tests
-RUN set -ex \
-    && KERNEL_VERSION="$(uname -r | cut -d'-' -f1 | sed 's/\.0$//')" \
-    && echo KERNEL_VERSION=${KERNEL_VERSION} >> /etc/environment \
-    && KERNEL_ARCH=$(uname -m | awk '{ if ($1 ~ /^(x86_64|i[3-6]86)$/) print "x86"; else if ($1 ~ /^(aarch64|arm.*)$/) print "aarch"; else print $1 }') \
-    && echo KERNEL_ARCH=${KERNEL_ARCH} >> /etc/environment \
+RUN set -e \
    && apt update \
    && apt install -y \
        autoconf \
        automake \
-        bc \
        bison \
        build-essential \
        ca-certificates \
        cmake \
-        cpio \
        curl \
        flex \
        gdb \
@@ -131,10 +122,8 @@ RUN set -ex \
        gzip \
        jq \
        jsonnet \
-        kmod \
        libcurl4-openssl-dev \
        libbz2-dev \
-        libelf-dev \
        libffi-dev \
        liblzma-dev \
        libncurses5-dev \
@@ -148,11 +137,6 @@ RUN set -ex \
        libxml2-dev \
        libxmlsec1-dev \
        libxxhash-dev \
-        linux-perf \
-        bpfcc-tools \
-        libbpfcc \
-        libbpfcc-dev \
-        linux-headers-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac) \
        lsof \
        make \
        netcat-openbsd \
@@ -160,8 +144,6 @@ RUN set -ex \
        openssh-client \
        parallel \
        pkg-config \
-        rsync \
-        sudo \
        unzip \
        wget \
        xz-utils \
@@ -216,8 +198,6 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /
 # Configure sudo & docker
 RUN usermod -aG sudo nonroot && \
    echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers && \
-    mkdir -p /etc/sudoers.d && \
-    echo 'nonroot ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/nonroot && \
    usermod -aG docker nonroot

 # AWS CLI
--- a/clippy.toml
+++ b/clippy.toml
@@ -1,12 +1,9 @@
 disallowed-methods = [
    "tokio::task::block_in_place",
-
    # Allow this for now, to deny it later once we stop using Handle::block_on completely
    # "tokio::runtime::Handle::block_on",
-
-    # tokio-epoll-uring:
-    # - allow-invalid because the method doesn't exist on macOS
-    { path = "tokio_epoll_uring::thread_local_system", replacement = "tokio_epoll_uring_ext module inside pageserver crate", allow-invalid = true }
+    # use tokio_epoll_uring_ext instead
+    "tokio_epoll_uring::thread_local_system",
 ]

 disallowed-macros = [
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -149,9 +149,6 @@ RUN case $DEBIAN_VERSION in \
    ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \
    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \
    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip g++ \
-    bpfcc-tools \
-    libbpfcc \
-    libbpfcc-dev \
    libclang-dev \
    jsonnet \
    $VERSION_INSTALLS \
@@ -1918,10 +1915,10 @@ RUN cd /ext-src/pg_repack-src && patch -p1 </ext-src/pg_repack.patch && rm -f /e

 COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
 RUN echo /usr/local/pgsql/lib > /etc/ld.so.conf.d/00-neon.conf && /sbin/ldconfig
-RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq parallel \
+RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq \
   && apt clean && rm -rf /ext-src/*.tar.gz /ext-src/*.patch /var/lib/apt/lists/*
 ENV PATH=/usr/local/pgsql/bin:$PATH
-ENV PGHOST=compute1
+ENV PGHOST=compute
 ENV PGPORT=55433
 ENV PGUSER=cloud_admin
 ENV PGDATABASE=postgres
@@ -1991,10 +1988,6 @@ RUN apt update && \
        locales \
        lsof \
        procps \
-        bpfcc-tools \
-        libbpfcc \
-        libbpfcc-dev \
-        libclang-dev \
        rsyslog-gnutls \
        screen \
        tcpdump \
--- a/compute/vm-image-spec-bookworm.yaml
+++ b/compute/vm-image-spec-bookworm.yaml
@@ -39,14 +39,6 @@ commands:
    user: nobody
    sysvInitAction: respawn
    shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
-  - name: enable-kernel-modules
-    user: root
-    sysvInitAction: sysinit
-    shell: mkdir -p /lib/ && ln -s /neonvm/tools/lib/modules /lib/
-  - name: enable-bpfs
-    user: root
-    sysvInitAction: sysinit
-    shell: mkdir -p /sys/kernel/debug && mount -t debugfs debugfs /sys/kernel/debug && mount -t bpf bpf /sys/fs/bpf && chmod 755 /sys/fs/bpf
  # Rsyslog by default creates a unix socket under /dev/log . That's where Postgres sends logs also.
  # We run syslog with postgres user so it can't create /dev/log. Instead we configure rsyslog to
  # use a different path for the socket. The symlink actually points to our custom path.
@@ -73,7 +65,7 @@ files:
      # regardless of hostname (ALL)
      #
      # Also allow it to shut down the VM. The fast_import job does that when it's finished.
-      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd, /neonvm/tools/bin/perf, /usr/sbin/profile-bpfcc
+      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd
  - filename: cgconfig.conf
    content: |
      # Configuration for cgroups in VM compute nodes
@@ -160,8 +152,6 @@ merge: |
  RUN set -e \
      && chmod 0644 /etc/cgconfig.conf

-  ENV PERF_BINARY_PATH=/neonvm/tools/bin/perf
-

  COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
  RUN chmod 0666 /etc/compute_rsyslog.conf
--- a/compute/vm-image-spec-bullseye.yaml
+++ b/compute/vm-image-spec-bullseye.yaml
@@ -39,14 +39,6 @@ commands:
    user: nobody
    sysvInitAction: respawn
    shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
-  - name: enable-kernel-modules
-    user: root
-    sysvInitAction: sysinit
-    shell: mkdir -p /lib/ && ln -s /neonvm/tools/lib/modules /lib/
-  - name: enable-bpfs
-    user: root
-    sysvInitAction: sysinit
-    shell: mkdir -p /sys/kernel/debug && mount -t debugfs debugfs /sys/kernel/debug && mount -t bpf bpf /sys/fs/bpf && chmod 755 /sys/fs/bpf
  # Rsyslog by default creates a unix socket under /dev/log . That's where Postgres sends logs also.
  # We run syslog with postgres user so it can't create /dev/log. Instead we configure rsyslog to
  # use a different path for the socket. The symlink actually points to our custom path.
@@ -73,7 +65,7 @@ files:
      # regardless of hostname (ALL)
      #
      # Also allow it to shut down the VM. The fast_import job does that when it's finished.
-      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd, /neonvm/tools/bin/perf, /usr/sbin/profile-bpfcc
+      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd
  - filename: cgconfig.conf
    content: |
      # Configuration for cgroups in VM compute nodes
@@ -156,8 +148,6 @@ merge: |
  RUN set -e \
      && chmod 0644 /etc/cgconfig.conf

-  ENV PERF_BINARY_PATH=/neonvm/tools/bin/perf
-
  COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
  RUN chmod 0666 /etc/compute_rsyslog.conf
  RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -31,7 +31,6 @@ hostname-validator = "1.1"
 indexmap.workspace = true
 itertools.workspace = true
 jsonwebtoken.workspace = true
-libproc.workspace = true
 metrics.workspace = true
 nix.workspace = true
 notify.workspace = true
@@ -50,7 +49,6 @@ serde_with.workspace = true
 serde_json.workspace = true
 signal-hook.workspace = true
 tar.workspace = true
-tempfile.workspace = true
 tower.workspace = true
 tower-http.workspace = true
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
@@ -68,7 +66,7 @@ url.workspace = true
 uuid.workspace = true
 walkdir.workspace = true
 x509-cert.workspace = true
-postgres-types.workspace = true
+
 postgres_versioninfo.workspace = true
 postgres_initdb.workspace = true
 compute_api.workspace = true
@@ -80,10 +78,3 @@ zstd = "0.13"
 bytes = "1.0"
 rust-ini = "0.20.0"
 rlimit = "0.10.1"
-
-inferno = { version = "0.12", default-features = false, features = [
-    "multithreaded",
-    "nameattr",
-] }
-pprof = { version = "0.15", features = ["protobuf-codec", "flamegraph"] }
-prost = "0.12"
--- a/compute_tools/README.md
+++ b/compute_tools/README.md
@@ -46,14 +46,11 @@ stateDiagram-v2
  Configuration --> Failed : Failed to configure the compute
  Configuration --> Running : Compute has been configured
  Empty --> Init : Compute spec is immediately available
-  Empty --> TerminationPendingFast : Requested termination
-  Empty --> TerminationPendingImmediate : Requested termination
+  Empty --> TerminationPending : Requested termination
  Init --> Failed : Failed to start Postgres
  Init --> Running : Started Postgres
-  Running --> TerminationPendingFast : Requested termination
-  Running --> TerminationPendingImmediate : Requested termination
-  TerminationPendingFast --> Terminated compute with 30s delay for cplane to inspect status
-  TerminationPendingImmediate --> Terminated : Terminated compute immediately
+  Running --> TerminationPending : Requested termination
+  TerminationPending --> Terminated : Terminated compute
  Failed --> [*] : Compute exited
  Terminated --> [*] : Compute exited
 ```
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,12 +1,13 @@
-use anyhow::{Context, Result};
+use anyhow::{Context, Result, anyhow};
 use chrono::{DateTime, Utc};
 use compute_api::privilege::Privilege;
 use compute_api::responses::{
    ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState,
-    LfcPrewarmState, PromoteState, TlsConfig,
+    LfcPrewarmState, TlsConfig,
 };
 use compute_api::spec::{
-    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,
+    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverConnectionInfo,
+    PageserverShardConnectionInfo, PgIdent,
 };
 use futures::StreamExt;
 use futures::future::join_all;
@@ -29,7 +30,8 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
 use std::sync::{Arc, Condvar, Mutex, RwLock};
 use std::time::{Duration, Instant};
 use std::{env, fs};
-use tokio::{spawn, sync::watch, task::JoinHandle, time};
+use tokio::task::JoinHandle;
+use tokio::{spawn, time};
 use tracing::{Instrument, debug, error, info, instrument, warn};
 use url::Url;
 use utils::id::{TenantId, TimelineId};
@@ -174,7 +176,6 @@ pub struct ComputeState {
    /// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if
    /// mode == ComputeMode::Primary. None otherwise
    pub terminate_flush_lsn: Option<Lsn>,
-    pub promote_state: Option<watch::Receiver<PromoteState>>,

    pub metrics: ComputeMetrics,
 }
@@ -192,7 +193,6 @@ impl ComputeState {
            lfc_prewarm_state: LfcPrewarmState::default(),
            lfc_offload_state: LfcOffloadState::default(),
            terminate_flush_lsn: None,
-            promote_state: None,
        }
    }

@@ -225,7 +225,7 @@ pub struct ParsedSpec {
    pub spec: ComputeSpec,
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
-    pub pageserver_connstr: String,
+    pub pageserver_conninfo: PageserverConnectionInfo,
    pub safekeeper_connstrings: Vec<String>,
    pub storage_auth_token: Option<String>,
    /// k8s dns name and port
@@ -272,6 +272,27 @@ impl ParsedSpec {
    }
 }

+fn extract_pageserver_conninfo_from_guc( // Builds a PageserverConnectionInfo from a comma-separated connstring GUC value.
+    pageserver_connstring_guc: &str, // the caller passes the `neon.pageserver_connstring` setting
+) -> PageserverConnectionInfo {
+    PageserverConnectionInfo {
+        shards: pageserver_connstring_guc
+            .split(',') // one entry per comma-separated connection string
+            .enumerate() // an entry's position in the list becomes its shard key
+            .map(|(i, connstr)| {
+                (
+                    i as u32,
+                    PageserverShardConnectionInfo {
+                        libpq_url: Some(connstr.to_string()), // every entry is stored as a libpq URL
+                        grpc_url: None, // no gRPC URL is derived from the GUC
+                    },
+                )
+            })
+            .collect(),
+        prefer_grpc: false, // only libpq URLs are populated above, so gRPC is not preferred
+    }
+}
+
 impl TryFrom<ComputeSpec> for ParsedSpec {
    type Error = String;
    fn try_from(spec: ComputeSpec) -> Result<Self, String> {
@@ -281,11 +302,17 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
        // For backwards-compatibility, the top-level fields in the spec file
        // may be empty. In that case, we need to dig them from the GUCs in the
        // cluster.settings field.
-        let pageserver_connstr = spec
-            .pageserver_connstring
-            .clone()
-            .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
-            .ok_or("pageserver connstr should be provided")?;
+        let pageserver_conninfo = match &spec.pageserver_connection_info {
+            Some(x) => x.clone(),
+            None => {
+                if let Some(guc) = spec.cluster.settings.find("neon.pageserver_connstring") {
+                    extract_pageserver_conninfo_from_guc(&guc)
+                } else {
+                    return Err("pageserver connstr should be provided".to_string());
+                }
+            }
+        };
+
        let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
            if matches!(spec.mode, ComputeMode::Primary) {
                spec.cluster
@@ -335,7 +362,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {

        let res = ParsedSpec {
            spec,
-            pageserver_connstr,
+            pageserver_conninfo,
            safekeeper_connstrings,
            storage_auth_token,
            tenant_id,
@@ -371,9 +398,7 @@ fn maybe_cgexec(cmd: &str) -> Command {
    }
 }

-/// A handle to the Postgres process that is running in the compute
-/// node.
-pub struct PostgresHandle {
+struct PostgresHandle {
    postgres: std::process::Child,
    log_collector: JoinHandle<Result<()>>,
 }
@@ -427,7 +452,7 @@ impl ComputeNode {

        let mut new_state = ComputeState::new();
        if let Some(spec) = config.spec {
-            let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
+            let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow!(msg))?;
            new_state.pspec = Some(pspec);
        }

@@ -958,20 +983,14 @@ impl ComputeNode {
            None
        };

+        let mut delay_exit = false;
        let mut state = self.state.lock().unwrap();
        state.terminate_flush_lsn = lsn;
-
-        let delay_exit = state.status == ComputeStatus::TerminationPendingFast;
-        if state.status == ComputeStatus::TerminationPendingFast
-            || state.status == ComputeStatus::TerminationPendingImmediate
-        {
-            info!(
-                "Changing compute status from {} to {}",
-                state.status,
-                ComputeStatus::Terminated
-            );
+        if let ComputeStatus::TerminationPending { mode } = state.status {
            state.status = ComputeStatus::Terminated;
            self.state_changed.notify_all();
+            // we were asked to terminate gracefully, don't exit to avoid restart
+            delay_exit = mode == compute_api::responses::TerminateMode::Fast
        }
        drop(state);

@@ -1034,12 +1053,11 @@ impl ComputeNode {
    fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let spec = compute_state.pspec.as_ref().expect("spec must be set");

-        let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
        let started = Instant::now();
-
-        let (connected, size) = match PageserverProtocol::from_connstring(shard0_connstr)? {
-            PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?,
-            PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
+        let (connected, size) = if spec.pageserver_conninfo.prefer_grpc {
+            self.try_get_basebackup_grpc(spec, lsn)?
+        } else {
+            self.try_get_basebackup_libpq(spec, lsn)?
        };

        let mut state = self.state.lock().unwrap();
@@ -1054,20 +1072,21 @@ impl ComputeNode {
    /// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
    /// the connection was established, and the (compressed) size of the basebackup.
    fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
-        let shard0_connstr = spec
-            .pageserver_connstr
-            .split(',')
-            .next()
-            .unwrap()
-            .to_string();
-        let shard_index = match spec.pageserver_connstr.split(',').count() as u8 {
+        let shard0 = spec
+            .pageserver_conninfo
+            .shards
+            .get(&0)
+            .expect("shard 0 connection info missing");
+        let shard0_url = shard0.grpc_url.clone().expect("no grpc_url for shard 0");
+
+        let shard_index = match spec.pageserver_conninfo.shards.len() as u8 {
            0 | 1 => ShardIndex::unsharded(),
            count => ShardIndex::new(ShardNumber(0), ShardCount(count)),
        };

        let (reader, connected) = tokio::runtime::Handle::current().block_on(async move {
            let mut client = page_api::Client::connect(
-                shard0_connstr,
+                shard0_url,
                spec.tenant_id,
                spec.timeline_id,
                shard_index,
@@ -1102,8 +1121,13 @@ impl ComputeNode {
    /// Fetches a basebackup via libpq. The connstring must use postgresql://. Returns the timestamp
    /// when the connection was established, and the (compressed) size of the basebackup.
    fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
-        let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
-        let mut config = postgres::Config::from_str(shard0_connstr)?;
+        let shard0 = spec
+            .pageserver_conninfo
+            .shards
+            .get(&0)
+            .expect("shard 0 connection info missing");
+        let shard0_connstr = shard0.libpq_url.clone().expect("no libpq_url for shard 0");
+        let mut config = postgres::Config::from_str(&shard0_connstr)?;

        // Use the storage auth token from the config file, if given.
        // Note: this overrides any password set in the connection string.
@@ -1189,10 +1213,7 @@ impl ComputeNode {
                    return result;
                }
                Err(ref e) if attempts < max_attempts => {
-                    warn!(
-                        "Failed to get basebackup: {} (attempt {}/{})",
-                        e, attempts, max_attempts
-                    );
+                    warn!("Failed to get basebackup: {e:?} (attempt {attempts}/{max_attempts})");
                    std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
                    retry_period_ms *= 1.5;
                }
@@ -1401,16 +1422,8 @@ impl ComputeNode {
            }
        };

-        info!(
-            "getting basebackup@{} from pageserver {}",
-            lsn, &pspec.pageserver_connstr
-        );
-        self.get_basebackup(compute_state, lsn).with_context(|| {
-            format!(
-                "failed to get basebackup@{} from pageserver {}",
-                lsn, &pspec.pageserver_connstr
-            )
-        })?;
+        self.get_basebackup(compute_state, lsn)
+            .with_context(|| format!("failed to get basebackup@{lsn}"))?;

        // Update pg_hba.conf received with basebackup.
        update_pg_hba(pgdata_path)?;
@@ -1813,8 +1826,6 @@ impl ComputeNode {
            tls_config,
        )?;

-        self.pg_reload_conf()?;
-
        if !spec.skip_pg_catalog_updates {
            let max_concurrent_connections = spec.reconfigure_concurrency;
            // Temporarily reset max_cluster_size in config
@@ -1834,9 +1845,10 @@ impl ComputeNode {

                Ok(())
            })?;
-            self.pg_reload_conf()?;
        }

+        self.pg_reload_conf()?;
+
        let unknown_op = "unknown".to_string();
        let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
        info!(
@@ -1909,8 +1921,7 @@ impl ComputeNode {

                            // exit loop
                            ComputeStatus::Failed
-                            | ComputeStatus::TerminationPendingFast
-                            | ComputeStatus::TerminationPendingImmediate
+                            | ComputeStatus::TerminationPending { .. }
                            | ComputeStatus::Terminated => break 'cert_update,

                            // wait
@@ -2076,7 +2087,7 @@ LIMIT 100",
            self.params
                .remote_ext_base_url
                .as_ref()
-                .ok_or(DownloadError::BadInput(anyhow::anyhow!(
+                .ok_or(DownloadError::BadInput(anyhow!(
                    "Remote extensions storage is not configured",
                )))?;

@@ -2272,7 +2283,7 @@ LIMIT 100",
        let remote_extensions = spec
            .remote_extensions
            .as_ref()
-            .ok_or(anyhow::anyhow!("Remote extensions are not configured"))?;
+            .ok_or(anyhow!("Remote extensions are not configured"))?;

        info!("parse shared_preload_libraries from spec.cluster.settings");
        let mut libs_vec = Vec::new();
@@ -2351,22 +2362,22 @@ LIMIT 100",
    /// The operation will time out after a specified duration.
    pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
        let state = self.state.lock().unwrap();
-        let old_pageserver_connstr = state
+        let old_pageserver_conninfo = state
            .pspec
            .as_ref()
            .expect("spec must be set")
-            .pageserver_connstr
+            .pageserver_conninfo
            .clone();
        let mut unchanged = true;
        let _ = self
            .state_changed
            .wait_timeout_while(state, duration, |s| {
-                let pageserver_connstr = &s
+                let pageserver_conninfo = &s
                    .pspec
                    .as_ref()
                    .expect("spec must be set")
-                    .pageserver_connstr;
-                unchanged = pageserver_connstr == &old_pageserver_connstr;
+                    .pageserver_conninfo;
+                unchanged = pageserver_conninfo == &old_pageserver_conninfo;
                unchanged
            })
            .unwrap();
@@ -2444,11 +2455,19 @@ LIMIT 100",
        // If the value is -1, we never suspend so set the value to default collection.
        // If the value is 0, it means default, we will just continue to use the default.
        if spec.suspend_timeout_seconds == -1 || spec.suspend_timeout_seconds == 0 {
+            info!(
+                "[NEON_EXT_INT_UPD] Spec Timeout: {}, New Timeout: {}",
+                spec.suspend_timeout_seconds, DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL
+            );
            self.params.installed_extensions_collection_interval.store(
                DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL,
                std::sync::atomic::Ordering::SeqCst,
            );
        } else {
+            info!(
+                "[NEON_EXT_INT_UPD] Spec Timeout: {}",
+                spec.suspend_timeout_seconds
+            );
            self.params.installed_extensions_collection_interval.store(
                spec.suspend_timeout_seconds as u64,
                std::sync::atomic::Ordering::SeqCst,
--- a/compute_tools/src/compute_prewarm.rs
+++ b/compute_tools/src/compute_prewarm.rs
@@ -70,7 +70,7 @@ impl ComputeNode {
            }
        };
        let row = match client
-            .query_one("select * from neon.get_prewarm_info()", &[])
+            .query_one("select * from get_prewarm_info()", &[])
            .await
        {
            Ok(row) => row,
@@ -105,8 +105,7 @@ impl ComputeNode {
                cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Completed;
                return;
            };
-            crate::metrics::LFC_PREWARM_ERRORS.inc();
-            error!(%err, "prewarming lfc");
+            error!(%err, "prewarming lfc");
            cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Failed {
                error: err.to_string(),
            };
@@ -146,7 +145,7 @@ impl ComputeNode {
        ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
            .await
            .context("connecting to postgres")?
-            .query_one("select neon.prewarm_local_cache($1)", &[&uncompressed])
+            .query_one("select prewarm_local_cache($1)", &[&uncompressed])
            .await
            .context("loading LFC state into postgres")
            .map(|_| ())
@@ -181,8 +180,7 @@ impl ComputeNode {
            self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed;
            return;
        };
-        crate::metrics::LFC_OFFLOAD_ERRORS.inc();
-        error!(%err, "offloading lfc");
+        error!(%err, "offloading lfc");
        self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed {
            error: err.to_string(),
        };
@@ -196,7 +194,7 @@ impl ComputeNode {
        ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
            .await
            .context("connecting to postgres")?
-            .query_one("select neon.get_local_cache_state()", &[])
+            .query_one("select get_local_cache_state()", &[])
            .await
            .context("querying LFC state")?
            .try_get::<usize, &[u8]>(0)
--- a/compute_tools/src/compute_promote.rs
+++ b/compute_tools/src/compute_promote.rs
@@ -1,132 +0,0 @@
-use crate::compute::ComputeNode;
-use anyhow::{Context, Result, bail};
-use compute_api::{
-    responses::{LfcPrewarmState, PromoteState, SafekeepersLsn},
-    spec::ComputeMode,
-};
-use std::{sync::Arc, time::Duration};
-use tokio::time::sleep;
-use utils::lsn::Lsn;
-
-impl ComputeNode {
-    /// Returns only when promote fails or succeeds. If a network error occurs
-    /// and http client disconnects, this does not stop promotion, and subsequent
-    /// calls block until promote finishes.
-    /// Called by control plane on secondary after primary endpoint is terminated
-    pub async fn promote(self: &Arc<Self>, safekeepers_lsn: SafekeepersLsn) -> PromoteState {
-        let cloned = self.clone();
-        let start_promotion = || {
-            let (tx, rx) = tokio::sync::watch::channel(PromoteState::NotPromoted);
-            tokio::spawn(async move {
-                tx.send(match cloned.promote_impl(safekeepers_lsn).await {
-                    Ok(_) => PromoteState::Completed,
-                    Err(err) => {
-                        tracing::error!(%err, "promoting");
-                        PromoteState::Failed {
-                            error: err.to_string(),
-                        }
-                    }
-                })
-            });
-            rx
-        };
-
-        let mut task;
-        // self.state is unlocked after block ends so we lock it in promote_impl
-        // and task.changed() is reached
-        {
-            task = self
-                .state
-                .lock()
-                .unwrap()
-                .promote_state
-                .get_or_insert_with(start_promotion)
-                .clone()
-        }
-        task.changed().await.expect("promote sender dropped");
-        task.borrow().clone()
-    }
-
-    // Why do we have to supply safekeepers?
-    // For secondary we use primary_connection_conninfo so safekeepers field is empty
-    async fn promote_impl(&self, safekeepers_lsn: SafekeepersLsn) -> Result<()> {
-        {
-            let state = self.state.lock().unwrap();
-            let mode = &state.pspec.as_ref().unwrap().spec.mode;
-            if *mode != ComputeMode::Replica {
-                bail!("{} is not replica", mode.to_type_str());
-            }
-
-            // we don't need to query Postgres so not self.lfc_prewarm_state()
-            match &state.lfc_prewarm_state {
-                LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming => {
-                    bail!("prewarm not requested or pending")
-                }
-                LfcPrewarmState::Failed { error } => {
-                    tracing::warn!(%error, "replica prewarm failed")
-                }
-                _ => {}
-            }
-        }
-
-        let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
-            .await
-            .context("connecting to postgres")?;
-
-        let primary_lsn = safekeepers_lsn.wal_flush_lsn;
-        let mut last_wal_replay_lsn: Lsn = Lsn::INVALID;
-        const RETRIES: i32 = 20;
-        for i in 0..=RETRIES {
-            let row = client
-                .query_one("SELECT pg_last_wal_replay_lsn()", &[])
-                .await
-                .context("getting last replay lsn")?;
-            let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
-            last_wal_replay_lsn = lsn.into();
-            if last_wal_replay_lsn >= primary_lsn {
-                break;
-            }
-            tracing::info!("Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}");
-            sleep(Duration::from_secs(1)).await;
-        }
-        if last_wal_replay_lsn < primary_lsn {
-            bail!("didn't catch up with primary in {RETRIES} retries");
-        }
-
-        // using $1 doesn't work with ALTER SYSTEM SET
-        let safekeepers_sql = format!(
-            "ALTER SYSTEM SET neon.safekeepers='{}'",
-            safekeepers_lsn.safekeepers
-        );
-        client
-            .query(&safekeepers_sql, &[])
-            .await
-            .context("setting safekeepers")?;
-        client
-            .query("SELECT pg_reload_conf()", &[])
-            .await
-            .context("reloading postgres config")?;
-        let row = client
-            .query_one("SELECT * FROM pg_promote()", &[])
-            .await
-            .context("pg_promote")?;
-        if !row.get::<usize, bool>(0) {
-            bail!("pg_promote() returned false");
-        }
-
-        let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
-            .await
-            .context("connecting to postgres")?;
-        let row = client
-            .query_one("SHOW transaction_read_only", &[])
-            .await
-            .context("getting transaction_read_only")?;
-        if row.get::<usize, &str>(0) == "on" {
-            bail!("replica in read only mode after promotion");
-        }
-
-        let mut state = self.state.lock().unwrap();
-        state.pspec.as_mut().unwrap().spec.mode = ComputeMode::Primary;
-        Ok(())
-    }
-}
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -56,9 +56,51 @@ pub fn write_postgres_conf(

    // Add options for connecting to storage
    writeln!(file, "# Neon storage settings")?;
-    if let Some(s) = &spec.pageserver_connstring {
-        writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
+
+    if let Some(conninfo) = &spec.pageserver_connection_info {
+        let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
+        let mut grpc_urls: Option<Vec<String>> = Some(Vec::new());
+
+        for shardno in 0..conninfo.shards.len() {
+            let info = conninfo.shards.get(&(shardno as u32)).ok_or_else(|| {
+                anyhow::anyhow!("shard {shardno} missing from pageserver_connection_info shard map")
+            })?;
+
+            if let Some(url) = &info.libpq_url {
+                if let Some(ref mut urls) = libpq_urls {
+                    urls.push(url.clone());
+                }
+            } else {
+                libpq_urls = None
+            }
+            if let Some(url) = &info.grpc_url {
+                if let Some(ref mut urls) = grpc_urls {
+                    urls.push(url.clone());
+                }
+            } else {
+                grpc_urls = None
+            }
+        }
+        if let Some(libpq_urls) = libpq_urls {
+            writeln!(
+                file,
+                "neon.pageserver_connstring={}",
+                escape_conf_value(&libpq_urls.join(","))
+            )?;
+        } else {
+            writeln!(file, "# no neon.pageserver_connstring")?;
+        }
+        if let Some(grpc_urls) = grpc_urls {
+            writeln!(
+                file,
+                "neon.pageserver_grpc_urls={}",
+                escape_conf_value(&grpc_urls.join(","))
+            )?;
+        } else {
+            writeln!(file, "# no neon.pageserver_grpc_urls")?;
+        }
    }
+
    if let Some(stripe_size) = spec.shard_stripe_size {
        writeln!(file, "neon.stripe_size={stripe_size}")?;
    }
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -83,87 +83,6 @@ paths:
              schema:
                $ref: "#/components/schemas/DbsAndRoles"

-  /promote:
-    post:
-      tags:
-        - Promotion
-      summary: Promote secondary replica to primary
-      description: ""
-      operationId: promoteReplica
-      requestBody:
-        description: Promote requests data
-        required: true
-        content:
-          application/json:
-            schema:
-                $ref: "#/components/schemas/SafekeepersLsn"
-      responses:
-        200:
-          description: Promote succeeded or wasn't started
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/PromoteState"
-        500:
-          description: Promote failed
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/PromoteState"
-
-  /lfc/prewarm:
-    post:
-      summary: Request LFC Prewarm
-      parameters:
-        - name: from_endpoint
-          in: query
-          schema:
-            type: string
-      description: ""
-      operationId: lfcPrewarm
-      responses:
-        202:
-          description: LFC prewarm started
-        429:
-          description: LFC prewarm ongoing
-    get:
-      tags:
-        - Prewarm
-      summary: Get LFC prewarm state
-      description: ""
-      operationId: getLfcPrewarmState
-      responses:
-        200:
-          description: Prewarm state
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/LfcPrewarmState"
-
-  /lfc/offload:
-    post:
-      summary: Request LFC offload
-      description: ""
-      operationId: lfcOffload
-      responses:
-        202:
-          description: LFC offload started
-        429:
-          description: LFC offload ongoing
-    get:
-      tags:
-        - Prewarm
-      summary: Get LFC offloading state
-      description: ""
-      operationId: getLfcOffloadState
-      responses:
-        200:
-          description: Offload state
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/LfcOffloadState"
-
  /database_schema:
    get:
      tags:
@@ -371,28 +290,9 @@ paths:
      summary: Terminate Postgres and wait for it to exit
      description: ""
      operationId: terminate
-      parameters:
-        - name: mode
-          in: query
-          description: "Terminate mode: fast (wait 30s before returning) and immediate"
-          required: false
-          schema:
-            type: string
-            enum: ["fast", "immediate"]
-            default: fast
      responses:
        200:
          description: Result
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/TerminateResponse"
-        201:
-          description: Result if compute is already terminated
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/TerminateResponse"
        412:
          description: "wrong state"
          content:
@@ -435,6 +335,15 @@ components:
        total_startup_ms:
          type: integer

+    Info:
+      type: object
+      description: Information about VM/Pod.
+      required:
+        - num_cpus
+      properties:
+        num_cpus:
+          type: integer
+
    DbsAndRoles:
      type: object
      description: Databases and Roles
@@ -549,14 +458,11 @@ components:
      type: string
      enum:
        - empty
-        - configuration_pending
        - init
-        - running
-        - configuration
        - failed
-        - termination_pending_fast
-        - termination_pending_immediate
-        - terminated
+        - running
+        - configuration_pending
+        - configuration
      example: running

    ExtensionInstallRequest:
@@ -591,69 +497,25 @@ components:
          type: string
          example: "1.0.0"

-    SafekeepersLsn:
+    InstalledExtensions:
      type: object
-      required:
-        - safekeepers
-        - wal_flush_lsn
      properties:
-        safekeepers:
-          description: Primary replica safekeepers
-          type: string
-        wal_flush_lsn:
-          description: Primary last WAL flush LSN
-          type: string
-
-    LfcPrewarmState:
-      type: object
-      required:
-        - status
-        - total
-        - prewarmed
-        - skipped
-      properties:
-        status:
-          description: Lfc prewarm status
-          enum: [not_prewarmed, prewarming, completed, failed]
-          type: string
-        error:
-          description: Lfc prewarm error, if any
-          type: string
-        total:
-          description: Total pages processed
-          type: integer
-        prewarmed:
-          description: Total pages prewarmed
-          type: integer
-        skipped:
-          description: Pages processed but not prewarmed
-          type: integer
-
-    LfcOffloadState:
-      type: object
-      required:
-        - status
-      properties:
-        status:
-          description: Lfc offload status
-          enum: [not_offloaded, offloading, completed, failed]
-          type: string
-        error:
-          description: Lfc offload error, if any
-          type: string
-
-    PromoteState:
-      type: object
-      required:
-        - status
-      properties:
-        status:
-          description: Promote result
-          enum: [not_promoted, completed, failed]
-          type: string
-        error:
-          description: Promote error, if any
-          type: string
+        extensions:
+          description: Contains list of installed extensions.
+          type: array
+          items:
+            type: object
+            properties:
+              extname:
+                type: string
+              version:
+                type: string
+                description: Installed version of the extension.
+                example: "1.0.0"
+              n_databases:
+                type: integer
+              owned_by_superuser:
+                type: integer

    SetRoleGrantsRequest:
      type: object
@@ -682,17 +544,6 @@ components:
          description: Role name.
          example: "neon"

-    TerminateResponse:
-      type: object
-      required:
-        - lsn
-      properties:
-        lsn:
-          type: string
-          nullable: true
-          description: "last WAL flush LSN"
-          example: "0/028F10D8"
-
    SetRoleGrantsResponse:
      type: object
      required:
--- a/compute_tools/src/http/routes/mod.rs
+++ b/compute_tools/src/http/routes/mod.rs
@@ -14,8 +14,6 @@ pub(in crate::http) mod insights;
 pub(in crate::http) mod lfc;
 pub(in crate::http) mod metrics;
 pub(in crate::http) mod metrics_json;
-pub(in crate::http) mod promote;
-pub(in crate::http) mod profile;
 pub(in crate::http) mod status;
 pub(in crate::http) mod terminate;

--- a/compute_tools/src/http/routes/profile.rs
+++ b/compute_tools/src/http/routes/profile.rs
@@ -1,217 +0,0 @@
-//! Contains the route for profiling the compute.
-//!
-//! Profiling the compute means generating a pprof profile of the
-//! postgres processes.
-//!
-//! The profiling is done using the `perf` tool, which is expected to be
-//! available somewhere in `$PATH`.
-use std::sync::atomic::Ordering;
-
-use axum::Json;
-use axum::response::IntoResponse;
-use http::StatusCode;
-use nix::unistd::Pid;
-use once_cell::sync::Lazy;
-use tokio::sync::Mutex;
-
-use crate::http::JsonResponse;
-
-static CANCEL_CHANNEL: Lazy<Mutex<Option<tokio::sync::broadcast::Sender<()>>>> =
-    Lazy::new(|| Mutex::new(None));
-
-fn default_sampling_frequency() -> u16 {
-    100
-}
-
-fn default_timeout_seconds() -> u8 {
-    5
-}
-
-fn deserialize_sampling_frequency<'de, D>(deserializer: D) -> Result<u16, D::Error>
-where
-    D: serde::Deserializer<'de>,
-{
-    use serde::Deserialize;
-
-    const MIN_SAMPLING_FREQUENCY: u16 = 1;
-    const MAX_SAMPLING_FREQUENCY: u16 = 1000;
-
-    let value = u16::deserialize(deserializer)?;
-
-    if !(MIN_SAMPLING_FREQUENCY..=MAX_SAMPLING_FREQUENCY).contains(&value) {
-        return Err(serde::de::Error::custom(format!(
-            "sampling_frequency must be between {MIN_SAMPLING_FREQUENCY} and {MAX_SAMPLING_FREQUENCY}, got {value}"
-        )));
-    }
-    Ok(value)
-}
-
-fn deserialize_profiling_timeout<'de, D>(deserializer: D) -> Result<u8, D::Error>
-where
-    D: serde::Deserializer<'de>,
-{
-    use serde::Deserialize;
-
-    const MIN_TIMEOUT_SECONDS: u8 = 1;
-    const MAX_TIMEOUT_SECONDS: u8 = 60;
-
-    let value = u8::deserialize(deserializer)?;
-
-    if !(MIN_TIMEOUT_SECONDS..=MAX_TIMEOUT_SECONDS).contains(&value) {
-        return Err(serde::de::Error::custom(format!(
-            "timeout_seconds must be between {MIN_TIMEOUT_SECONDS} and {MAX_TIMEOUT_SECONDS}, got {value}"
-        )));
-    }
-    Ok(value)
-}
-
-/// Request parameters for profiling the compute.
-#[derive(Debug, Clone, serde::Deserialize)]
-pub(in crate::http) struct ProfileRequest {
-    /// The profiling tool to use, currently only `perf` is supported.
-    profiler: crate::profiling::ProfileGenerator,
-    #[serde(default = "default_sampling_frequency")]
-    #[serde(deserialize_with = "deserialize_sampling_frequency")]
-    sampling_frequency: u16,
-    #[serde(default = "default_timeout_seconds")]
-    #[serde(deserialize_with = "deserialize_profiling_timeout")]
-    timeout_seconds: u8,
-    #[serde(default)]
-    archive: bool,
-}
-
-/// The HTTP request handler for reporting the profiling status of
-/// the compute.
-pub(in crate::http) async fn profile_status() -> impl IntoResponse {
-    tracing::info!("Profile status request received.");
-
-    let cancel_channel = CANCEL_CHANNEL.lock().await;
-
-    if let Some(tx) = cancel_channel.as_ref() {
-        if tx.receiver_count() > 0 {
-            return JsonResponse::create_response(
-                StatusCode::OK,
-                "Profiling is currently in progress.",
-            );
-        }
-    }
-
-    JsonResponse::create_response(StatusCode::NO_CONTENT, "Profiling is not in progress.")
-}
-
-/// The HTTP request handler for stopping profiling the compute.
-pub(in crate::http) async fn profile_stop() -> impl IntoResponse {
-    tracing::info!("Profile stop request received.");
-
-    match CANCEL_CHANNEL.lock().await.take() {
-        Some(tx) => {
-            if tx.send(()).is_err() {
-                tracing::error!("Failed to send cancellation signal.");
-                return JsonResponse::create_response(
-                    StatusCode::INTERNAL_SERVER_ERROR,
-                    "Failed to send cancellation signal",
-                );
-            }
-            JsonResponse::create_response(StatusCode::OK, "Profiling stopped successfully.")
-        }
-        None => JsonResponse::create_response(
-            StatusCode::PRECONDITION_FAILED,
-            "Profiling is not in progress, there is nothing to stop.",
-        ),
-    }
-}
-
-/// The HTTP request handler for starting profiling the compute.
-pub(in crate::http) async fn profile_start(
-    Json(request): Json<ProfileRequest>,
-) -> impl IntoResponse {
-    tracing::info!("Profile start request received: {request:?}");
-
-    let tx = tokio::sync::broadcast::Sender::<()>::new(1);
-
-    {
-        let mut cancel_channel = CANCEL_CHANNEL.lock().await;
-
-        if cancel_channel.is_some() {
-            return JsonResponse::create_response(
-                StatusCode::CONFLICT,
-                "Profiling is already in progress.",
-            );
-        }
-        *cancel_channel = Some(tx.clone());
-    }
-
-    tracing::info!("Profiling will start with parameters: {request:?}");
-    let pg_pid = Pid::from_raw(crate::compute::PG_PID.load(Ordering::SeqCst) as _);
-
-    let run_with_sudo = !cfg!(feature = "testing");
-
-    let options = crate::profiling::ProfileGenerationOptions {
-        profiler: request.profiler,
-        run_with_sudo,
-        pids: [pg_pid].into_iter().collect(),
-        follow_forks: true,
-        sampling_frequency: request.sampling_frequency as u32,
-        blocklist_symbols: vec![
-            "libc".to_owned(),
-            "libgcc".to_owned(),
-            "pthread".to_owned(),
-            "vdso".to_owned(),
-        ],
-        archive: request.archive,
-    };
-
-    let options = crate::profiling::ProfileGenerationTaskOptions {
-        options,
-        timeout: std::time::Duration::from_secs(request.timeout_seconds as u64),
-        should_stop: Some(tx),
-    };
-
-    let pprof_data = crate::profiling::generate_pprof_profile(options).await;
-
-    if CANCEL_CHANNEL.lock().await.take().is_none() {
-        tracing::error!("Profiling was cancelled from another request.");
-
-        return JsonResponse::create_response(
-            StatusCode::NO_CONTENT,
-            "Profiling was cancelled from another request.",
-        );
-    }
-
-    let pprof_data = match pprof_data {
-        Ok(data) => data,
-        Err(e) => {
-            tracing::error!(error = ?e, "failed to generate pprof data");
-            return JsonResponse::create_response(
-                StatusCode::INTERNAL_SERVER_ERROR,
-                format!("Failed to generate pprof data: {e:?}"),
-            );
-        }
-    };
-
-    tracing::info!("Profiling has completed successfully.");
-
-    let mut headers = http::HeaderMap::new();
-
-    if request.archive {
-        headers.insert(
-            http::header::CONTENT_TYPE,
-            http::HeaderValue::from_static("application/gzip"),
-        );
-        headers.insert(
-            http::header::CONTENT_DISPOSITION,
-            http::HeaderValue::from_static("attachment; filename=\"profile.pb.gz\""),
-        );
-    } else {
-        headers.insert(
-            http::header::CONTENT_TYPE,
-            http::HeaderValue::from_static("application/octet-stream"),
-        );
-        headers.insert(
-            http::header::CONTENT_DISPOSITION,
-            http::HeaderValue::from_static("attachment; filename=\"profile.pb\""),
-        );
-    }
-
-    (headers, pprof_data.0).into_response()
-}
--- a/compute_tools/src/http/routes/promote.rs
+++ b/compute_tools/src/http/routes/promote.rs
@@ -1,14 +0,0 @@
-use crate::http::JsonResponse;
-use axum::Form;
-use http::StatusCode;
-
-pub(in crate::http) async fn promote(
-    compute: axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>,
-    Form(safekeepers_lsn): Form<compute_api::responses::SafekeepersLsn>,
-) -> axum::response::Response {
-    let state = compute.promote(safekeepers_lsn).await;
-    if let compute_api::responses::PromoteState::Failed { error } = state {
-        return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, error);
-    }
-    JsonResponse::success(StatusCode::OK, state)
-}
--- a/compute_tools/src/http/routes/terminate.rs
+++ b/compute_tools/src/http/routes/terminate.rs
@@ -3,7 +3,7 @@ use crate::http::JsonResponse;
 use axum::extract::State;
 use axum::response::Response;
 use axum_extra::extract::OptionalQuery;
-use compute_api::responses::{ComputeStatus, TerminateMode, TerminateResponse};
+use compute_api::responses::{ComputeStatus, TerminateResponse};
 use http::StatusCode;
 use serde::Deserialize;
 use std::sync::Arc;
@@ -12,7 +12,7 @@ use tracing::info;

 #[derive(Deserialize, Default)]
 pub struct TerminateQuery {
-    mode: TerminateMode,
+    mode: compute_api::responses::TerminateMode,
 }

 /// Terminate the compute.
@@ -24,16 +24,16 @@ pub(in crate::http) async fn terminate(
    {
        let mut state = compute.state.lock().unwrap();
        if state.status == ComputeStatus::Terminated {
-            let response = TerminateResponse {
-                lsn: state.terminate_flush_lsn,
-            };
-            return JsonResponse::success(StatusCode::CREATED, response);
+            return JsonResponse::success(StatusCode::CREATED, state.terminate_flush_lsn);
        }

        if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
            return JsonResponse::invalid_status(state.status);
        }
-        state.set_status(mode.into(), &compute.state_changed);
+        state.set_status(
+            ComputeStatus::TerminationPending { mode },
+            &compute.state_changed,
+        );
    }

    forward_termination_signal(false);
--- a/compute_tools/src/http/server.rs
+++ b/compute_tools/src/http/server.rs
@@ -23,11 +23,10 @@ use super::{
    middleware::authorize::Authorize,
    routes::{
        check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
-        grants, insights, lfc, metrics, metrics_json, promote, status, terminate,
+        grants, insights, lfc, metrics, metrics_json, status, terminate,
    },
 };
 use crate::compute::ComputeNode;
-use crate::http::routes::profile;

 /// `compute_ctl` has two servers: internal and external. The internal server
 /// binds to the loopback interface and handles communication from clients on
@@ -82,19 +81,12 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
            Server::External {
                config, compute_id, ..
            } => {
-                let unauthenticated_router = Router::<Arc<ComputeNode>>::new()
-                    .route("/metrics", get(metrics::get_metrics))
-                    .route(
-                        "/profile/cpu",
-                        get(profile::profile_status)
-                            .post(profile::profile_start)
-                            .delete(profile::profile_stop),
-                    );
+                let unauthenticated_router =
+                    Router::<Arc<ComputeNode>>::new().route("/metrics", get(metrics::get_metrics));

                let authenticated_router = Router::<Arc<ComputeNode>>::new()
                    .route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm))
                    .route("/lfc/offload", get(lfc::offload_state).post(lfc::offload))
-                    .route("/promote", post(promote::promote))
                    .route("/check_writability", post(check_writability::is_writable))
                    .route("/configure", post(configure::configure))
                    .route("/database_schema", get(database_schema::get_schema_dump))
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -12,7 +12,6 @@ pub mod logger;
 pub mod catalog;
 pub mod compute;
 pub mod compute_prewarm;
-pub mod compute_promote;
 pub mod disk_quota;
 pub mod extension_server;
 pub mod installed_extensions;
@@ -24,7 +23,6 @@ pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
 pub mod pgbouncer;
-pub mod profiling;
 pub mod rsyslog;
 pub mod spec;
 mod spec_apply;
--- a/compute_tools/src/lsn_lease.rs
+++ b/compute_tools/src/lsn_lease.rs
@@ -4,8 +4,7 @@ use std::thread;
 use std::time::{Duration, SystemTime};

 use anyhow::{Result, bail};
-use compute_api::spec::{ComputeMode, PageserverProtocol};
-use itertools::Itertools as _;
+use compute_api::spec::{ComputeMode, PageserverConnectionInfo};
 use pageserver_page_api as page_api;
 use postgres::{NoTls, SimpleQueryMessage};
 use tracing::{info, warn};
@@ -78,17 +77,16 @@ fn acquire_lsn_lease_with_retry(

    loop {
        // Note: List of pageservers is dynamic, need to re-read configs before each attempt.
-        let (connstrings, auth) = {
+        let (conninfo, auth) = {
            let state = compute.state.lock().unwrap();
            let spec = state.pspec.as_ref().expect("spec must be set");
            (
-                spec.pageserver_connstr.clone(),
+                spec.pageserver_conninfo.clone(),
                spec.storage_auth_token.clone(),
            )
        };

-        let result =
-            try_acquire_lsn_lease(&connstrings, auth.as_deref(), tenant_id, timeline_id, lsn);
+        let result = try_acquire_lsn_lease(conninfo, auth.as_deref(), tenant_id, timeline_id, lsn);
        match result {
            Ok(Some(res)) => {
                return Ok(res);
@@ -112,17 +110,16 @@ fn acquire_lsn_lease_with_retry(

 /// Tries to acquire LSN leases on all Pageserver shards.
 fn try_acquire_lsn_lease(
-    connstrings: &str,
+    conninfo: PageserverConnectionInfo,
    auth: Option<&str>,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    lsn: Lsn,
 ) -> Result<Option<SystemTime>> {
-    let connstrings = connstrings.split(',').collect_vec();
-    let shard_count = connstrings.len();
+    let shard_count = conninfo.shards.len();
    let mut leases = Vec::new();

-    for (shard_number, &connstring) in connstrings.iter().enumerate() {
+    for (shard_number, shard) in conninfo.shards.into_iter() {
        let tenant_shard_id = match shard_count {
            0 | 1 => TenantShardId::unsharded(tenant_id),
            shard_count => TenantShardId {
@@ -132,13 +129,18 @@ fn try_acquire_lsn_lease(
            },
        };

-        let lease = match PageserverProtocol::from_connstring(connstring)? {
-            PageserverProtocol::Libpq => {
-                acquire_lsn_lease_libpq(connstring, auth, tenant_shard_id, timeline_id, lsn)?
-            }
-            PageserverProtocol::Grpc => {
-                acquire_lsn_lease_grpc(connstring, auth, tenant_shard_id, timeline_id, lsn)?
-            }
+        // A shard entry may lack the URL for the selected protocol. Return an
+        // error to the caller in that case instead of panicking.
+        let lease = if conninfo.prefer_grpc {
+            let Some(grpc_url) = shard.grpc_url.as_deref() else {
+                bail!("shard {tenant_shard_id} has no gRPC URL, but gRPC is preferred");
+            };
+            acquire_lsn_lease_grpc(grpc_url, auth, tenant_shard_id, timeline_id, lsn)?
+        } else {
+            let Some(libpq_url) = shard.libpq_url.as_deref() else {
+                bail!("shard {tenant_shard_id} has no libpq URL");
+            };
+            acquire_lsn_lease_libpq(libpq_url, auth, tenant_shard_id, timeline_id, lsn)?
        };
        leases.push(lease);
    }
--- a/compute_tools/src/metrics.rs
+++ b/compute_tools/src/metrics.rs
@@ -105,14 +105,6 @@ pub(crate) static LFC_PREWARMS: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static LFC_PREWARM_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "compute_ctl_lfc_prewarm_errors_total",
-        "Total number of LFC prewarm errors",
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "compute_ctl_lfc_offloads_total",
@@ -121,14 +113,6 @@ pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static LFC_OFFLOAD_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "compute_ctl_lfc_offload_errors_total",
-        "Total number of LFC offload errors",
-    )
-    .expect("failed to define a metric")
-});
-
 pub fn collect() -> Vec<MetricFamily> {
    let mut metrics = COMPUTE_CTL_UP.collect();
    metrics.extend(INSTALLED_EXTENSIONS.collect());
@@ -139,8 +123,6 @@ pub fn collect() -> Vec<MetricFamily> {
    metrics.extend(PG_CURR_DOWNTIME_MS.collect());
    metrics.extend(PG_TOTAL_DOWNTIME_MS.collect());
    metrics.extend(LFC_PREWARMS.collect());
-    metrics.extend(LFC_PREWARM_ERRORS.collect());
    metrics.extend(LFC_OFFLOADS.collect());
-    metrics.extend(LFC_OFFLOAD_ERRORS.collect());
    metrics
 }
--- a/compute_tools/src/migrations/0002-alter_roles.sql
+++ b/compute_tools/src/migrations/0002-alter_roles.sql
@@ -1,16 +1,3 @@
-- On December 8th, 2023, an engineering escalation (INC-110) was opened after
-- it was found that BYPASSRLS was being applied to all roles.
--
-- PR that introduced the issue: https://github.com/neondatabase/neon/pull/5657
-- Subsequent commit on main: https://github.com/neondatabase/neon/commit/ad99fa5f0393e2679e5323df653c508ffa0ac072
--
-- NOBYPASSRLS and INHERIT are the defaults for a Postgres role, but because it
-- isn't easy to know if a Postgres cluster is affected by the issue, we need to
-- keep the migration around for a long time, if not indefinitely, so any
-- cluster can be fixed.
--
-- Branching is the gift that keeps on giving...
-
 DO $$
 DECLARE
    role_name text;
--- a/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql
@@ -1 +0,0 @@
-GRANT pg_signal_backend TO neon_superuser WITH ADMIN OPTION;
--- a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql
@@ -7,17 +7,15 @@ BEGIN
        INTO monitor
        FROM pg_auth_members
        WHERE roleid = 'pg_monitor'::regrole
            AND member = 'neon_superuser'::regrole;

-    IF monitor IS NULL THEN
-        RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor';
-    END IF;
-
-    IF monitor.admin IS NULL OR NOT monitor.member THEN
+    -- When no row matches, every field of the record is NULL; IS NOT TRUE
+    -- treats NULL as a failure, whereas a bare NOT would silently pass.
+    IF monitor.member IS NOT TRUE THEN
        RAISE EXCEPTION 'neon_superuser is not a member of pg_monitor';
    END IF;

-    IF monitor.admin IS NULL OR NOT monitor.admin THEN
+    IF monitor.admin IS NOT TRUE THEN
        RAISE EXCEPTION 'neon_superuser cannot grant pg_monitor';
    END IF;
 END $$;
--- a/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql
@@ -1,23 +0,0 @@
-DO $$
-DECLARE
-    signal_backend record;
-BEGIN
-    SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
-            admin_option AS admin
-        INTO signal_backend
-        FROM pg_auth_members
-        WHERE roleid = 'pg_signal_backend'::regrole
-            AND member = 'neon_superuser'::regrole;
-
-    IF signal_backend IS NULL THEN
-        RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_signal_backend';
-    END IF;
-
-    IF signal_backend.member IS NULL OR NOT signal_backend.member THEN
-        RAISE EXCEPTION 'neon_superuser is not a member of pg_signal_backend';
-    END IF;
-
-    IF signal_backend.admin IS NULL OR NOT signal_backend.admin THEN
-        RAISE EXCEPTION 'neon_superuser cannot grant pg_signal_backend';
-    END IF;
-END $$;
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -84,8 +84,7 @@ impl ComputeMonitor {
        if matches!(
            compute_status,
            ComputeStatus::Terminated
-                | ComputeStatus::TerminationPendingFast
-                | ComputeStatus::TerminationPendingImmediate
+                | ComputeStatus::TerminationPending { .. }
                | ComputeStatus::Failed
        ) {
            info!(
--- a/compute_tools/src/profiling/mod.rs
+++ b/compute_tools/src/profiling/mod.rs
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -197,7 +197,6 @@ pub async fn handle_migrations(client: &mut Client) -> Result<()> {
        include_str!(
            "./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql"
        ),
-        include_str!("./migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql"),
    ];

    MigrationRunner::new(client, &migrations)
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -16,7 +16,7 @@ use std::time::Duration;
 use anyhow::{Context, Result, anyhow, bail};
 use clap::Parser;
 use compute_api::requests::ComputeClaimsScope;
-use compute_api::spec::{ComputeMode, PageserverProtocol};
+use compute_api::spec::{ComputeMode, PageserverConnectionInfo, PageserverShardConnectionInfo};
 use control_plane::broker::StorageBroker;
 use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode};
 use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
@@ -1516,29 +1516,35 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                )?;
            }

-            let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
-                let conf = env.get_pageserver_conf(pageserver_id).unwrap();
-                // Use gRPC if requested.
-                let pageserver = if endpoint.grpc {
-                    let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
-                    let (host, port) = parse_host_port(grpc_addr)?;
-                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
-                    (PageserverProtocol::Grpc, host, port)
-                } else {
+            let (shards, stripe_size) = if let Some(ps_id) = pageserver_id {
+                let conf = env.get_pageserver_conf(ps_id).unwrap();
+                let libpq_url = Some({
                    let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
                    let port = port.unwrap_or(5432);
-                    (PageserverProtocol::Libpq, host, port)
+                    format!("postgres://no_user@{host}:{port}")
+                });
+                let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
+                    let (host, port) = parse_host_port(grpc_addr)?;
+                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
+                    Some(format!("grpc://no_user@{host}:{port}"))
+                } else {
+                    None
                };
+                let pageserver = PageserverShardConnectionInfo {
+                    libpq_url,
+                    grpc_url,
+                };
+
                // If caller is telling us what pageserver to use, this is not a tenant which is
                // fully managed by storage controller, therefore not sharded.
-                (vec![pageserver], DEFAULT_STRIPE_SIZE)
+                (vec![(0, pageserver)], DEFAULT_STRIPE_SIZE)
            } else {
                // Look up the currently attached location of the tenant, and its striping metadata,
                // to pass these on to postgres.
                let storage_controller = StorageController::from_env(env);
                let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
-                let pageservers = futures::future::try_join_all(
-                    locate_result.shards.into_iter().map(|shard| async move {
+                let shards = futures::future::try_join_all(locate_result.shards.into_iter().map(
+                    |shard| async move {
                        if let ComputeMode::Static(lsn) = endpoint.mode {
                            // Initialize LSN leases for static computes.
                            let conf = env.get_pageserver_conf(shard.node_id).unwrap();
@@ -1550,28 +1556,34 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                                .await?;
                        }

-                        let pageserver = if endpoint.grpc {
-                            (
-                                PageserverProtocol::Grpc,
-                                Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))?,
-                                shard.listen_grpc_port.expect("no gRPC port"),
-                            )
+                        let libpq_host = Host::parse(&shard.listen_pg_addr)?;
+                        let libpq_port = shard.listen_pg_port;
+                        let libpq_url =
+                            Some(format!("postgres://no_user@{libpq_host}:{libpq_port}"));
+
+                        let grpc_url = if let Some(grpc_host) = shard.listen_grpc_addr {
+                            let grpc_port = shard.listen_grpc_port.expect("no gRPC port");
+                            Some(format!("grpc://no_user@{grpc_host}:{grpc_port}"))
                        } else {
-                            (
-                                PageserverProtocol::Libpq,
-                                Host::parse(&shard.listen_pg_addr)?,
-                                shard.listen_pg_port,
-                            )
+                            None
                        };
-                        anyhow::Ok(pageserver)
-                    }),
-                )
+                        let pageserver = PageserverShardConnectionInfo {
+                            libpq_url,
+                            grpc_url,
+                        };
+                        anyhow::Ok((shard.shard_id.shard_number.0 as u32, pageserver))
+                    },
+                ))
                .await?;
                let stripe_size = locate_result.shard_params.stripe_size;

-                (pageservers, stripe_size)
+                (shards, stripe_size)
+            };
+            assert!(!shards.is_empty());
+            let pageserver_conninfo = PageserverConnectionInfo {
+                shards: shards.into_iter().collect(),
+                prefer_grpc: endpoint.grpc,
            };
-            assert!(!pageservers.is_empty());

            let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?;
            let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
@@ -1601,7 +1613,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                endpoint_storage_addr,
                safekeepers_generation,
                safekeepers,
-                pageservers,
+                pageserver_conninfo,
                remote_ext_base_url: remote_ext_base_url.clone(),
                shard_stripe_size: stripe_size.0 as usize,
                create_test_user: args.create_test_user,
@@ -1620,20 +1632,27 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                .endpoints
                .get(endpoint_id.as_str())
                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
-            let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id {
+            let shards = if let Some(ps_id) = args.endpoint_pageserver_id {
                let conf = env.get_pageserver_conf(ps_id)?;
-                // Use gRPC if requested.
-                let pageserver = if endpoint.grpc {
-                    let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
-                    let (host, port) = parse_host_port(grpc_addr)?;
-                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
-                    (PageserverProtocol::Grpc, host, port)
-                } else {
+                let libpq_url = Some({
                    let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
                    let port = port.unwrap_or(5432);
-                    (PageserverProtocol::Libpq, host, port)
+                    format!("postgres://no_user@{host}:{port}")
+                });
+                let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
+                    let (host, port) = parse_host_port(grpc_addr)?;
+                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
+                    Some(format!("grpc://no_user@{host}:{port}"))
+                } else {
+                    None
                };
-                vec![pageserver]
+                let pageserver = PageserverShardConnectionInfo {
+                    libpq_url,
+                    grpc_url,
+                };
+                // If caller is telling us what pageserver to use, this is not a tenant which is
+                // fully managed by storage controller, therefore not sharded.
+                vec![(0, pageserver)]
            } else {
                let storage_controller = StorageController::from_env(env);
                storage_controller
@@ -1643,28 +1662,36 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                    .into_iter()
                    .map(|shard| {
                        // Use gRPC if requested.
-                        if endpoint.grpc {
-                            (
-                                PageserverProtocol::Grpc,
-                                Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))
-                                    .expect("bad hostname"),
-                                shard.listen_grpc_port.expect("no gRPC port"),
-                            )
+                        let libpq_host = Host::parse(&shard.listen_pg_addr).expect("bad hostname");
+                        let libpq_port = shard.listen_pg_port;
+                        let libpq_url =
+                            Some(format!("postgres://no_user@{libpq_host}:{libpq_port}"));
+
+                        let grpc_url = if let Some(grpc_host) = shard.listen_grpc_addr {
+                            let grpc_port = shard.listen_grpc_port.expect("no gRPC port");
+                            Some(format!("grpc://no_user@{grpc_host}:{grpc_port}"))
                        } else {
-                            (
-                                PageserverProtocol::Libpq,
-                                Host::parse(&shard.listen_pg_addr).expect("bad hostname"),
-                                shard.listen_pg_port,
-                            )
-                        }
+                            None
+                        };
+                        (
+                            shard.shard_id.shard_number.0 as u32,
+                            PageserverShardConnectionInfo {
+                                libpq_url,
+                                grpc_url,
+                            },
+                        )
                    })
                    .collect::<Vec<_>>()
            };
+            let pageserver_conninfo = PageserverConnectionInfo {
+                shards: shards.into_iter().collect(),
+                prefer_grpc: endpoint.grpc,
+            };
            // If --safekeepers argument is given, use only the listed
            // safekeeper nodes; otherwise all from the env.
            let safekeepers = parse_safekeepers(&args.safekeepers)?;
            endpoint
-                .reconfigure(Some(pageservers), None, safekeepers, None)
+                .reconfigure(Some(pageserver_conninfo), None, safekeepers, None)
                .await?;
        }
        EndpointCmd::Stop(args) => {
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -56,9 +56,13 @@ use compute_api::responses::{
    TlsConfig,
 };
 use compute_api::spec::{
-    Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol,
-    PgIdent, RemoteExtSpec, Role,
+    Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
+    RemoteExtSpec, Role,
 };
+
+// re-export these, because they're used in the reconfigure() function
+pub use compute_api::spec::{PageserverConnectionInfo, PageserverShardConnectionInfo};
+
 use jsonwebtoken::jwk::{
    AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
    OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse,
@@ -74,7 +78,6 @@ use sha2::{Digest, Sha256};
 use spki::der::Decode;
 use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef};
 use tracing::debug;
-use url::Host;
 use utils::id::{NodeId, TenantId, TimelineId};

 use crate::local_env::LocalEnv;
@@ -379,7 +382,7 @@ pub struct EndpointStartArgs {
    pub endpoint_storage_addr: String,
    pub safekeepers_generation: Option<SafekeeperGeneration>,
    pub safekeepers: Vec<NodeId>,
-    pub pageservers: Vec<(PageserverProtocol, Host, u16)>,
+    pub pageserver_conninfo: PageserverConnectionInfo,
    pub remote_ext_base_url: Option<String>,
    pub shard_stripe_size: usize,
    pub create_test_user: bool,
@@ -653,14 +656,6 @@ impl Endpoint {
        }
    }

-    fn build_pageserver_connstr(pageservers: &[(PageserverProtocol, Host, u16)]) -> String {
-        pageservers
-            .iter()
-            .map(|(scheme, host, port)| format!("{scheme}://no_user@{host}:{port}"))
-            .collect::<Vec<_>>()
-            .join(",")
-    }
-
    /// Map safekeepers ids to the actual connection strings.
    fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
        let mut safekeeper_connstrings = Vec::new();
@@ -706,9 +701,6 @@ impl Endpoint {
            std::fs::remove_dir_all(self.pgdata())?;
        }

-        let pageserver_connstring = Self::build_pageserver_connstr(&args.pageservers);
-        assert!(!pageserver_connstring.is_empty());
-
        let safekeeper_connstrings = self.build_safekeepers_connstrs(args.safekeepers)?;

        // check for file remote_extensions_spec.json
@@ -767,7 +759,7 @@ impl Endpoint {
                branch_id: None,
                endpoint_id: Some(self.endpoint_id.clone()),
                mode: self.mode,
-                pageserver_connstring: Some(pageserver_connstring),
+                pageserver_connection_info: Some(args.pageserver_conninfo),
                safekeepers_generation: args.safekeepers_generation.map(|g| g.into_inner()),
                safekeeper_connstrings,
                storage_auth_token: args.auth_token.clone(),
@@ -922,8 +914,7 @@ impl Endpoint {
                        ComputeStatus::Empty
                        | ComputeStatus::ConfigurationPending
                        | ComputeStatus::Configuration
-                        | ComputeStatus::TerminationPendingFast
-                        | ComputeStatus::TerminationPendingImmediate
+                        | ComputeStatus::TerminationPending { .. }
                        | ComputeStatus::Terminated => {
                            bail!("unexpected compute status: {:?}", state.status)
                        }
@@ -981,7 +972,7 @@ impl Endpoint {

    pub async fn reconfigure(
        &self,
-        pageservers: Option<Vec<(PageserverProtocol, Host, u16)>>,
+        pageserver_conninfo: Option<PageserverConnectionInfo>,
        stripe_size: Option<ShardStripeSize>,
        safekeepers: Option<Vec<NodeId>>,
        safekeeper_generation: Option<SafekeeperGeneration>,
@@ -997,15 +988,17 @@ impl Endpoint {
        let postgresql_conf = self.read_postgresql_conf()?;
        spec.cluster.postgresql_conf = Some(postgresql_conf);

-        // If pageservers are not specified, don't change them.
-        if let Some(pageservers) = pageservers {
-            anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
-
-            let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
-            spec.pageserver_connstring = Some(pageserver_connstr);
-            if stripe_size.is_some() {
-                spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
-            }
+        if let Some(pageserver_conninfo) = pageserver_conninfo {
+            // If pageservers are provided, we need to ensure that they are not empty.
+            // This is a requirement for the compute_ctl configuration.
+            anyhow::ensure!(
+                !pageserver_conninfo.shards.is_empty(),
+                "no pageservers provided"
+            );
+            spec.pageserver_connection_info = Some(pageserver_conninfo);
+        }
+        if stripe_size.is_some() {
+            spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
        }

        // If safekeepers are not specified, don't change them.
@@ -1054,7 +1047,7 @@ impl Endpoint {

    pub async fn reconfigure_pageservers(
        &self,
-        pageservers: Vec<(PageserverProtocol, Host, u16)>,
+        pageservers: PageserverConnectionInfo,
        stripe_size: Option<ShardStripeSize>,
    ) -> Result<()> {
        self.reconfigure(Some(pageservers), stripe_size, None, None)
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -452,12 +452,6 @@ impl PageServerNode {
                .map(|x| x.parse::<usize>())
                .transpose()
                .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
-            // HADRON
-            image_layer_force_creation_period: settings
-                .remove("image_layer_force_creation_period")
-                .map(humantime::parse_duration)
-                .transpose()
-                .context("Failed to parse 'image_layer_force_creation_period' as duration")?,
            image_layer_creation_check_threshold: settings
                .remove("image_layer_creation_check_threshold")
                .map(|x| x.parse::<u8>())
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -54,16 +54,14 @@ else
    printf '%s\n' "${result}" | jq .
  fi

-  if [[ "${RUN_PARALLEL:-false}" != "true" ]]; then
-    echo "Check if a timeline present"
-    PARAMS=(
-         -X GET
-         -H "Content-Type: application/json"
-        "http://pageserver:9898/v1/tenant/${tenant_id}/timeline"
-    )
-    timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id)
-  fi
-  if [[ -z "${timeline_id:-}" || "${timeline_id:-}" = null ]]; then
+  echo "Check if a timeline present"
+  PARAMS=(
+       -X GET
+       -H "Content-Type: application/json"
+       "http://pageserver:9898/v1/tenant/${tenant_id}/timeline"
+  )
+  timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id)
+  if [[ -z "${timeline_id}" || "${timeline_id}" = null ]]; then
    generate_id timeline_id
    PARAMS=(
        -sbf
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -142,7 +142,7 @@ services:
      - "storage_broker"
      - "--listen-addr=0.0.0.0:50051"

-  compute1:
+  compute:
    restart: always
    build:
      context: ./compute_wrapper/
@@ -152,7 +152,6 @@ services:
        - TAG=${COMPUTE_TAG:-${TAG:-latest}}
        - http_proxy=${http_proxy:-}
        - https_proxy=${https_proxy:-}
-    image: built-compute
    environment:
      - PG_VERSION=${PG_VERSION:-16}
      - TENANT_ID=${TENANT_ID:-}
@@ -167,11 +166,6 @@ services:
      - 3080:3080 # http endpoints
    entrypoint:
      - "/shell/compute.sh"
-    # Ad an alias for compute1 for compatibility
-    networks:
-      default:
-        aliases:
-            - compute
    depends_on:
      - safekeeper1
      - safekeeper2
@@ -180,20 +174,15 @@ services:

  compute_is_ready:
    image: postgres:latest
-    environment:
-      - PARALLEL_COMPUTES=1
    entrypoint:
-      - "/bin/sh"
+      - "/bin/bash"
      - "-c"
    command:
-      - "for i in $(seq 1 $${PARALLEL_COMPUTES}); do
-           until pg_isready -h compute$$i -p 55433 -U cloud_admin ; do
-             sleep 1;
-           done;
-         done;
-         echo All computes are started"
+      - "until pg_isready -h compute -p 55433 -U cloud_admin ; do
+            echo 'Waiting to start compute...' && sleep 1;
+         done"
    depends_on:
-      - compute1
+      - compute

  neon-test-extensions:
    profiles: ["test-extensions"]
@@ -207,4 +196,4 @@ services:
    command:
      - sleep 3600
    depends_on:
-      - compute1
+      - compute
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -1,4 +1,4 @@
-#!/usr/bin/env bash
+#!/bin/bash

 # A basic test to ensure Docker images are built correctly.
 # Build a wrapper around the compute, start all services and runs a simple SQL query.
@@ -13,36 +13,9 @@
 #
 set -eux -o pipefail

-cd "$(dirname "${0}")"
 export COMPOSE_FILE='docker-compose.yml'
 export COMPOSE_PROFILES=test-extensions
-export PARALLEL_COMPUTES=${PARALLEL_COMPUTES:-1}
-READY_MESSAGE="All computes are started"
-COMPUTES=()
-for i in $(seq 1 "${PARALLEL_COMPUTES}"); do
-  COMPUTES+=("compute${i}")
-done
-CURRENT_TMPDIR=$(mktemp -d)
-trap 'rm -rf ${CURRENT_TMPDIR} docker-compose-parallel.yml' EXIT
-if [[ ${PARALLEL_COMPUTES} -gt 1 ]]; then
-  export COMPOSE_FILE=docker-compose-parallel.yml
-  cp docker-compose.yml docker-compose-parallel.yml
-  # Replace the environment variable PARALLEL_COMPUTES with the actual value
-  yq eval -i ".services.compute_is_ready.environment |=  map(select(. | test(\"^PARALLEL_COMPUTES=\") | not)) + [\"PARALLEL_COMPUTES=${PARALLEL_COMPUTES}\"]" ${COMPOSE_FILE}
-  for i in $(seq 2 "${PARALLEL_COMPUTES}"); do
-    # Duplicate compute1 as compute${i} for parallel execution
-    yq eval -i ".services.compute${i} = .services.compute1" ${COMPOSE_FILE}
-    # We don't need these sections, so delete them
-    yq eval -i "(del .services.compute${i}.build) | (del .services.compute${i}.ports) | (del .services.compute${i}.networks)" ${COMPOSE_FILE}
-    # Let the compute 1 be the only dependence
-    yq eval -i ".services.compute${i}.depends_on = [\"compute1\"]" ${COMPOSE_FILE}
-    # Set RUN_PARALLEL=true for compute2. They will generate tenant_id and timeline_id to avoid using the same as other computes
-    yq eval -i ".services.compute${i}.environment += [\"RUN_PARALLEL=true\"]" ${COMPOSE_FILE}
-    # Remove TENANT_ID and TIMELINE_ID from the environment variables of the generated computes
-    # They will create new TENANT_ID and TIMELINE_ID anyway.
-    yq eval -i ".services.compute${i}.environment |= map(select(. | (test(\"^TENANT_ID=\") or test(\"^TIMELINE_ID=\")) | not))" ${COMPOSE_FILE}
-  done
-fi
+cd "$(dirname "${0}")"
 PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres"

 function cleanup() {
@@ -54,11 +27,11 @@ function cleanup() {

 for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
    pg_version=${pg_version/v/}
-    echo "clean up containers if exist"
+    echo "clean up containers if exists"
    cleanup
    PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version))
-    PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose build compute1
-    PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose up --quiet-pull -d
+    PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose up --quiet-pull --build -d
+
    echo "wait until the compute is ready. timeout after 60s. "
    cnt=0
    while sleep 3; do
@@ -68,50 +41,45 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
            echo "timeout before the compute is ready."
            exit 1
        fi
-        if docker compose logs compute_is_ready | grep -q "${READY_MESSAGE}"; then
+        if docker compose logs "compute_is_ready" | grep -q "accepting connections"; then
            echo "OK. The compute is ready to connect."
            echo "execute simple queries."
-            for compute in "${COMPUTES[@]}"; do
-              docker compose exec "${compute}" /bin/bash -c "psql ${PSQL_OPTION} -c 'SELECT 1'"
-            done
+            docker compose exec compute /bin/bash -c "psql ${PSQL_OPTION} -c 'SELECT 1'"
            break
        fi
    done

    if [[ ${pg_version} -ge 16 ]]; then
-        mkdir "${CURRENT_TMPDIR}"/{pg_hint_plan-src,file_fdw,postgis-src}
-        docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${CURRENT_TMPDIR}/postgis-src/test"
-        docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${CURRENT_TMPDIR}/postgis-src/00-regress-install"
-        docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${CURRENT_TMPDIR}/pg_hint_plan-src/data"
-        docker compose cp neon-test-extensions:/postgres/contrib/file_fdw/data "${CURRENT_TMPDIR}/file_fdw/data"
-
-        for compute in "${COMPUTES[@]}"; do
-          # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
-          # It cannot be moved to Dockerfile now because the database directory is created after the start of the container
-          echo Adding dummy config on "${compute}"
-          docker compose exec "${compute}" touch /var/db/postgres/compute/compute_ctl_temp_override.conf
-          # Prepare for the PostGIS test
-          docker compose exec "${compute}" mkdir -p /tmp/pgis_reg/pgis_reg_tmp /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install
-          docker compose cp "${CURRENT_TMPDIR}/postgis-src/test" "${compute}":/ext-src/postgis-src/raster/test
-          docker compose cp "${CURRENT_TMPDIR}/postgis-src/00-regress-install" "${compute}":/ext-src/postgis-src/regress
-          # The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment
-          docker compose cp "${CURRENT_TMPDIR}/pg_hint_plan-src/data" "${compute}":/ext-src/pg_hint_plan-src/
-          # The following block does the same for the contrib/file_fdw test
-          docker compose cp "${CURRENT_TMPDIR}/file_fdw/data" "${compute}":/postgres/contrib/file_fdw/data
-        done
+        # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
+        # It cannot be moved to Dockerfile now because the database directory is created after the start of the container
+        echo Adding dummy config
+        docker compose exec compute touch /var/db/postgres/compute/compute_ctl_temp_override.conf
+        # Prepare for the PostGIS test. The scratch dir is deliberately not called TMPDIR: mktemp reads an exported TMPDIR.
+        docker compose exec compute mkdir -p /tmp/pgis_reg/pgis_reg_tmp
+        SCRATCH_DIR=$(mktemp -d)
+        docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${SCRATCH_DIR}"
+        docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${SCRATCH_DIR}"
+        docker compose exec compute mkdir -p /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install
+        docker compose cp "${SCRATCH_DIR}/test" compute:/ext-src/postgis-src/raster/test
+        docker compose cp "${SCRATCH_DIR}/00-regress-install" compute:/ext-src/postgis-src/regress
+        rm -rf "${SCRATCH_DIR}"
+        # The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment
+        SCRATCH_DIR=$(mktemp -d)
+        docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${SCRATCH_DIR}/data"
+        docker compose cp "${SCRATCH_DIR}/data" compute:/ext-src/pg_hint_plan-src/
+        rm -rf "${SCRATCH_DIR}"
+        # The following block does the same for the contrib/file_fdw test
+        SCRATCH_DIR=$(mktemp -d)
+        docker compose cp neon-test-extensions:/postgres/contrib/file_fdw/data "${SCRATCH_DIR}/data"
+        docker compose cp "${SCRATCH_DIR}/data" compute:/postgres/contrib/file_fdw/data
+        rm -rf "${SCRATCH_DIR}"
        # Apply patches
        docker compose exec -T neon-test-extensions bash -c "(cd /postgres && patch -p1)" <"../compute/patches/contrib_pg${pg_version}.patch"
        # We are running tests now
        rm -f testout.txt testout_contrib.txt
-        # We want to run the longest tests first to better utilize parallelization and reduce overall test time.
-        # Tests listed in the RUN_FIRST variable will be run before others.
-        # If parallelization is not used, this environment variable will be ignored.
-
        docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \
-        -e RUN_FIRST=hll-src,postgis-src,pgtap-src -e PARALLEL_COMPUTES="${PARALLEL_COMPUTES}" \
        neon-test-extensions /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
        docker compose exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
-        -e PARALLEL_COMPUTES="${PARALLEL_COMPUTES}" \
        neon-test-extensions /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0
        if [[ ${EXT_SUCCESS} -eq 0 || ${CONTRIB_SUCCESS} -eq 0 ]]; then
            CONTRIB_FAILED=
--- a/docker-compose/run-tests.sh
+++ b/docker-compose/run-tests.sh
@@ -1,4 +1,4 @@
-#!/usr/bin/env bash
+#!/bin/bash
 set -x

 if [[ -v BENCHMARK_CONNSTR ]]; then
@@ -26,9 +26,8 @@ if [[ -v BENCHMARK_CONNSTR ]]; then
  fi
 fi
 REGULAR_USER=false
-PARALLEL_COMPUTES=${PARALLEL_COMPUTES:-1}
-while getopts pr arg; do
-  case ${arg} in
+while getopts r arg; do
+  case $arg in
  r)
    REGULAR_USER=true
    shift $((OPTIND-1))
@@ -42,49 +41,26 @@ extdir=${1}

 cd "${extdir}" || exit 2
 FAILED=
-export FAILED_FILE=/tmp/failed
-rm -f ${FAILED_FILE}
-mapfile -t LIST < <( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u)
-if [[ ${PARALLEL_COMPUTES} -gt 1 ]]; then
-  # Avoid errors if RUN_FIRST is not defined
-  RUN_FIRST=${RUN_FIRST:-}
-  # Move entries listed in the RUN_FIRST variable to the beginning
-  ORDERED_LIST=$(printf "%s\n" "${LIST[@]}" | grep -x -Ff <(echo -e "${RUN_FIRST//,/$'\n'}"); printf "%s\n" "${LIST[@]}" | grep -vx -Ff <(echo -e "${RUN_FIRST//,/$'\n'}"))
-  parallel -j"${PARALLEL_COMPUTES}" "[[ -d {} ]] || exit 0
-                export PGHOST=compute{%}
-                if ! psql -c 'select 1'>/dev/null; then
-                  exit 1
-                fi
-                echo Running on \${PGHOST}
-                if [[ -f ${extdir}/{}/neon-test.sh ]]; then
-                  echo Running from script
-                  ${extdir}/{}/neon-test.sh || echo {} >> ${FAILED_FILE};
-                else
-                  echo Running using make;
-                  USE_PGXS=1 make -C {} installcheck || echo {} >> ${FAILED_FILE};
-                fi" ::: ${ORDERED_LIST}
-  [[ ! -f ${FAILED_FILE} ]] && exit 0
-else
-  for d in "${LIST[@]}"; do
-      [ -d "${d}" ] || continue
-      if ! psql -w -c "select 1" >/dev/null; then
-        FAILED="${d} ${FAILED}"
-        break
-      fi
-      if [[ ${REGULAR_USER} = true ]] && [ -f "${d}"/regular-test.sh ]; then
-        "${d}/regular-test.sh" || FAILED="${d} ${FAILED}"
-        continue
-      fi
+LIST=$( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u)
+for d in ${LIST}; do
+    [ -d "${d}" ] || continue
+    if ! psql -w -c "select 1" >/dev/null; then
+      FAILED="${d} ${FAILED}"
+      break
+    fi
+    if [[ ${REGULAR_USER} = true ]] && [ -f "${d}"/regular-test.sh ]; then
+       "${d}/regular-test.sh" || FAILED="${d} ${FAILED}"
+       continue
+    fi

-      if [ -f "${d}/neon-test.sh" ]; then
-        "${d}/neon-test.sh" || FAILED="${d} ${FAILED}"
-      else
-        USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
-      fi
-  done
-  [[ -z ${FAILED} ]]  && exit 0
-fi
-for d in ${FAILED} $([[ ! -f ${FAILED_FILE} ]] || cat ${FAILED_FILE}); do
+    if [ -f "${d}/neon-test.sh" ]; then
+       "${d}/neon-test.sh" || FAILED="${d} ${FAILED}"
+    else
+       USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
+    fi
+done
+[ -z "${FAILED}" ] && exit 0
+for d in ${FAILED}; do
  cat "$(find $d -name regression.diffs)"
 done
 for postgis_diff in /tmp/pgis_reg/*_diff; do
@@ -92,5 +68,4 @@ for postgis_diff in /tmp/pgis_reg/*_diff; do
  cat "${postgis_diff}"
 done
 echo "${FAILED}"
-cat ${FAILED_FILE}
 exit 1
--- a/docker-compose/test_extensions_upgrade.sh
+++ b/docker-compose/test_extensions_upgrade.sh
@@ -1,4 +1,4 @@
-#!/usr/bin/env bash
+#!/bin/bash
 set -eux -o pipefail
 cd "$(dirname "${0}")"
 # Takes a variable name as argument. The result is stored in that variable.
@@ -60,8 +60,8 @@ function check_timeline() {
 # Restarts the compute node with the required compute tag and timeline.
 # Accepts the tag for the compute node and the timeline as parameters.
 function restart_compute() {
-  docker compose down compute1 compute_is_ready
-  COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute1 compute_is_ready
+  docker compose down compute compute_is_ready
+  COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute compute_is_ready
  wait_for_ready
  check_timeline ${2}
 }
--- a/docs/continuous-profiling.md
+++ b/docs/continuous-profiling.md
@@ -1,58 +0,0 @@
-# Continuous Crofiling (Compute)
-
-The continuous profiling of the compute node is performed by `perf` or `bcc-tools`, the latter is preferred.
-
-The executables profiled are all the postgres-related ones only, excluding the actual compute code (Rust). This can be done as well but
-was not the main goal.
-
-## Tools
-
-The aforementioned tools are available within the same Docker image as
-the compute node itself, but the corresponding dependencies linux the
-linux kernel headers and the linux kernel itself are not and can't be
-for obvious reasons. To solve that, as we run the compute nodes as a
-virtual machine (qemu), we need to deliver these dependencies to it.
-This is done by the `autoscaling` part, which builds and deploys the
-kernel headers, needed modules, and the `perf` binary into an ext4-fs
-disk image, which is later attached to the VM and is symlinked to be
-made available for the compute node.
-
-## Output
-
-The output of the profiling is always a binary file in the same format
-of `pprof`. It can, however, be archived by `gzip` additionally, if the
-corresponding argument is provided in the JSON request.
-
-## REST API
-
-### Test profiling
-
-One can test the profiling after connecting to the VM and running:
-
-```sh
-curl -X POST -H "Content-Type: application/json" http://localhost:3080/profile/cpu -d '{"profiler": {"BccProfile": null}, "sampling_frequency": 99, "timeout_seconds": 5, "archive": false}' -v --output profile.pb
-```
-
-This uses the `Bcc` profiler and does not archive the output. The
-profiling data will be saved into the `profile.pb` file locally.
-
-**Only one profiling session can be run at a time.**
-
-To check the profiling status (to see whether it is already running or
-not), one can perform the `GET` request:
-
-```sh
-curl http://localhost:3080/profile/cpu -v
-```
-
-The profiling can be stopped by performing the `DELETE` request:
-
-```sh
-curl -X DELETE http://localhost:3080/profile/cpu -v
-```
-
-## Supported profiling data
-
-For now, only the CPU profiling is done and ther is no heap profiling.
-Also, only the postgres-related executables are tracked, the compute
-(Rust) part itself **is not tracked**.
--- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md
+++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
@@ -20,7 +20,7 @@ In our case consensus leader is compute (walproposer), and we don't want to wake
 up all computes for the change. Neither we want to fully reimplement the leader
 logic second time outside compute. Because of that the proposed algorithm relies
 for issuing configurations on the external fault tolerant (distributed) strongly
-consistent storage with simple API: CAS (compare-and-swap) on the single key.
+consistent storage with simple API: CAS (compare-and-swap) on the single key.
 Properly configured postgres suits this.

 In the system consensus is implemented at the timeline level, so algorithm below
@@ -34,7 +34,7 @@ A configuration is

 ```
 struct Configuration {
-    generation: SafekeeperGeneration, // a number uniquely identifying configuration
+    generation: Generation, // a number uniquely identifying configuration
    sk_set: Vec<NodeId>, // current safekeeper set
    new_sk_set: Optional<Vec<NodeId>>,
 }
@@ -81,11 +81,11 @@ configuration generation in them is less than its current one. Namely, it
 refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In
 response it sends its current configuration generation to let walproposer know.

-Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/membership`
-accepting `Configuration`. Safekeeper switches to the given conf if it is higher than its
+Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration`
+accepting `Configuration`. Safekeeper switches to the given conf if it is higher than its
 current one and ignores it otherwise. In any case it replies with
 ```
-struct TimelineMembershipSwitchResponse {
+struct ConfigurationSwitchResponse {
    conf: Configuration,
    term: Term,
    last_log_term: Term,
@@ -108,7 +108,7 @@ establishes this configuration as its own and moves to voting.
 It should stop talking to safekeepers not listed in the configuration at this
 point, though it is not unsafe to continue doing so.

-To be elected it must receive votes from both majorities if `new_sk_set` is present.
+To be elected it must receive votes from both majorities if `new_sk_set` is present.
 Similarly, to commit WAL it must receive flush acknowledge from both majorities.

 If walproposer hears from safekeeper configuration higher than his own (i.e.
@@ -130,7 +130,7 @@ storage are reachable.
 1) Fetch current timeline configuration from the configuration storage.
 2) If it is already joint one and `new_set` is different from `desired_set`
   refuse to change. However, assign join conf to (in memory) var
-   `joint_conf` and proceed to step 4 to finish the ongoing change.
+   `joint_conf` and proceed to step 4 to finish the ongoing change.
 3) Else, create joint `joint_conf: Configuration`: increment current conf number
   `n` and put `desired_set` to `new_sk_set`. Persist it in the configuration
   storage by doing CAS on the current generation: change happens only if
@@ -161,11 +161,11 @@ storage are reachable.
   because `pull_timeline` already includes it and plus additionally would be
   broadcast by compute. More importantly, we may proceed to the next step
   only when `<last_log_term, flush_lsn>` on the majority of the new set reached
-   `sync_position`. Similarly, on the happy path no waiting is needed because
+   `sync_position`. Similarly, on the happy path no waiting is needed because
   `pull_timeline` already includes it. However, we should double
    check to be safe. For example, timeline could have been created earlier e.g.
    manually or after try-to-migrate, abort, try-to-migrate-again sequence.
-7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having new
+7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having new
   safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration
   storage under one more CAS.
 8) Call `PUT` `configuration` on safekeepers from the new set,
@@ -178,12 +178,12 @@ spec of it.

 Description above focuses on safety. To make the flow practical and live, here a few more
 considerations.
-1) It makes sense to ping new set to ensure we are migrating to live node(s) before
+1) It makes sense to ping new set to ensure we are migrating to live node(s) before
  step 3.
 2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed
   it is safe to rollback to the old conf with one more CAS.
 3) On step 4 timeline might be already created on members of the new set for various reasons;
-   the simplest is the procedure restart. There are more complicated scenarios like mentioned
+   the simplest is the procedure restart. There are more complicated scenarios like mentioned
   in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving
   generations, so seems simpler to treat existing timeline as success. However, this also
   has a disadvantage: you might imagine an surpassingly unlikely schedule where condition in
@@ -192,7 +192,7 @@ considerations.
 4) In the end timeline should be locally deleted on the safekeeper(s) which are
   in the old set but not in the new one, unless they are unreachable. To be
   safe this also should be done under generation number (deletion proceeds only if
-   current configuration is <= than one in request and safekeeper is not member of it).
+   current configuration is <= than one in request and safekeeper is not member of it).
 5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`,
   jump to step 7, using it as `new_conf`.

@@ -261,14 +261,14 @@ Timeline (branch) creation in cplane should call storage_controller POST
 Response should be augmented with `safekeepers_generation` and `safekeepers`
 fields like described in `/notify-safekeepers` above. Initially (currently)
 these fields may be absent; in this case cplane chooses safekeepers on its own
-like it currently does. The call should be retried until it succeeds.
+like it currently does. The call should be retried until it succeeds.

 Timeline deletion and tenant deletion in cplane should call appropriate
 storage_controller endpoints like it currently does for sharded tenants. The
 calls should be retried until they succeed.

-When compute receives safekeeper list from control plane it needs to know the
-generation to check whether it should be updated (note that compute may get
+When compute receives the safekeeper list from control plane it needs to know the
+generation to check whether it should be updated (note that compute may get
 safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers`
 GUC is just a comma separates list of `host:port`. Let's prefix it with
 `g#<generation>:` to this end, so it will look like
@@ -305,8 +305,8 @@ enum MigrationRequest {
 ```

 `FinishPending` requests to run the procedure to ensure state is clean: current
-configuration is not joint and the majority of safekeepers are aware of it, but do
-not attempt to migrate anywhere. If the current configuration fetched on step 1 is
+configuration is not joint and the majority of safekeepers are aware of it, but do
+not attempt to migrate anywhere. If the current configuration fetched on step 1 is
 not joint it jumps to step 7. It should be run at startup for all timelines (but
 similarly, in the first version it is ok to trigger it manually).

@@ -315,7 +315,7 @@ similarly, in the first version it is ok to trigger it manually).
 `safekeepers` table mirroring current `nodes` should be added, except that for
 `scheduling_policy`: it is enough to have at least in the beginning only 3
 fields: 1) `active` 2) `paused` (initially means only not assign new tlis there
-3) `decommissioned` (node is removed).
+3) `decomissioned` (node is removed).

 `timelines` table:
 ```
@@ -326,10 +326,9 @@ table! {
        tenant_id -> Varchar,
        start_lsn -> pg_lsn,
        generation -> Int4,
-        sk_set -> Array<Int8>, // list of safekeeper ids
+        sk_set -> Array<Int8>, // list of safekeeper ids
        new_sk_set -> Nullable<Array<Int8>>, // list of safekeeper ids, null if not joint conf
        cplane_notified_generation -> Int4,
-        sk_set_notified_generation -> Int4, // the generation a quorum of sk_set knows about
        deleted_at -> Nullable<Timestamptz>,
    }
 }
@@ -339,23 +338,13 @@ table! {
 might also want to add ancestor_timeline_id to preserve the hierarchy, but for
 this RFC it is not needed.

-`cplane_notified_generation` and `sk_set_notified_generation` fields are used to
-track the last stage of the algorithm, when we need to notify safekeeper set and cplane
-with the final configuration after it's already committed to DB.
-
-The timeline is up-to-date (no migration in progress) if `new_sk_set` is null and
-`*_notified_generation` fields are up to date with `generation`. 
-
-It's possible to replace `*_notified_generation` with one boolean field `migration_completed`,
-but for better observability it's nice to have them separately.
-
 #### API

 Node management is similar to pageserver:
-1) POST `/control/v1/safekeeper` inserts safekeeper.
-2) GET `/control/v1/safekeeper` lists safekeepers.
-3) GET `/control/v1/safekeeper/:node_id` gets safekeeper.
-4) PUT `/control/v1/safekeper/:node_id/scheduling_policy` changes status to e.g.
+1) POST `/control/v1/safekeepers` inserts safekeeper.
+2) GET `/control/v1/safekeepers` lists safekeepers.
+3) GET `/control/v1/safekeepers/:node_id` gets safekeeper.
+4) PUT `/control/v1/safekeepers/:node_id/status` changes status to e.g.
   `offline` or `decomissioned`. Initially it is simpler not to schedule any
    migrations here.

@@ -379,8 +368,8 @@ Migration API: the first version is the simplest and the most imperative:
 all timelines from one safekeeper to another. It accepts json
 ```
 {
-    "src_sk": NodeId,
-    "dst_sk": NodeId,
+    "src_sk": u32,
+    "dst_sk": u32,
    "limit": Optional<u32>,
 }
 ```
@@ -390,15 +379,12 @@ Returns list of scheduled requests.
 2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest`
   to move single timeline to given set of safekeepers:
 ```
-struct TimelineSafekeeperMigrateRequest {
-    "new_sk_set": Vec<NodeId>,
+{
+    "desired_set": Vec<u32>,
 }
 ```

-In the first version the handler migrates the timeline to `new_sk_set` synchronously.
-Should be retried until success.
-
-In the future we might change it to asynchronous API and return scheduled request.
+Returns scheduled request.

 Similar call should be added for the tenant.

@@ -448,9 +434,6 @@ table! {
 }
 ```

-We load all pending ops from the table on startup into the memory.
-The table is needed only to preserve the state between restarts.
-
 `op_type` can be `include` (seed from peers and ensure generation is up to
 date), `exclude` (remove locally) and `delete`. Field is actually not strictly
 needed as it can be computed from current configuration, but gives more explicit
@@ -491,7 +474,7 @@ actions must be idempotent. Now, a tricky point here is timeline start LSN. For
 the initial (tenant creation) call cplane doesn't know it. However, setting
 start_lsn on safekeepers during creation is a good thing -- it provides a
 guarantee that walproposer can always find a common point in WAL histories of
-safekeeper and its own, and so absence of it would be a clear sign of
+safekeeper and its own, and so absence of it would be a clear sign of
 corruption. The following sequence works:
 1) Create timeline (or observe that it exists) on pageserver,
   figuring out last_record_lsn in response.
@@ -514,9 +497,11 @@ corruption. The following sequence works:
   retries the call until 200 response.

   There is a small question how request handler (timeline creation in this
-   case) would interact with per sk reconciler. In the current implementation
-   we first persist the request in the DB, and then send an in-memory request
-   to each safekeeper reconciler to process it.
+   case) would interact with per sk reconciler. As always I prefer to do the
+   simplest possible thing and here it seems to be just waking it up so it
+   re-reads the db for work to do. Passing work in memory is faster, but
+   that shouldn't matter, and path to scan db for work will exist anyway, 
+   simpler to reuse it.

 For pg version / wal segment size: while we may persist them in `timelines`
 table, it is not necessary as initial creation at step 3 can take them from
@@ -524,40 +509,30 @@ pageserver or cplane creation call and later pull_timeline will carry them
 around.

 Timeline migration.
-1) CAS to the db to create joint conf. Since this moment the migration is considered to be 
-   "in progress". We can detect all "in-progress" migrations looking into the database.
-2) Do steps 4-6 from the algorithm, including `pull_timeline` onto `new_sk_set`, update membership
-   configuration on all safekeepers, notify cplane, etc. All operations are idempotent,
-   so we don't need to persist anything in the database at this stage. If any errors occur,
-   it's safe to retry or abort the migration.
-3) Once it becomes possible per alg description above, get out of joint conf
-   with another CAS. Also should insert `exclude` entries into `safekeeper_timeline_pending_ops`
-   in the same DB transaction. Adding `exclude` entries atomically is nesessary because after
-   CAS we don't have the list of excluded safekeepers in the `timelines` table anymore, but we
-   need to have them persisted somewhere in case the migration is interrupted right after the CAS.
-4) Finish the migration. The final membership configuration is committed to the DB at this stage.
-   So, the migration can not be aborted anymore. But it can still be retried if the migration fails
-   past stage 3. To finish the migration we need to send the new membership configuration to
-   a new quorum of safekeepers, notify cplane with the new safekeeper list and schedule the `exclude`
-   requests to in-memory queue for safekeeper reconciler. If the algrorithm is retried, it's
-   possible that we have already committed `exclude` requests to DB, but didn't send them to
-   the in-memory queue. In this case we need to read them from `safekeeper_timeline_pending_ops`
-   because it's the only place where they are persistent. The fields `sk_set_notified_generation`
-   and `cplane_notified_generation` are updated after each step. The migration is considered
-   fully completed when they match the `generation` field.
-
-In practice, we can report "success" after stage 3 and do the "finish" step in per-timeline
-reconciler (if we implement it). But it's wise to at least try to finish them synchronously,
-so the timeline is always in a "good state" and doesn't require an old quorum to commit
-WAL after the migration reported "success".
+1) CAS to the db to create joint conf, and in the same transaction create
+   `safekeeper_timeline_pending_ops` `include` entries to initialize new members
+   as well as deliver this conf to current ones; poke per sk reconcilers to work
+   on it. Any conf change should also poke cplane notifier task(s).
+2) Once it becomes possible per alg description above, get out of joint conf
+   with another CAS. Task should get wakeups from per sk reconcilers because 
+   conf switch is required for advancement; however retries should be sleep
+   based as well as LSN advancement might be needed, though in happy path 
+   it isn't. To see whether further transition is possible, on wakeup the migration
+   executor polls safekeepers per the algorithm. CAS creating new conf with only
+   new members should again insert entries to `safekeeper_timeline_pending_ops`
+   to switch them there, as well as `exclude` rows to remove timeline from 
+   old members.

 Timeline deletion: just set `deleted_at` on the timeline row and insert
 `safekeeper_timeline_pending_ops` entries in the same xact, the rest is done by
 per sk reconcilers.

-When node is removed (set to `decommissioned`), `safekeeper_timeline_pending_ops`
+When node is removed (set to `decomissioned`), `safekeeper_timeline_pending_ops`
 for it must be cleared in the same transaction.

+One more task pool should infinitely retry notifying control plane about changed
+safekeeper sets (trying to make `cplane_notified_generation` equal to `generation`).
+
 #### Dealing with multiple instances of storage_controller

 Operations described above executed concurrently might create some errors but do
@@ -566,7 +541,7 @@ of storage_controller it is fine to have it temporarily, e.g. during redeploy.

 To harden against some controller instance creating some work in
 `safekeeper_timeline_pending_ops` and then disappearing without anyone pickup up
-the job per sk reconcilers apart from explicit wakeups should scan for work
+the job per sk reconcilers apart from explicit wakeups should scan for work
 periodically. It is possible to remove that though if all db updates are
 protected with leadership token/term -- then such scans are needed only after
 leadership is acquired.
@@ -588,7 +563,7 @@ There should be following layers of tests:
   safekeeper communication and pull_timeline need to be mocked and main switch
   procedure wrapped to as a node (thread) in simulation tests, using these
   mocks. Test would inject migrations like it currently injects
-   safekeeper/walproposer restarts. Main assert is the same -- committed WAL must
+   safekeeper/walproposer restarts. Main assert is the same -- committed WAL must
   not be lost.

 3) Since simulation testing injects at relatively high level points (not
@@ -638,7 +613,7 @@ Let's have the following implementation bits for gradual rollout:
  `notify-safekeepers`.

 Then the rollout for a region would be:
- Current situation: safekeepers are chosen by control_plane.
+- Current situation: safekeepers are chosen by control_plane.
 - We manually migrate some timelines, test moving them around.
 - Then we enable `--set-safekeepers` so that all new timelines
  are on storage controller.
--- a/endpoint_storage/src/app.rs
+++ b/endpoint_storage/src/app.rs
@@ -13,8 +13,6 @@ use utils::backoff::retry;
 pub fn app(state: Arc<Storage>) -> Router<()> {
    use axum::routing::{delete as _delete, get as _get};
    let delete_prefix = _delete(delete_prefix);
-    // NB: On any changes do not forget to update the OpenAPI spec
-    // in /endpoint_storage/src/openapi_spec.yml.
    Router::new()
        .route(
            "/{tenant_id}/{timeline_id}/{endpoint_id}/{*path}",
--- a/endpoint_storage/src/openapi_spec.yml
+++ b/endpoint_storage/src/openapi_spec.yml
@@ -1,146 +0,0 @@
-openapi: "3.0.2"
-info:
-  title: Endpoint Storage API
-  description: Endpoint Storage API
-  version: "1.0"
-  license:
-    name: "Apache"
-    url: https://github.com/neondatabase/neon/blob/main/LICENSE
-servers:
-  - url: ""
-paths:
-  /status:
-    description: Healthcheck endpoint
-    get:
-      description: Healthcheck
-      security: []
-      responses:
-        "200":
-          description: OK
-
-  /{tenant_id}/{timeline_id}/{endpoint_id}/{key}:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: endpoint_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: key
-        in: path
-        required: true
-        schema:
-          type: string
-    get:
-      description: Get file from blob storage
-      responses:
-        "200":
-          description: "File stream from blob storage"
-          content:
-            application/octet-stream:
-              schema:
-                type: string
-                format: binary
-        "400":
-          description: File was not found
-        "403":
-          description: JWT does not authorize request to this route
-    put:
-      description: Insert file into blob storage. If file exists, override it
-      requestBody:
-        content:
-          application/octet-stream:
-            schema:
-              type: string
-              format: binary
-      responses:
-        "200":
-          description: File was inserted successfully
-        "403":
-          description: JWT does not authorize request to this route
-    delete:
-      description: Delete file from blob storage
-      responses:
-        "200":
-          description: File was successfully deleted or not found
-        "403":
-          description: JWT does not authorize request to this route
-
-  /{tenant_id}/{timeline_id}/{endpoint_id}:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: endpoint_id
-        in: path
-        required: true
-        schema:
-          type: string
-    delete:
-      description: Delete endpoint data from blob storage
-      responses:
-        "200":
-          description: Endpoint data was deleted
-        "403":
-          description: JWT does not authorize request to this route
-
-  /{tenant_id}/{timeline_id}:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-      - name: timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-    delete:
-      description: Delete timeline data from blob storage
-      responses:
-        "200":
-          description: Timeline data was deleted
-        "403":
-          description: JWT does not authorize request to this route
-
-  /{tenant_id}:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-    delete:
-      description: Delete tenant data from blob storage
-      responses:
-        "200":
-          description: Tenant data was deleted
-        "403":
-          description: JWT does not authorize request to this route
-
-components:
-  securitySchemes:
-    JWT:
-      type: http
-      scheme: bearer
-      bearerFormat: JWT
-
-security:
-  - JWT: []
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -46,7 +46,7 @@ pub struct ExtensionInstallResponse {
    pub version: ExtVersion,
 }

-#[derive(Serialize, Default, Debug, Clone, PartialEq)]
+#[derive(Serialize, Default, Debug, Clone)]
 #[serde(tag = "status", rename_all = "snake_case")]
 pub enum LfcPrewarmState {
    #[default]
@@ -58,17 +58,6 @@ pub enum LfcPrewarmState {
    },
 }

-impl Display for LfcPrewarmState {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"),
-            LfcPrewarmState::Prewarming => f.write_str("Prewarming"),
-            LfcPrewarmState::Completed => f.write_str("Completed"),
-            LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
-        }
-    }
-}
-
 #[derive(Serialize, Default, Debug, Clone, PartialEq)]
 #[serde(tag = "status", rename_all = "snake_case")]
 pub enum LfcOffloadState {
@@ -81,23 +70,6 @@ pub enum LfcOffloadState {
    },
 }

-#[derive(Serialize, Debug, Clone, PartialEq)]
-#[serde(tag = "status", rename_all = "snake_case")]
-/// Response of /promote
-pub enum PromoteState {
-    NotPromoted,
-    Completed,
-    Failed { error: String },
-}
-
-#[derive(Deserialize, Serialize, Default, Debug, Clone)]
-#[serde(rename_all = "snake_case")]
-/// Result of /safekeepers_lsn
-pub struct SafekeepersLsn {
-    pub safekeepers: String,
-    pub wal_flush_lsn: utils::lsn::Lsn,
-}
-
 /// Response of the /status API
 #[derive(Serialize, Debug, Deserialize)]
 #[serde(rename_all = "snake_case")]
@@ -121,15 +93,6 @@ pub enum TerminateMode {
    Immediate,
 }

-impl From<TerminateMode> for ComputeStatus {
-    fn from(mode: TerminateMode) -> Self {
-        match mode {
-            TerminateMode::Fast => ComputeStatus::TerminationPendingFast,
-            TerminateMode::Immediate => ComputeStatus::TerminationPendingImmediate,
-        }
-    }
-}
-
 #[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
 #[serde(rename_all = "snake_case")]
 pub enum ComputeStatus {
@@ -150,9 +113,7 @@ pub enum ComputeStatus {
    // control-plane to terminate it.
    Failed,
    // Termination requested
-    TerminationPendingFast,
-    // Termination requested, without waiting 30s before returning from /terminate
-    TerminationPendingImmediate,
+    TerminationPending { mode: TerminateMode },
    // Terminated Postgres
    Terminated,
 }
@@ -171,10 +132,7 @@ impl Display for ComputeStatus {
            ComputeStatus::Running => f.write_str("running"),
            ComputeStatus::Configuration => f.write_str("configuration"),
            ComputeStatus::Failed => f.write_str("failed"),
-            ComputeStatus::TerminationPendingFast => f.write_str("termination-pending-fast"),
-            ComputeStatus::TerminationPendingImmediate => {
-                f.write_str("termination-pending-immediate")
-            }
+            ComputeStatus::TerminationPending { .. } => f.write_str("termination-pending"),
            ComputeStatus::Terminated => f.write_str("terminated"),
        }
    }
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -105,7 +105,11 @@ pub struct ComputeSpec {
    // updated to fill these fields, we can make these non optional.
    pub tenant_id: Option<TenantId>,
    pub timeline_id: Option<TimelineId>,
-    pub pageserver_connstring: Option<String>,
+
+    // Pageserver information can be passed in two different ways:
+    // 1. Here
+    // 2. in cluster.settings. This is legacy, we are switching to method 1.
+    pub pageserver_connection_info: Option<PageserverConnectionInfo>,

    // More neon ids that we expose to the compute_ctl
    // and to postgres as neon extension GUCs.
@@ -214,6 +218,20 @@ pub enum ComputeFeature {
    UnknownFeature,
 }

+/// Pageserver connection details: per-shard connection URLs plus a flag for preferring gRPC.
+#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
+pub struct PageserverConnectionInfo {
+    pub shards: HashMap<u32, PageserverShardConnectionInfo>,
+
+    pub prefer_grpc: bool,
+}
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
+pub struct PageserverShardConnectionInfo {
+    pub libpq_url: Option<String>,
+    pub grpc_url: Option<String>,
+}
+
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct RemoteExtSpec {
    pub public_extensions: Option<Vec<String>>,
@@ -331,6 +349,12 @@ impl ComputeMode {
    }
 }

+impl Display for ComputeMode {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.to_type_str())
+    }
+}
+
 /// Log level for audit logging
 #[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeAudit {
--- a/libs/http-utils/src/endpoint.rs
+++ b/libs/http-utils/src/endpoint.rs
@@ -20,7 +20,6 @@ use tokio_stream::wrappers::ReceiverStream;
 use tokio_util::io::ReaderStream;
 use tracing::{Instrument, debug, info, info_span, warn};
 use utils::auth::{AuthError, Claims, SwappableJwtAuth};
-use utils::metrics_collector::{METRICS_COLLECTOR, METRICS_STALE_MILLIS};

 use crate::error::{ApiError, api_error_handler, route_error_handler};
 use crate::request::{get_query_param, parse_query_param};
@@ -251,28 +250,9 @@ impl std::io::Write for ChannelWriter {
    }
 }

-pub async fn prometheus_metrics_handler(
-    req: Request<Body>,
-    force_metric_collection_on_scrape: bool,
-) -> Result<Response<Body>, ApiError> {
+pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    SERVE_METRICS_COUNT.inc();

-    // HADRON
-    let requested_use_latest = parse_query_param(&req, "use_latest")?;
-
-    let use_latest = match requested_use_latest {
-        None => force_metric_collection_on_scrape,
-        Some(true) => true,
-        Some(false) => {
-            if force_metric_collection_on_scrape {
-                // We don't cache in this case
-                true
-            } else {
-                false
-            }
-        }
-    };
-
    let started_at = std::time::Instant::now();

    let (tx, rx) = mpsc::channel(1);
@@ -297,18 +277,12 @@ pub async fn prometheus_metrics_handler(

        let _span = span.entered();

-        // HADRON
-        let collected = if use_latest {
-            // Skip caching the results if we always force metric collection on scrape.
-            METRICS_COLLECTOR.run_once(!force_metric_collection_on_scrape)
-        } else {
-            METRICS_COLLECTOR.last_collected()
-        };
+        let metrics = metrics::gather();

        let gathered_at = std::time::Instant::now();

        let res = encoder
-            .encode(&collected.metrics, &mut writer)
+            .encode(&metrics, &mut writer)
            .and_then(|_| writer.flush().map_err(|e| e.into()));

        // this instant is not when we finally got the full response sent, sending is done by hyper
@@ -321,10 +295,6 @@ pub async fn prometheus_metrics_handler(
        let encoded_in = encoded_at - gathered_at - writer.wait_time();
        let total = encoded_at - started_at;

-        // HADRON
-        let staleness_ms = (encoded_at - collected.collected_at).as_millis();
-        METRICS_STALE_MILLIS.set(staleness_ms as i64);
-
        match res {
            Ok(()) => {
                tracing::info!(
@@ -333,7 +303,6 @@ pub async fn prometheus_metrics_handler(
                    spawning_ms = spawned_in.as_millis(),
                    collection_ms = collected_in.as_millis(),
                    encoding_ms = encoded_in.as_millis(),
-                    stalenss_ms = staleness_ms,
                    "responded /metrics"
                );
            }
--- a/libs/http-utils/src/request.rs
+++ b/libs/http-utils/src/request.rs
@@ -41,35 +41,17 @@ pub fn get_query_param<'a>(
        Some(q) => q,
        None => return Ok(None),
    };
-    let values = url::form_urlencoded::parse(query.as_bytes())
+    let mut values = url::form_urlencoded::parse(query.as_bytes())
        .filter_map(|(k, v)| if k == param_name { Some(v) } else { None })
        // we call .next() twice below. If it's None the first time, .fuse() ensures it's None afterwards
        .fuse();

-    // Work around an issue with Alloy's pyroscope scrape where the "seconds"
-    // parameter is added several times. https://github.com/grafana/alloy/issues/3026
-    // TODO: revert after Alloy is fixed.
-    let value1 = values
-        .map(Ok)
-        .reduce(|acc, i| {
-            match acc {
-                Err(_) => acc,
-
-                // It's okay to have duplicates as along as they have the same value.
-                Ok(ref a) if a == &i.unwrap() => acc,
-
-                _ => Err(ApiError::BadRequest(anyhow!(
-                    "param {param_name} specified more than once"
-                ))),
-            }
-        })
-        .transpose()?;
-    // if values.next().is_some() {
-    //     return Err(ApiError::BadRequest(anyhow!(
-    //         "param {param_name} specified more than once"
-    //     )));
-    // }
-
+    let value1 = values.next();
+    if values.next().is_some() {
+        return Err(ApiError::BadRequest(anyhow!(
+            "param {param_name} specified more than once"
+        )));
+    }
    Ok(value1)
 }

@@ -110,39 +92,3 @@ pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError>
        None => Ok(()),
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_get_query_param_duplicate() {
-        let req = Request::builder()
-            .uri("http://localhost:12345/testuri?testparam=1")
-            .body(hyper::Body::empty())
-            .unwrap();
-        let value = get_query_param(&req, "testparam").unwrap();
-        assert_eq!(value.unwrap(), "1");
-
-        let req = Request::builder()
-            .uri("http://localhost:12345/testuri?testparam=1&testparam=1")
-            .body(hyper::Body::empty())
-            .unwrap();
-        let value = get_query_param(&req, "testparam").unwrap();
-        assert_eq!(value.unwrap(), "1");
-
-        let req = Request::builder()
-            .uri("http://localhost:12345/testuri")
-            .body(hyper::Body::empty())
-            .unwrap();
-        let value = get_query_param(&req, "testparam").unwrap();
-        assert!(value.is_none());
-
-        let req = Request::builder()
-            .uri("http://localhost:12345/testuri?testparam=1&testparam=2&testparam=3")
-            .body(hyper::Body::empty())
-            .unwrap();
-        let value = get_query_param(&req, "testparam");
-        assert!(value.is_err());
-    }
-}
--- a/libs/neon-shmem/Cargo.toml
+++ b/libs/neon-shmem/Cargo.toml
@@ -8,6 +8,27 @@ license.workspace = true
 thiserror.workspace = true
 nix.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+rustc-hash = { version = "2.1.1" }
+rand = "0.9.1"
+libc.workspace = true
+lock_api = "0.4.13"
+atomic = "0.6.1"
+bytemuck = { version = "1.23.1", features = ["derive"] }
+
+[dev-dependencies]
+criterion = { workspace = true, features = ["html_reports"] }
+rand_distr = "0.5.1"
+xxhash-rust = { version = "0.8.15", features = ["xxh3"] }
+ahash.workspace = true
+twox-hash = { version = "2.1.1" }
+seahash = "4.1.0"
+hashbrown = { git = "https://github.com/quantumish/hashbrown.git", rev = "6610e6d" }
+foldhash = "0.1.5"
+

 [target.'cfg(target_os = "macos")'.dependencies]
-tempfile = "3.20.0"
+tempfile = "3.14.0"
+
+[[bench]]
+name = "hmap_resize"
+harness = false
--- a/libs/neon-shmem/benches/hmap_resize.rs
+++ b/libs/neon-shmem/benches/hmap_resize.rs
@@ -0,0 +1,330 @@
+use criterion::{BatchSize, BenchmarkId, Criterion, criterion_group, criterion_main};
+use neon_shmem::hash::HashMapAccess;
+use neon_shmem::hash::HashMapInit;
+use neon_shmem::hash::entry::Entry;
+use rand::distr::{Distribution, StandardUniform};
+use rand::prelude::*;
+use std::default::Default;
+use std::hash::BuildHasher;
+
+// Taken from bindings to C code
+
+#[derive(Clone, Debug, Hash, Eq, PartialEq)]
+#[repr(C)]
+pub struct FileCacheKey {
+    pub _spc_id: u32,
+    pub _db_id: u32,
+    pub _rel_number: u32,
+    pub _fork_num: u32,
+    pub _block_num: u32,
+}
+
+impl Distribution<FileCacheKey> for StandardUniform {
+    // questionable, but doesn't need to be good randomness
+    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> FileCacheKey {
+        FileCacheKey {
+            _spc_id: rng.random(),
+            _db_id: rng.random(),
+            _rel_number: rng.random(),
+            _fork_num: rng.random(),
+            _block_num: rng.random(),
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+#[repr(C)]
+pub struct FileCacheEntry {
+    pub _offset: u32,
+    pub _access_count: u32,
+    pub _prev: *mut FileCacheEntry,
+    pub _next: *mut FileCacheEntry,
+    pub _state: [u32; 8],
+}
+
+impl FileCacheEntry {
+    fn dummy() -> Self {
+        Self {
+            _offset: 0,
+            _access_count: 0,
+            _prev: std::ptr::null_mut(),
+            _next: std::ptr::null_mut(),
+            _state: [0; 8],
+        }
+    }
+}
+
+// Utilities for applying operations.
+
+#[derive(Clone, Debug)]
+struct TestOp<K, V>(K, Option<V>);
+
+fn apply_op<K: Clone + std::hash::Hash + Eq, V, S: std::hash::BuildHasher>(
+    op: TestOp<K, V>,
+    map: &mut HashMapAccess<K, V, S>,
+) {
+    let entry = map.entry(op.0);
+
+    match op.1 {
+        Some(new) => match entry {
+            Entry::Occupied(mut e) => Some(e.insert(new)),
+            Entry::Vacant(e) => {
+                _ = e.insert(new).unwrap();
+                None
+            }
+        },
+        None => match entry {
+            Entry::Occupied(e) => Some(e.remove()),
+            Entry::Vacant(_) => None,
+        },
+    };
+}
+
+// Hash utilities
+
+struct SeaRandomState {
+    k1: u64,
+    k2: u64,
+    k3: u64,
+    k4: u64,
+}
+
+impl std::hash::BuildHasher for SeaRandomState {
+    type Hasher = seahash::SeaHasher;
+
+    fn build_hasher(&self) -> Self::Hasher {
+        seahash::SeaHasher::with_seeds(self.k1, self.k2, self.k3, self.k4)
+    }
+}
+
+impl SeaRandomState {
+    fn new() -> Self {
+        let mut rng = rand::rng();
+        Self {
+            k1: rng.random(),
+            k2: rng.random(),
+            k3: rng.random(),
+            k4: rng.random(),
+        }
+    }
+}
+
+fn small_benchs(c: &mut Criterion) {
+    let mut group = c.benchmark_group("Small maps");
+    group.sample_size(10);
+
+    group.bench_function("small_rehash", |b| {
+        let ideal_filled = 4_000_000;
+        let size = 5_000_000;
+        let mut writer = HashMapInit::new_resizeable(size, size * 2).attach_writer();
+        let mut rng = rand::rng();
+        while writer.get_num_buckets_in_use() < ideal_filled as usize {
+            let key: FileCacheKey = rng.random();
+            let val = FileCacheEntry::dummy();
+            apply_op(TestOp(key, Some(val)), &mut writer);
+        }
+        b.iter(|| writer.shuffle());
+    });
+
+    group.bench_function("small_rehash_xxhash", |b| {
+        let ideal_filled = 4_000_000;
+        let size = 5_000_000;
+        let mut writer = HashMapInit::new_resizeable(size, size * 2)
+            .with_hasher(twox_hash::xxhash64::RandomState::default())
+            .attach_writer();
+        let mut rng = rand::rng();
+        while writer.get_num_buckets_in_use() < ideal_filled as usize {
+            let key: FileCacheKey = rng.random();
+            let val = FileCacheEntry::dummy();
+            apply_op(TestOp(key, Some(val)), &mut writer);
+        }
+        b.iter(|| writer.shuffle());
+    });
+
+    group.bench_function("small_rehash_ahash", |b| {
+        let ideal_filled = 4_000_000;
+        let size = 5_000_000;
+        let mut writer = HashMapInit::new_resizeable(size, size * 2)
+            .with_hasher(ahash::RandomState::default())
+            .attach_writer();
+        let mut rng = rand::rng();
+        while writer.get_num_buckets_in_use() < ideal_filled as usize {
+            let key: FileCacheKey = rng.random();
+            let val = FileCacheEntry::dummy();
+            apply_op(TestOp(key, Some(val)), &mut writer);
+        }
+        b.iter(|| writer.shuffle());
+    });
+
+    group.bench_function("small_rehash_seahash", |b| {
+        let ideal_filled = 4_000_000;
+        let size = 5_000_000;
+        let mut writer = HashMapInit::new_resizeable(size, size * 2)
+            .with_hasher(SeaRandomState::new())
+            .attach_writer();
+        let mut rng = rand::rng();
+        while writer.get_num_buckets_in_use() < ideal_filled as usize {
+            let key: FileCacheKey = rng.random();
+            let val = FileCacheEntry::dummy();
+            apply_op(TestOp(key, Some(val)), &mut writer);
+        }
+        b.iter(|| writer.shuffle());
+    });
+
+    group.finish();
+}
+
+fn real_benchs(c: &mut Criterion) {
+    let mut group = c.benchmark_group("Realistic workloads");
+    group.sample_size(10);
+    group.bench_function("real_bulk_insert", |b| {
+        let size = 125_000_000;
+        let ideal_filled = 100_000_000;
+        let mut rng = rand::rng();
+        b.iter_batched(
+            || HashMapInit::new_resizeable(size, size * 2).attach_writer(),
+            |writer| {
+                for _ in 0..ideal_filled {
+                    let key: FileCacheKey = rng.random();
+                    let val = FileCacheEntry::dummy();
+                    let entry = writer.entry(key);
+                    std::hint::black_box(match entry {
+                        Entry::Occupied(mut e) => {
+                            e.insert(val);
+                        }
+                        Entry::Vacant(e) => {
+                            _ = e.insert(val).unwrap();
+                        }
+                    })
+                }
+            },
+            BatchSize::SmallInput,
+        )
+    });
+
+    group.bench_function("real_rehash", |b| {
+        let size = 125_000_000;
+        let ideal_filled = 100_000_000;
+        let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
+        let mut rng = rand::rng();
+        while writer.get_num_buckets_in_use() < ideal_filled {
+            let key: FileCacheKey = rng.random();
+            let val = FileCacheEntry::dummy();
+            apply_op(TestOp(key, Some(val)), &mut writer);
+        }
+        b.iter(|| writer.shuffle());
+    });
+
+    group.bench_function("real_rehash_hashbrown", |b| {
+        let size = 125_000_000;
+        let ideal_filled = 100_000_000;
+        let mut writer = hashbrown::raw::RawTable::new();
+        let mut rng = rand::rng();
+        let hasher = rustc_hash::FxBuildHasher::default();
+        unsafe {
+            writer
+                .resize(
+                    size,
+                    |(k, _)| hasher.hash_one(&k),
+                    hashbrown::raw::Fallibility::Infallible,
+                )
+                .unwrap();
+        }
+        while writer.len() < ideal_filled as usize {
+            let key: FileCacheKey = rng.random();
+            let val = FileCacheEntry::dummy();
+            writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
+                hasher.hash_one(&k)
+            });
+        }
+        b.iter(|| unsafe {
+            writer.table.rehash_in_place(
+                &|table, index| {
+                    hasher.hash_one(
+                        &table
+                            .bucket::<(FileCacheKey, FileCacheEntry)>(index)
+                            .as_ref()
+                            .0,
+                    )
+                },
+                std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
+                if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
+                    Some(|ptr| std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry)))
+                } else {
+                    None
+                },
+            )
+        });
+    });
+
+    for elems in [2, 4, 8, 16, 32, 64, 96, 112] {
+        group.bench_with_input(
+            BenchmarkId::new("real_rehash_varied", elems),
+            &elems,
+            |b, &size| {
+                let ideal_filled = size * 1_000_000;
+                let size = 125_000_000;
+                let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
+                let mut rng = rand::rng();
+                while writer.get_num_buckets_in_use() < ideal_filled as usize {
+                    let key: FileCacheKey = rng.random();
+                    let val = FileCacheEntry::dummy();
+                    apply_op(TestOp(key, Some(val)), &mut writer);
+                }
+                b.iter(|| writer.shuffle());
+            },
+        );
+        group.bench_with_input(
+            BenchmarkId::new("real_rehash_varied_hashbrown", elems),
+            &elems,
+            |b, &size| {
+                let ideal_filled = size * 1_000_000;
+                let size = 125_000_000;
+                let mut writer = hashbrown::raw::RawTable::new();
+                let mut rng = rand::rng();
+                let hasher = rustc_hash::FxBuildHasher::default();
+                unsafe {
+                    writer
+                        .resize(
+                            size,
+                            |(k, _)| hasher.hash_one(&k),
+                            hashbrown::raw::Fallibility::Infallible,
+                        )
+                        .unwrap();
+                }
+                while writer.len() < ideal_filled as usize {
+                    let key: FileCacheKey = rng.random();
+                    let val = FileCacheEntry::dummy();
+                    writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
+                        hasher.hash_one(&k)
+                    });
+                }
+                b.iter(|| unsafe {
+                    writer.table.rehash_in_place(
+                        &|table, index| {
+                            hasher.hash_one(
+                                &table
+                                    .bucket::<(FileCacheKey, FileCacheEntry)>(index)
+                                    .as_ref()
+                                    .0,
+                            )
+                        },
+                        std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
+                        if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
+                            Some(|ptr| {
+                                std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry))
+                            })
+                        } else {
+                            None
+                        },
+                    )
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, small_benchs, real_benchs);
+criterion_main!(benches);
--- a/libs/neon-shmem/src/hash.rs
+++ b/libs/neon-shmem/src/hash.rs
@@ -0,0 +1,622 @@
+use std::cell::UnsafeCell;
+use std::hash::{BuildHasher, Hash};
+use std::mem::MaybeUninit;
+use std::ptr::NonNull;
+use std::sync::atomic::Ordering;
+
+use crate::shmem::ShmemHandle;
+use crate::{shmem, sync::*};
+
+mod core;
+mod bucket;
+pub mod entry;
+
+#[cfg(test)]
+mod tests;
+
+use core::{
+	CoreHashMap, DictShard, EntryKey, EntryTag,
+	FullError, MaybeUninitDictShard
+};
+use bucket::{Bucket, BucketIdx};
+use entry::Entry;
+
+/// Wrapper struct around multiple [`ShmemHandle`]s.
+struct HashMapHandles {
+	keys_shmem: ShmemHandle,
+	idxs_shmem: ShmemHandle,
+	vals_shmem: ShmemHandle,
+}
+
+/// This represents a hash table that (possibly) lives in shared memory.
+/// If a new process is launched with fork(), the child process inherits
+/// this struct.
+#[must_use]
+pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> {
+    shmem_handles: Option<HashMapHandles>,
+    shared_ptr: *mut HashMapShared<'a, K, V>,
+    hasher: S,
+    num_buckets: usize,
+	num_shards: usize,
+	resize_lock: Mutex<()>,
+}
+
+/// This is a per-process handle to a hash table that (possibly) lives in shared memory.
+/// If a child process is launched with fork(), the child process should
+/// get its own HashMapAccess by calling HashMapInit::attach_writer/reader().
+///
+/// XXX: We're not making use of it at the moment, but this struct could
+/// hold process-local information in the future.
+pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> {
+    shmem_handles: Option<HashMapHandles>,
+    shared_ptr: *mut HashMapShared<'a, K, V>,
+    hasher: S,
+	resize_lock: Mutex<()>,
+}
+
+unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<'_, K, V, S> {}
+unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<'_, K, V, S> {}
+
+impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
+    /// Change the 'hasher' used by the hash table.
+    ///
+    /// NOTE: This must be called right after creating the hash table,
+    /// before inserting any entries and before calling attach_writer/reader.
+    /// Otherwise different accessors could be using different hash function,
+    /// with confusing results.
+	///
+	/// TODO(quantumish): consider splitting out into a separate builder type?
+    pub fn with_hasher<T: BuildHasher>(self, hasher: T) -> HashMapInit<'a, K, V, T> {
+        HashMapInit {
+            hasher,
+            shmem_handles: self.shmem_handles,
+            shared_ptr: self.shared_ptr,
+            num_buckets: self.num_buckets,
+			num_shards: self.num_shards,
+			resize_lock: self.resize_lock,
+        }
+    }
+
+    /// Loosely (over)estimate the size needed to store a hash table with `num_buckets` buckets.
+    pub fn estimate_sizes(num_buckets: usize, num_shards: usize) -> (usize, usize, usize) {
+		(
+			(size_of::<EntryKey<K>>() * num_buckets)
+				+ (size_of::<libc::pthread_rwlock_t>() * num_shards)
+				+ (size_of::<RwLock<DictShard<'_, K>>>() * num_shards)
+				+ size_of::<HashMapShared<K, V>>()
+				+ 1000,
+			(size_of::<BucketIdx>() * num_buckets)+ 1000,
+			(size_of::<Bucket<V>>() * num_buckets) + 1000
+		)
+	}
+
+	fn carve_space<T>(ptr: &mut *mut u8, amount: usize) -> *mut T {
+		*ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<T>())) };
+        let out = ptr.cast();
+        *ptr = unsafe { ptr.add(size_of::<T>() * amount) };
+		out
+	}
+	
+    fn new(
+        num_buckets: usize,
+		num_shards: usize,
+        mut keys_ptr: *mut u8,
+		mut idxs_ptr: *mut u8,
+		mut vals_ptr: *mut u8,
+        shmem_handles: Option<HashMapHandles>,
+        hasher: S,
+    ) -> Self {
+		// Set up the main area: hashmap info at front, keys at back
+		let mutex_ptr = Self::carve_space::<libc::pthread_mutex_t>(&mut keys_ptr, 1);
+		let shared_ptr = Self::carve_space::<HashMapShared<K, V>>(&mut keys_ptr, 1);
+		let shards_ptr = Self::carve_space::<RwLock<DictShard<'_, K>>>(&mut keys_ptr, num_shards);
+		let locks_ptr = Self::carve_space::<libc::pthread_rwlock_t>(&mut keys_ptr, num_shards);
+		let keys_ptr = Self::carve_space::<EntryKey<K>>(&mut keys_ptr, num_buckets);
+		
+		// Set up the area of bucket idxs and the area of buckets. Not much to do!
+		let idxs_ptr = Self::carve_space::<BucketIdx>(&mut idxs_ptr, num_buckets);
+		let vals_ptr = Self::carve_space::<Bucket<V>>(&mut vals_ptr, num_buckets);
+
+		// Initialize the shards.
+		let shards_uninit: &mut [MaybeUninit<RwLock<MaybeUninitDictShard<'_, K>>>] =
+            unsafe { std::slice::from_raw_parts_mut(shards_ptr.cast(), num_shards) };
+		let shard_size = num_buckets / num_shards;
+		for i in 0..num_shards {
+			let size = ((i + 1) * shard_size).min(num_buckets) - (i * shard_size);
+			unsafe {
+				shards_uninit[i].write(RwLock::from_raw(
+					PthreadRwLock::new(NonNull::new_unchecked(locks_ptr.add(i))),
+					MaybeUninitDictShard {
+						keys: std::slice::from_raw_parts_mut(keys_ptr.add(i * shard_size).cast(), size),
+						idxs: std::slice::from_raw_parts_mut(idxs_ptr.add(i * shard_size).cast(), size)
+					}
+				));
+			};
+		}
+		let shards: &mut [RwLock<MaybeUninitDictShard<'_, K>>] =
+            unsafe { std::slice::from_raw_parts_mut(shards_ptr.cast(), num_shards) };
+        let buckets: *const [MaybeUninit<Bucket<V>>] = 
+            unsafe { std::slice::from_raw_parts(vals_ptr.cast(), num_buckets) };
+
+		unsafe { 
+			let hashmap = CoreHashMap::new(&*(buckets as *const UnsafeCell<_>), shards);
+			std::ptr::write(shared_ptr, hashmap);
+		}
+
+		let resize_lock = Mutex::from_raw(
+			unsafe { PthreadMutex::new(NonNull::new_unchecked(mutex_ptr)) }, ()
+		);
+		
+        Self {
+			num_shards,
+            num_buckets,
+            shmem_handles,
+            shared_ptr,
+            hasher,
+			resize_lock, 
+        }
+    }
+
+    /// Attach to a hash table for writing.
+    pub fn attach_writer(self) -> HashMapAccess<'a, K, V, S> {
+        HashMapAccess {
+            shmem_handles: self.shmem_handles,
+            shared_ptr: self.shared_ptr,
+            hasher: self.hasher,
+			resize_lock: self.resize_lock,
+        }
+    }
+
+    /// Initialize a table for reading. Currently identical to [`HashMapInit::attach_writer`].
+    pub fn attach_reader(self) -> HashMapAccess<'a, K, V, S> {
+        self.attach_writer()
+    }
+}
+
+type HashMapShared<'a, K, V> = CoreHashMap<'a, K, V>;
+
+impl<'a, K, V> HashMapInit<'a, K, V, rustc_hash::FxBuildHasher>
+where
+    K: Clone + Hash + Eq,
+{
+    /// Place the hash table within a user-supplied fixed memory area.
+    pub fn with_fixed(
+		num_buckets: usize,
+		num_shards: usize,
+		area: &'a mut [MaybeUninit<u8>]
+	) -> Self {
+		let (keys_size, idxs_size, _) = Self::estimate_sizes(num_buckets, num_shards);
+		let ptr = area.as_mut_ptr().cast();
+        Self::new(
+            num_buckets,
+			num_shards,
+            ptr,
+			unsafe { ptr.add(keys_size) },
+			unsafe { ptr.add(keys_size).add(idxs_size) },
+            None,
+            rustc_hash::FxBuildHasher,
+        )
+    }
+
+    /// Place a new hash map in the given shared memory area
+    ///
+    /// # Panics
+    /// Will panic on failure to resize area to expected map size.
+    pub fn with_shmems(
+		num_buckets: usize,
+		num_shards: usize,
+		keys_shmem: ShmemHandle,
+		idxs_shmem: ShmemHandle,
+		vals_shmem: ShmemHandle,
+	) -> Self {
+		let (keys_size, idxs_size, vals_size) = Self::estimate_sizes(num_buckets, num_shards);
+        keys_shmem.set_size(keys_size).expect("could not resize shared memory area");
+        idxs_shmem.set_size(idxs_size).expect("could not resize shared memory area");
+        vals_shmem.set_size(vals_size).expect("could not resize shared memory area");
+        Self::new(
+            num_buckets,
+			num_shards,
+            keys_shmem.data_ptr.as_ptr().cast(),
+			idxs_shmem.data_ptr.as_ptr().cast(),
+			vals_shmem.data_ptr.as_ptr().cast(),
+            Some(HashMapHandles { keys_shmem, idxs_shmem, vals_shmem }),
+            rustc_hash::FxBuildHasher,
+        )
+    }
+
+    /// Make a resizable hash map within a new shared memory area with the given name.
+    pub fn new_resizeable_named(
+		num_buckets: usize,
+		max_buckets: usize,
+		num_shards: usize,
+		name: &str
+	) -> Self {
+		let (keys_size, idxs_size, vals_size) = Self::estimate_sizes(num_buckets, num_shards);
+		let (keys_max, idxs_max, vals_max) = Self::estimate_sizes(max_buckets, num_shards);
+        let keys_shmem = ShmemHandle::new(&format!("{name}_keys"), keys_size, keys_max)
+			.expect("failed to make shared memory area");
+		let idxs_shmem = ShmemHandle::new(&format!("{name}_idxs"), idxs_size, idxs_max)
+			.expect("failed to make shared memory area");
+		let vals_shmem = ShmemHandle::new(&format!("{name}_vals"), vals_size, vals_max)
+			.expect("failed to make shared memory area");
+        Self::new(
+            num_buckets,
+			num_shards,
+            keys_shmem.data_ptr.as_ptr().cast(),
+			idxs_shmem.data_ptr.as_ptr().cast(),
+			vals_shmem.data_ptr.as_ptr().cast(),
+            Some(HashMapHandles { keys_shmem, idxs_shmem, vals_shmem }),
+            rustc_hash::FxBuildHasher,
+        )
+    }
+
+    /// Make a resizable hash map within a new anonymous shared memory area.
+    pub fn new_resizeable(
+		num_buckets: usize,
+		max_buckets: usize,
+		num_shards: usize,
+	) -> Self {
+        use std::sync::atomic::{AtomicUsize, Ordering};
+        static COUNTER: AtomicUsize = AtomicUsize::new(0);
+        let val = COUNTER.fetch_add(1, Ordering::Relaxed);
+        let name = format!("neon_shmem_hmap{val}");
+        Self::new_resizeable_named(num_buckets, max_buckets, num_shards, &name)
+    }
+}
+
+impl<'a, K, V, S: BuildHasher> HashMapAccess<'a, K, V, S>
+where
+    K: Clone + Hash + Eq,
+{
+    /// Hash a key using the map's hasher.
+    #[inline]
+    fn get_hash_value(&self, key: &K) -> u64 {
+        self.hasher.hash_one(key)
+    }
+
+    /// Get a reference to the corresponding value for a key.
+    pub fn get<'e>(&'e self, key: &K) -> Option<ValueReadGuard<'e, V>> {
+        let hash = self.get_hash_value(key);
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+		map.get_with_hash(key, hash)
+    }
+
+    /// Get a reference to the entry containing a key.
+    pub fn entry(&self, key: K) -> Result<Entry<'a, K, V>, FullError> {
+        let hash = self.get_hash_value(&key);
+		let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
+        map.entry_with_hash(key, hash)
+    }
+
+    /// Remove a key. Returns the associated value if it existed.
+    pub fn remove(&self, key: &K) -> Option<V> {
+		let hash = self.get_hash_value(key);
+		let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
+        match map.entry_with_hash(key.clone(), hash) {
+            Ok(Entry::Occupied(mut e)) => Some(e.remove()),
+            _ => None,
+        }
+    }
+
+    /// Insert/update a key. Returns the previous associated value if it existed.
+    ///
+    /// # Errors
+    /// Will return [`core::FullError`] if there is no more space left in the map.
+    pub fn insert(&self, key: K, value: V) -> Result<Option<V>, core::FullError> {
+        let hash = self.get_hash_value(&key);
+		let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
+        match map.entry_with_hash(key, hash)? {
+            Entry::Occupied(mut e) => Ok(Some(e.insert(value))),
+            Entry::Vacant(e) => {
+                _ = e.insert(value);
+                Ok(None)
+            }
+        }
+    }
+
+    pub unsafe fn get_at_bucket(&self, pos: usize) -> Option<&V> {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        if pos >= map.bucket_arr.len() {
+            return None;
+        }
+		// NOTE(review): a full `next` index is taken to mean the bucket holds an initialized value.
+		let bucket = &map.bucket_arr[pos];
+		if bucket.next.load(Ordering::Relaxed).full_checked().is_some() {
+			Some(unsafe { bucket.val.assume_init_ref() })
+		} else {
+			None
+		}
+    }
+
+	/// Returns an occupied entry for the value stored in bucket `pos`, or `None` if `pos` is out of
+	/// range or the bucket is not full. Takes the write lock of the shard that holds the entry's key.
+	// NOTE(review): no safety contract is documented for this `unsafe fn` — confirm with callers.
+	pub unsafe fn entry_at_bucket(&self, pos: usize) -> Option<entry::OccupiedEntry<'a, K, V>> {
+        let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
+        if pos >= map.bucket_arr.len() {
+            return None;
+        }
+		let bucket = &map.bucket_arr[pos];
+		let entry_pos = bucket.next.load(Ordering::Acquire).full_checked()?;
+		let shard_size = map.get_num_buckets() / map.dict_shards.len();
+		let shard = map.dict_shards[entry_pos / shard_size].write();
+		// The bucket may have been freed (or reused for another key) while we waited for the lock.
+		// Removal goes through an `OccupiedEntry`, which holds this shard's lock, so re-check now.
+		if bucket.next.load(Ordering::Acquire).full_checked() != Some(entry_pos) {
+			return None;
+		}
+		let shard_pos = entry_pos % shard_size;
+		Some(entry::OccupiedEntry { shard, shard_pos, bucket_pos: pos, bucket_arr: &map.bucket_arr, key_pos: entry_pos })
+    }
+	
+    /// Returns the number of buckets in the table.
+    pub fn get_num_buckets(&self) -> usize {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        map.get_num_buckets()
+    }
+
+    /// Returns the index of the bucket whose storage contains `val_ptr`.
+    ///
+    /// Panics if the computed index does not fall inside the bucket array.
+    pub fn get_bucket_for_value(&self, val_ptr: *const V) -> usize {
+        let shared = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let base_addr = shared.bucket_arr.as_mut_ptr() as usize;
+        let bucket_idx = (val_ptr as usize - base_addr) / size_of::<Bucket<V>>();
+        assert!(bucket_idx < shared.bucket_arr.len());
+        bucket_idx
+    }
+
+    /// Returns the number of occupied buckets in the table (a relaxed read of the shared counter).
+    pub fn get_num_buckets_in_use(&self) -> usize {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        map.bucket_arr.buckets_in_use.load(Ordering::Relaxed)
+    }
+
+    /// Clears all entries in a table. Does not reset any shrinking operations.
+    pub fn clear(&self) {
+        let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
+        map.clear(); // delegates to the shared map's own `clear`
+	}
+
+	/// Begin a rehash operation: retags `Occupied` entries as `Rehash` and `Tombstone` entries as
+	/// `RehashTombstone`, then resets the rehash cursor to cover `rehash_buckets` slots. Returns
+	/// `false` (changing nothing) if an earlier rehash is still underway.
+	// One future feature could be to allow interruptible resizes. We wouldn't pay much of a space
+	// penalty if we used something like https://crates.io/crates/u4 inside EntryTag to allow for many
+	// tiers of older chains (tracking previous sizes in a sliding window at the front of the region).
+    fn begin_rehash(
+		&self,
+		shards: &mut Vec<RwLockWriteGuard<'_, DictShard<'_, K>>>,
+		rehash_buckets: usize
+	) -> bool {
+        let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
+		assert!(rehash_buckets <= map.get_num_buckets(), "rehashing subset of buckets");
+		// `rehash_index < rehash_end` means a rehash is in progress (both are equal when idle).
+		if map.rehash_index.load(Ordering::Relaxed) < map.rehash_end.load(Ordering::Relaxed) {
+			return false;
+		}
+
+		shards.iter_mut().for_each(|x| x.keys.iter_mut().for_each(|key| {
+			match key.tag {
+				EntryTag::Occupied => key.tag = EntryTag::Rehash,
+				EntryTag::Tombstone => key.tag = EntryTag::RehashTombstone,
+				_ => (),
+			}
+		}));
+
+		map.rehash_index.store(0, Ordering::Relaxed);
+		map.rehash_end.store(rehash_buckets, Ordering::Relaxed);
+		true
+    }
+
+	// Unfinished, final large-ish piece standing in the way of a prototype. Handles one chunk per call.
+	// Returns `true` only if no rehash work remained on entry; `false` if the lock was busy or a chunk ran.
+	// Based off the hashbrown implementation but adapted to an incremental context. See below:
+	// https://github.com/quantumish/hashbrown/blob/6610e6d2b1f288ef7b0709a3efefbc846395dc5e/src/raw/mod.rs#L2866
+	fn do_rehash(&self) -> bool {
+		let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
+		// TODO(quantumish): refactor these out into settable quantities
+		const REHASH_CHUNK_SIZE: usize = 10;
+
+		let end = map.rehash_end.load(Ordering::Relaxed);
+		let ind = map.rehash_index.load(Ordering::Relaxed);
+		if ind >= end { return true }
+
+		// We have to use a mutex to prevent concurrent rehashes as they provide a pretty
+		// obvious chance at a deadlock: one thread wants to rehash an entry into a shard
+		// which is held by another thread which wants to rehash its block into the shard
+		// held by the first. Doesn't seem like there's an obvious way around this?
+		let _guard = self.resize_lock.try_lock();
+		if _guard.is_none() { return false }
+		// Claim the chunk [ind, min(ind + REHASH_CHUNK_SIZE, end)) by advancing the shared cursor first.
+		map.rehash_index.store((ind+REHASH_CHUNK_SIZE).min(end), Ordering::Relaxed);
+		
+		let shard_size = map.get_num_buckets() / map.dict_shards.len();
+		for i in ind..(ind+REHASH_CHUNK_SIZE).min(end) {
+			let (shard_index, shard_off) = (i / shard_size, i % shard_size);
+			let mut shard = map.dict_shards[shard_index].write();
+			if shard.keys[shard_off].tag != EntryTag::Rehash {
+				continue;
+			}
+			loop {
+				let hash = self.get_hash_value(unsafe {
+					shard.keys[shard_off].val.assume_init_ref()
+				});
+
+				let key = unsafe { shard.keys[shard_off].val.assume_init_ref() }.clone();
+				let new = map.entry(key, hash, |tag| match tag {
+					EntryTag::Empty => core::MapEntryType::Empty,
+					EntryTag::Occupied => core::MapEntryType::Occupied,
+					EntryTag::Tombstone => core::MapEntryType::Skip,
+					_ => core::MapEntryType::Tombstone,
+				}).unwrap();
+
+				// I believe the blocker here is that this unfortunately would require
+				// duplicating a lot of the logic of a write lookup again but with the caveat
+				// that we're already holding one of the shard locks and need to pass that
+				// context on. One thing I was considering at the time was using a hashmap to
+				// manage the lock guards and passing that around?
+				todo!("finish rehash implementation")
+				// match new.tag() {
+				// 	EntryTag::Empty | EntryTag::RehashTombstone => {
+				// 		shard.keys[shard_off].tag = EntryTag::Empty;
+				// 		unsafe {
+				// 			std::mem::swap(
+				// 				shard.keys[shard_off].val.assume_init_mut(),
+				// 				new.
+				// 	},
+				// 	EntryTag::Rehash => {
+						
+				// 	},
+				// 	_ => unreachable!()
+				// }
+			}
+		}
+		false
+	}
+
+	/// Drive any in-progress rehash to completion; spins while another thread holds the resize lock.
+	pub fn finish_rehash(&self) {
+		while !self.do_rehash() {} // `do_rehash` returns `true` once nothing is left to rehash
+	}
+
+	pub fn shuffle(&self) { // Write-lock every shard, then start a rehash covering all buckets.
+        let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
+		let mut shards: Vec<_> = map.dict_shards.iter().map(|x| x.write()).collect();
+		self.begin_rehash(&mut shards, map.get_num_buckets()); // NOTE(review): the `bool` result is ignored
+    }
+	
+	fn reshard(&self, shards: &mut Vec<RwLockWriteGuard<'_, DictShard<'_, K>>>, num_buckets: usize) {
+		let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
+		let shard_size = num_buckets / map.dict_shards.len(); // every shard gets the same slot count
+		// The key/index arrays are contiguous from shard 0 (see `grow`), so shard `i` must *start* at
+		let (keys, idxs) = (shards[0].keys.as_mut_ptr(), shards[0].idxs.as_mut_ptr()); // `i * shard_size`.
+		for i in 0..map.dict_shards.len() {
+			unsafe {
+				shards[i].keys = std::slice::from_raw_parts_mut(keys.add(i * shard_size), shard_size);
+				shards[i].idxs = std::slice::from_raw_parts_mut(idxs.add(i * shard_size), shard_size);
+			}
+		}
+	}
+
+	fn resize_shmem(&self, num_buckets: usize) -> Result<(), shmem::Error> { // resize the three shmem areas
+		let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
+		let shmem_handles = self
+            .shmem_handles
+            .as_ref()
+            .expect("grow called on a fixed-size hash table"); // NOTE(review): also reached from `finish_shrink`
+
+		let (keys_size, idxs_size, vals_size) =
+			HashMapInit::<K, V, S>::estimate_sizes(num_buckets, map.dict_shards.len());
+        shmem_handles.keys_shmem.set_size(keys_size)?; // an early `?` return leaves later areas unresized
+		shmem_handles.idxs_shmem.set_size(idxs_size)?;
+		shmem_handles.vals_shmem.set_size(vals_size)?;
+		Ok(())
+	}
+
+    pub fn grow(&self, num_buckets: usize) -> Result<(), shmem::Error> {
+        let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
+		let _resize_guard = self.resize_lock.lock();
+		let mut shards: Vec<_> = map.dict_shards.iter().map(|x| x.write()).collect();
+		// Grows the table to `num_buckets`; the resize lock and every shard write lock are held throughout.
+		let old_num_buckets = map.bucket_arr.len();
+        assert!(
+            num_buckets >= old_num_buckets,
+            "grow called with a smaller number of buckets"
+        );
+        if num_buckets == old_num_buckets {
+            return Ok(());
+        }
+
+		// Grow memory areas and initialize each of them.
+		self.resize_shmem(num_buckets)?;
+        unsafe {
+			let buckets_ptr = map.bucket_arr.as_mut_ptr();
+            for i in old_num_buckets..num_buckets {
+                let bucket = buckets_ptr.add(i);
+                bucket.write(Bucket::empty(
+                    if i < num_buckets - 1 {
+                        BucketIdx::new(i + 1)
+                    } else {
+                        map.bucket_arr.free_head.load(Ordering::Relaxed) // last new bucket chains to the old free list
+                    }
+                ));
+            }
+
+			// TODO(quantumish) a bit questionable to use pointers here
+			let first_shard = &mut shards[0];
+			let keys_ptr = first_shard.keys.as_mut_ptr();
+			for i in old_num_buckets..num_buckets { // new key slots, indexed from the start of shard 0
+                let key = keys_ptr.add(i);
+                key.write(EntryKey {
+					tag: EntryTag::Empty,
+					val: MaybeUninit::uninit(),
+				});
+            }
+			
+			let idxs_ptr = first_shard.idxs.as_mut_ptr();
+			for i in old_num_buckets..num_buckets {
+                let idx = idxs_ptr.add(i);
+                idx.write(BucketIdx::INVALID);
+            }
+        }
+		// NOTE(review): `bucket_arr`'s slice length is not updated in this function — confirm `get_num_buckets()` sees the new size.
+		self.reshard(&mut shards, num_buckets);
+        map.bucket_arr.free_head.store(
+			BucketIdx::new(old_num_buckets), Ordering::Relaxed
+		);
+        self.begin_rehash(&mut shards, old_num_buckets);
+        Ok(())
+    }
+
+    pub fn begin_shrink(&mut self, num_buckets: usize) { // records the shrink target; memory is released in `finish_shrink`
+        let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
+		let _resize_guard = self.resize_lock.lock();
+        assert!(
+            num_buckets <= map.get_num_buckets(),
+            "shrink called with a larger number of buckets"
+        );
+        _ = self
+            .shmem_handles
+            .as_ref()
+            .expect("shrink called on a fixed-size hash table"); // only checks that the table is resizable
+        map.bucket_arr.alloc_limit.store(
+			BucketIdx::new(num_buckets), Ordering::SeqCst
+		);
+    }
+
+	// TODO(quantumish): Safety? Maybe replace this with expanded version of finish_shrink?
+    pub fn shrink_goal(&self) -> Option<usize> { // the value stored by `begin_shrink`, if any
+        let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
+        let goal = map.bucket_arr.alloc_limit.load(Ordering::Relaxed);
+		goal.next_checked() // `None` while `alloc_limit` is `INVALID`
+	}
+
+    pub fn finish_shrink(&self) -> Result<(), shmem::Error> {
+        let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
+		let _resize_guard = self.resize_lock.lock();
+		let mut shards: Vec<_> = map.dict_shards.iter().map(|x| x.write()).collect();
+		// The shrink target is whatever `begin_shrink` stored in `alloc_limit`.
+        let num_buckets = map.bucket_arr.alloc_limit
+			.load(Ordering::Relaxed)
+			.next_checked()
+			.expect("called finish_shrink when no shrink is in progress");
+        
+        if map.get_num_buckets() == num_buckets {
+            return Ok(()); // NOTE(review): `alloc_limit` is left set on this path — confirm intended
+        }
+
+        assert!(
+            map.bucket_arr.buckets_in_use.load(Ordering::Relaxed) <= num_buckets,
+            "called finish_shrink before enough entries were removed"
+        );
+		// NOTE(review): only the in-use *count* is checked; nothing here relocates buckets at index >= num_buckets.
+		self.resize_shmem(num_buckets)?;
+
+		self.reshard(&mut shards, num_buckets);
+		
+        map.bucket_arr.alloc_limit.store(BucketIdx::INVALID, Ordering::Relaxed);
+        self.begin_rehash(&mut shards, num_buckets);
+
+        Ok(())
+    }
+}
--- a/libs/neon-shmem/src/hash/bucket.rs
+++ b/libs/neon-shmem/src/hash/bucket.rs
@@ -0,0 +1,301 @@
+//! Lock-free stable array of buckets managed with a freelist.
+//!
+//! Since the positions of entries in the dictionary and the bucket array are not correlated,
+//! we either had to separately shard both and deal with the overhead of two lock acquisitions
+//! per read/write, or make the bucket array lock free. This is *generally* fine since most
+//! accesses of the bucket array are done while holding the lock on the corresponding dict shard
+//! and thus synchronized. May not hold up to the removals done by the LFC which is a problem.
+//!
+//! Routines are pretty closely adapted from https://timharris.uk/papers/2001-disc.pdf 
+//! 
+//! Notable caveats:
+//! - Can only store around 2^30 entries, which is actually only 10x our current workload.
+//!  - This is because we need two tag bits to distinguish full/empty and marked/unmarked entries.
+//! - Has not been seriously tested.
+//!
+//! Full entries also store the index to their corresponding dictionary entry in order
+//! to enable .entry_at_bucket() which is needed for the clock eviction algo in the LFC.
+
+use std::cell::UnsafeCell;
+use std::mem::MaybeUninit;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use atomic::Atomic;
+
+#[derive(bytemuck::NoUninit, Clone, Copy, PartialEq, Eq)]
+#[repr(transparent)]
+pub(crate) struct BucketIdx(pub(super) u32); // top 2 bits: tag, low 30 bits: index
+
+// This should always be true as `BucketIdx` is a simple newtype around `u32`.
+const _: () = assert!(Atomic::<BucketIdx>::is_lock_free());
+
+impl BucketIdx {
+	/// Tag for next pointers in free entries.
+	const NEXT_TAG: u32 = 0b00 << 30;
+	/// Tag for marked next pointers in free entries.
+	const MARK_TAG: u32 = 0b01 << 30;
+	/// Tag for full entries.
+	const FULL_TAG: u32 = 0b10 << 30;
+	/// Reserved tag value. Both tag bits are set, so it also serves as the tag mask below.
+	const RSVD_TAG: u32 = 0b11 << 30;
+
+	/// Invalid index within the bucket array (can be mixed with any tag).
+	pub const INVALID: Self = Self(0x3FFFFFFF);
+	/// Max index within the bucket array (can be mixed with any tag).
+	pub const MAX: usize = Self::INVALID.0 as usize - 1;
+
+	/// Whether this value carries the MARK tag.
+	pub(super) fn is_marked(&self) -> bool {
+		self.0 & Self::RSVD_TAG == Self::MARK_TAG
+	}
+
+	/// Same index, retagged as MARK.
+	pub(super) fn as_marked(self) -> Self {
+		Self((self.0 & Self::INVALID.0) | Self::MARK_TAG)
+	}
+
+	/// Same index with the tag bits cleared (i.e. tagged NEXT).
+	pub(super) fn get_unmarked(self) -> Self {
+		Self(self.0 & Self::INVALID.0)
+	}
+
+	/// Create a NEXT-tagged index. `val` may be anything up to and including [`Self::MAX`].
+	pub fn new(val: usize) -> Self {
+		debug_assert!(val <= Self::MAX);
+		Self(val as u32)
+	}
+
+	/// Create a FULL-tagged index. `val` may be anything up to and including [`Self::MAX`].
+	pub fn new_full(val: usize) -> Self {
+		debug_assert!(val <= Self::MAX);
+		Self(val as u32 | Self::FULL_TAG)
+	}
+
+	/// Try to extract a valid index if the tag is NEXT.
+	/// Returns `None` for `INVALID` and for MARK/FULL-tagged values.
+	pub fn next_checked(&self) -> Option<usize> {
+		let is_next = self.0 & Self::RSVD_TAG == Self::NEXT_TAG && *self != Self::INVALID;
+		is_next.then(|| self.0 as usize)
+	}
+
+	/// Try to extract an index if the tag is FULL.
+	pub fn full_checked(&self) -> Option<usize> {
+		let is_full = self.0 & Self::RSVD_TAG == Self::FULL_TAG;
+		is_full.then(|| (self.0 & Self::INVALID.0) as usize)
+	}
+}
+
+/// Entry within the bucket array. Value is only initialized if `next` is tagged FULL.
+pub(crate) struct Bucket<V> {
+	// Only initialized if `next` field is tagged with FULL.
+	pub val: MaybeUninit<V>,
+	// Either points to next entry in freelist if empty or points
+	// to the corresponding entry in dictionary if full.
+	pub next: Atomic<BucketIdx>,
+}
+
+impl<V> Bucket<V> {
+	/// Build a bucket that holds no value yet and whose `next` field is `next`.
+	pub fn empty(next: BucketIdx) -> Self { Self { next: Atomic::new(next), val: MaybeUninit::uninit() } }
+
+	/// Shared access to the stored value.
+	/// The value is assumed to be initialized; nothing here checks it.
+	pub fn as_ref(&self) -> &V {
+		unsafe { &*self.val.as_ptr() }
+	}
+
+	/// Exclusive access to the stored value (assumed initialized).
+	pub fn as_mut(&mut self) -> &mut V {
+		unsafe { &mut *self.val.as_mut_ptr() }
+	}
+
+	/// Store `new_val` and hand back the value that was there before (assumed initialized).
+	pub fn replace(&mut self, new_val: V) -> V {
+		std::mem::replace(self.as_mut(), new_val)
+	}
+}
+
+pub(crate) struct BucketArray<'a, V> {
+	/// Buckets containing values.
+    pub(crate) buckets: &'a UnsafeCell<[Bucket<V>]>,
+    /// Head of the freelist.
+    pub(crate) free_head: Atomic<BucketIdx>,
+    /// Allocation limit while a shrink is pending (`begin_shrink` stores its target here); `INVALID` otherwise.
+    pub(crate) alloc_limit: Atomic<BucketIdx>,
+    /// The number of currently occupied buckets.
+    pub(crate) buckets_in_use: AtomicUsize,
+    // Unclear what the purpose of this is. It is only ever initialized (to 0) in this file.
+    pub(crate) _user_list_head: Atomic<BucketIdx>,
+}
+
+impl<'a, V> std::ops::Index<usize> for BucketArray<'a, V> {
+	type Output = Bucket<V>;
+	/// Shared access to bucket `index`; panics when `index` is out of bounds.
+	fn index(&self, index: usize) -> &Self::Output {
+		let slots: &[Bucket<V>] = unsafe { &*self.buckets.get() };
+		&slots[index]
+	}
+}
+
+impl<'a, V> std::ops::IndexMut<usize> for BucketArray<'a, V> {
+	fn index_mut(&mut self, index: usize) -> &mut Self::Output { // panics when out of bounds
+		let slots: &mut [Bucket<V>] = unsafe { &mut *self.buckets.get() };
+		&mut slots[index]
+	}
+}
+
+impl<'a, V> BucketArray<'a, V> {
+	pub fn new(buckets: &'a UnsafeCell<[Bucket<V>]>) -> Self { // wraps `buckets`; does not initialize them
+		Self {
+			buckets,
+			free_head: Atomic::new(BucketIdx(0)), // free list starts at bucket 0
+			_user_list_head: Atomic::new(BucketIdx(0)),
+			alloc_limit: Atomic::new(BucketIdx::INVALID), // no shrink pending
+			buckets_in_use: 0.into(),
+		}
+	}
+
+	pub fn as_mut_ptr(&self) -> *mut Bucket<V> {
+		self.buckets.get() as *mut Bucket<V> // slice pointer -> pointer to its first bucket
+	}
+
+	pub fn get_mut(&self, index: usize) -> &mut Bucket<V> { // NOTE(review): hands out `&mut` from `&self`; callers must avoid aliasing
+		let buckets: &mut [_] = unsafe { &mut *(self.buckets.get() as *mut _) };
+		&mut buckets[index]
+	}
+	
+	pub fn len(&self) -> usize { // number of buckets in the backing slice
+		unsafe { (&*self.buckets.get()).len() }
+	}
+
+	/// Deallocate a bucket, adding it to the free list. Returns the value the bucket held.
+	// Adapted from List::insert in https://timharris.uk/papers/2001-disc.pdf
+	pub fn dealloc_bucket(&self, pos: usize) -> V {
+		// Move the value out *before* publishing the bucket: once it is on the free list a concurrent
+		let val = unsafe { self[pos].val.assume_init_read() }; // `alloc_bucket` may overwrite the slot.
+		loop {
+			let free = self.free_head.load(Ordering::Relaxed);
+			self[pos].next.store(free, Ordering::Relaxed);
+			if self.free_head.compare_exchange_weak(free, BucketIdx::new(pos), Ordering::Release, Ordering::Relaxed).is_ok() {
+				self.buckets_in_use.fetch_sub(1, Ordering::Relaxed);
+				return val;
+			}
+		}
+	}
+
+	/// Find a usable bucket at the front of the free list. Returns `(left, right)`: `right` is the candidate (`INVALID` if none), `left` its predecessor (`INVALID` when that is `free_head`).
+	// Adapted from List::search in https://timharris.uk/papers/2001-disc.pdf
+	#[allow(unused_assignments)]
+	fn find_bucket(&self) -> (BucketIdx, BucketIdx) {
+		let mut left_node = BucketIdx::INVALID;
+		let mut right_node = BucketIdx::INVALID;
+		let mut left_node_next = BucketIdx::INVALID;
+		// The shrink limit is the post-shrink bucket *count*, so index `limit` itself is already out of range.
+		loop { 
+			let mut t = BucketIdx::INVALID;
+			let mut t_next = self.free_head.load(Ordering::Relaxed);
+			let alloc_limit = self.alloc_limit.load(Ordering::Relaxed).next_checked();
+			while t_next.is_marked() || t.next_checked()
+				.map_or(true, |v| alloc_limit.map_or(false, |l| v >= l))
+			{
+				if !t_next.is_marked() {
+					left_node = t;
+					left_node_next = t_next;
+				}
+				t = t_next.get_unmarked();
+				if t == BucketIdx::INVALID { break }
+				t_next = self[t.0 as usize].next.load(Ordering::Relaxed);
+			}
+			right_node = t;
+
+			if left_node_next == right_node {
+				if right_node != BucketIdx::INVALID && self[right_node.0 as usize]
+					.next.load(Ordering::Relaxed).is_marked()
+				{					
+					continue;
+				} else {
+					return (left_node, right_node);
+				}
+			}
+
+			let left_ref = if left_node != BucketIdx::INVALID {
+				&self[left_node.0 as usize].next					
+			} else { &self.free_head };
+			// Unlink the skipped run between `left` and `right` in one step.
+			if left_ref.compare_exchange_weak(
+				left_node_next, right_node, Ordering::Relaxed, Ordering::Relaxed
+			).is_ok() {
+				if right_node != BucketIdx::INVALID && self[right_node.0 as usize]
+					.next.load(Ordering::Relaxed).is_marked()
+				{
+					continue;
+				} else {
+					return (left_node, right_node);
+				}
+			}			
+		}
+	}
+
+	/// Pop a bucket from the free list, store `value` in it and tag it FULL with `key_pos`. Returns `None` if no usable bucket is left.
+	// Adapted from List::delete in https://timharris.uk/papers/2001-disc.pdf
+	#[allow(unused_assignments)]
+    pub(crate) fn alloc_bucket(&self, value: V, key_pos: usize) -> Option<BucketIdx> {
+		let mut right_node_next = BucketIdx::INVALID;
+		let mut left_idx = BucketIdx::INVALID;
+		let mut right_idx = BucketIdx::INVALID;
+		
+		loop {
+			(left_idx, right_idx) = self.find_bucket();
+			if right_idx == BucketIdx::INVALID {
+				return None;
+			}
+			// Logically delete the candidate by marking its `next` pointer.
+			let right = &self[right_idx.0 as usize];
+			right_node_next = right.next.load(Ordering::Relaxed);
+			if !right_node_next.is_marked() {
+				if right.next.compare_exchange_weak(
+					right_node_next, right_node_next.as_marked(),
+					Ordering::Relaxed, Ordering::Relaxed
+				).is_ok() {
+					break;
+				}
+			}
+		}
+
+		let left_ref = if left_idx != BucketIdx::INVALID {
+			&self[left_idx.0 as usize].next
+		} else {
+			&self.free_head
+		};
+		// Single attempt, so use the strong CAS: the weak form may fail spuriously and hit `todo!()`.
+		if left_ref.compare_exchange(
+			right_idx, right_node_next,
+			Ordering::Relaxed, Ordering::Relaxed
+		).is_err() {
+			todo!() // NOTE(review): a genuine failure (the list changed underneath us) is still unhandled
+		}
+
+        self.buckets_in_use.fetch_add(1, Ordering::Relaxed);
+		// Initialize the value first, then publish the FULL tag with Release, so that a reader which
+		// observes the tag (with an Acquire load) also observes the value.
+		self.get_mut(right_idx.0 as usize).val.write(value);
+		self[right_idx.0 as usize].next.store(BucketIdx::new_full(key_pos), Ordering::Release);
+		Some(right_idx)
+    }
+
+	/// Reset every bucket to empty and rebuild the free list as 0 -> 1 -> ... -> last -> `INVALID`.
+	pub fn clear(&mut self) {
+		let count = self.len();
+		for pos in 0..count {
+			let successor = if pos + 1 < count {
+				BucketIdx::new(pos + 1)
+			} else {
+				BucketIdx::INVALID
+			};
+			self[pos] = Bucket::empty(successor);
+        }
+		self.free_head.store(BucketIdx(0), Ordering::Relaxed);
+        self.buckets_in_use.store(0, Ordering::Relaxed);
+	}
+}
+
--- a/libs/neon-shmem/src/hash/core.rs
+++ b/libs/neon-shmem/src/hash/core.rs
@@ -0,0 +1,335 @@
+//! Sharded linear probing hash table.
+
+//! NOTE/FIXME: one major bug with this design is that the current hashmap DOES NOT TRACK
+//! the previous size of the hashmap and thus does lookups incorrectly/badly. This should
+//! be a reasonably minor fix?
+
+use std::cell::UnsafeCell;
+use std::hash::Hash;
+use std::mem::MaybeUninit;
+use std::sync::atomic::{Ordering, AtomicUsize};
+
+use crate::sync::*;
+use crate::hash::{
+	entry::*,
+	bucket::{BucketArray, Bucket, BucketIdx}
+};
+
+/// Metadata tag for the type of an entry in the hashmap.
+#[derive(PartialEq, Eq, Clone, Copy)]
+pub(crate) enum EntryTag {
+	/// An occupied entry inserted after a resize operation.
+	Occupied,
+	/// An occupied entry inserted before a resize operation
+	/// a.k.a. an entry that needs to be rehashed at some point.
+	Rehash,
+	/// An entry that was once `Occupied`.
+	Tombstone,
+	/// An entry that was once `Rehash` (or a `Tombstone` that predates the latest rehash start).
+	RehashTombstone,
+	/// An empty entry.
+	Empty,
+}
+
+/// Searching the chains of a hashmap oftentimes requires interpreting
+/// a set of metadata tags differently. This enum encodes the ways a
+/// metadata tag can be treated during a lookup.
+pub(crate) enum MapEntryType {
+	/// Should be treated as if it were occupied (its key is compared against the probe key).
+	Occupied,
+	/// Should be treated as if it were a tombstone (probing continues; may be reused for insertion).
+	Tombstone,
+	/// Should be treated as if it were empty (ends the probe).
+	Empty,
+	/// Should be ignored (probing continues; never reused for insertion).
+	Skip
+}
+
+/// A key within the dictionary component of the hashmap.
+pub(crate) struct EntryKey<K> {
+	// NOTE: This could be split out to save 3 bytes per entry!
+	// Wasn't sure it was worth the penalty of another shmem area.
+	pub(crate) tag: EntryTag,
+	pub(crate) val: MaybeUninit<K>, // read only for slots whose tag is treated as occupied
+}
+
+/// A shard of the dictionary: parallel `keys` / `idxs` slices, indexed by position within the shard.
+pub(crate) struct DictShard<'a, K> {
+	pub(crate) keys: &'a mut [EntryKey<K>],
+	pub(crate) idxs: &'a mut [BucketIdx], // bucket index for the key in the same slot (`INVALID` if none)
+}
+
+impl<'a, K> DictShard<'a, K> {
+	fn len(&self) -> usize { // number of key slots in this shard
+		self.keys.len()
+	}
+}
+
+ pub(crate) struct MaybeUninitDictShard<'a, K> { // a `DictShard` before `CoreHashMap::new` has initialized it
+	pub(crate) keys: &'a mut [MaybeUninit<EntryKey<K>>],
+	pub(crate) idxs: &'a mut [MaybeUninit<BucketIdx>],
+}
+
+/// Core hash table implementation.
+pub(crate) struct CoreHashMap<'a, K, V> {
+	/// Dictionary used to map hashes to bucket indices.
+    pub(crate) dict_shards: &'a mut [RwLock<DictShard<'a, K>>],
+	/// Stable bucket array used to store the values.
+	pub(crate) bucket_arr: BucketArray<'a, V>,
+	/// Index of the next entry to process for rehashing (equal to `rehash_end` when idle).
+	pub(crate) rehash_index: AtomicUsize,
+	/// Index of the end of the range to be rehashed.
+	pub(crate) rehash_end: AtomicUsize,
+}
+
+/// Error for when a key needs a new slot but the lookup found no empty or reusable one.
+#[derive(Debug, PartialEq)]
+pub struct FullError();
+
+impl<'a, K: Clone + Hash + Eq, V> CoreHashMap<'a, K, V> {
+    pub fn new(
+        buckets_cell: &'a UnsafeCell<[MaybeUninit<Bucket<V>>]>,
+        dict_shards: &'a mut [RwLock<MaybeUninitDictShard<'a, K>>],
+    ) -> Self {
+		let buckets = unsafe { &mut *buckets_cell.get() };
+        // Initialize the buckets as one free list: 0 -> 1 -> ... -> last -> INVALID
+		for i in 0..buckets.len() {
+			buckets[i].write(Bucket::empty(
+				if i < buckets.len() - 1 {
+					BucketIdx::new(i + 1)
+				} else {
+					BucketIdx::INVALID
+				})
+			);
+        }
+
+        // Initialize the dictionary: every slot empty, every index invalid
+		for shard in dict_shards.iter_mut() {
+			let mut dicts = shard.write();
+			for e in dicts.keys.iter_mut() {
+				e.write(EntryKey {
+					tag: EntryTag::Empty,
+					val: MaybeUninit::uninit(),
+				});
+			}
+			for e in dicts.idxs.iter_mut() {
+				e.write(BucketIdx::INVALID);
+			}
+		}
+		// Everything is initialized now, so reinterpret both areas as their initialized types.
+        let buckets_cell = unsafe {
+			&*(buckets_cell as *const _ as *const UnsafeCell<_>)
+		};
+        // TODO: use std::slice::assume_init_mut() once it stabilizes
+        let dict_shards = unsafe {
+            std::slice::from_raw_parts_mut(dict_shards.as_mut_ptr().cast(),
+										   dict_shards.len())
+        };
+
+        Self {
+            dict_shards,
+			rehash_index: buckets.len().into(), // index == end: no rehash in progress
+			rehash_end: buckets.len().into(),
+			bucket_arr: BucketArray::new(buckets_cell),
+        }
+    }
+
+    /// Get the value associated with a key (if it exists) given its hash.
+    ///
+    /// While a rehash is in progress, a miss in the current chains falls back to the old chains.
+	pub fn get_with_hash(&'a self, key: &K, hash: u64) -> Option<ValueReadGuard<'a, V>> {
+		let cursor = self.rehash_index.load(Ordering::Relaxed);
+		let limit = self.rehash_end.load(Ordering::Relaxed);
+		let rehashing = cursor < limit;
+
+		// Current layout first: to-be-rehashed entries only act as tombstones
+		// within a current chain.
+		let hit = self.get(key, hash, |tag| match tag {
+			EntryTag::Empty => MapEntryType::Empty,
+			EntryTag::Occupied => MapEntryType::Occupied,
+			_ => MapEntryType::Tombstone,
+		});
+		if hit.is_some() || !rehashing {
+			return hit;
+		}
+
+		// Rehash in progress: search the chains from the previous size of the map,
+		// ignoring any entries inserted since the resize operation occurred.
+		self.get(key, hash, |tag| match tag {
+			EntryTag::Empty => MapEntryType::Empty,
+			EntryTag::Rehash => MapEntryType::Occupied,
+			_ => MapEntryType::Tombstone,
+		})
+
+	}
+	
+	pub fn entry_with_hash(&'a mut self, key: K, hash: u64) -> Result<Entry<'a, K, V>, FullError> {
+		let ind = self.rehash_index.load(Ordering::Relaxed);
+		let end = self.rehash_end.load(Ordering::Relaxed);
+
+		// Search the current chains first. We can't treat old (`Rehash`) entries as tombstones here,
+		// as we definitely can't insert over them! Instead we can just skip directly over them.
+		let res = self.entry(key.clone(), hash, |tag| match tag {
+			EntryTag::Empty => MapEntryType::Empty,
+			EntryTag::Occupied => MapEntryType::Occupied,
+			EntryTag::Rehash => MapEntryType::Skip,
+			_ => MapEntryType::Tombstone,
+		});
+		if ind >= end || matches!(res, Ok(Entry::Occupied(_))) {
+			return res;
+		}
+
+		// `res` may be a vacant entry that still holds shard write locks. The second lookup locks the
+		// same shards, so keeping `res` alive would self-deadlock on a non-reentrant lock.
+		drop(res);
+		// Rehash in progress: look for the key among the pre-resize (`Rehash`) entries instead.
+		self.entry(key, hash, |tag| match tag {
+			EntryTag::Empty => MapEntryType::Empty,
+			EntryTag::Occupied => MapEntryType::Skip,
+			EntryTag::Rehash => MapEntryType::Occupied,
+			_ => MapEntryType::Tombstone
+		})
+	}
+	
+    fn get<F>(&'a self, key: &K, hash: u64, f: F) -> Option<ValueReadGuard<'a, V>>
+	    where F: Fn(EntryTag) -> MapEntryType
+	{ // Probe linearly from the slot `hash` maps to; `f` decides how each slot's tag is interpreted.
+		let num_buckets = self.get_num_buckets();
+		let shard_size = num_buckets / self.dict_shards.len();
+		let bucket_pos = hash as usize % num_buckets;
+		let shard_start = bucket_pos / shard_size;
+		for off in 0..self.dict_shards.len() {
+			let shard_idx = (shard_start + off) % self.dict_shards.len();
+			let shard = self.dict_shards[shard_idx].read(); // one shard read lock at a time
+			let entry_start = if off == 0 { bucket_pos % shard_size } else { 0 };
+			for entry_idx in entry_start..shard.len() {
+				match f(shard.keys[entry_idx].tag) {
+					MapEntryType::Empty => return None, // end of the chain: key is absent
+					MapEntryType::Tombstone | MapEntryType::Skip => continue, 
+					MapEntryType::Occupied => {
+						let cand_key = unsafe { shard.keys[entry_idx].val.assume_init_ref() };
+						if cand_key == key {
+							let bucket_idx = shard.idxs[entry_idx].next_checked()
+								.expect("position is valid");
+							return Some(RwLockReadGuard::map( // the shard read lock stays held by the returned guard
+								shard, |_| self.bucket_arr[bucket_idx].as_ref()
+							));
+						} 
+					},
+				}
+			}
+		}
+		None
+	}
+
+	
+    pub fn entry<F>(&'a self, key: K, hash: u64, f: F) -> Result<Entry<'a, K, V>, FullError>
+	    where F: Fn(EntryTag) -> MapEntryType
+	{
+		// We need to keep holding on the locks for each shard we process since if we don't find the
+		// key anywhere, we want to insert it at the earliest possible position (which may be several
+		// shards away). Ideally cross-shard chains are quite rare, so this shouldn't be a big deal.
+		//
+		// NB: Somewhat real chance of a deadlock! E.g. one thread has a ridiculously long chain that
+		// starts at block N and wraps around the hashmap to N-1, yet another thread begins a lookup at
+		// N-1 during this and has a chain that lasts a few shards. Then thread 1 is blocked on thread 2
+		// to get to shard N-1 but thread 2 is blocked on thread 1 to get to shard N. Pretty fringe case
+		// since chains shouldn't last very long, but still a problem with this somewhat naive sharding
+		// mechanism.
+		//
+		// We could fix this by either refusing to hold locks and only inserting into the earliest entry
+		// within the current shard (which effectively means after a while we forget about certain open
+		// entries at the end of shards) or by pivoting to a more involved concurrency setup?
+		let mut shards = Vec::new();
+		let mut insert_pos = None;
+		let mut insert_shard = None;
+
+		let num_buckets = self.get_num_buckets();
+		let shard_size = num_buckets / self.dict_shards.len();
+		let entry_pos = hash as usize % num_buckets;
+		let shard_start = entry_pos / shard_size;
+		for off in 0..self.dict_shards.len() {
+			let shard_idx = (shard_start + off) % self.dict_shards.len();			
+			let shard = self.dict_shards[shard_idx].write();
+			let mut inserted = false;
+			let entry_start = if off == 0 { entry_pos % shard_size } else { 0 };
+			for entry_idx in entry_start..shard.len() {
+				// A slot's logical position is `shard_size * shard_idx + entry_idx` (also after wrap-around).
+				match f(shard.keys[entry_idx].tag) {
+					MapEntryType::Skip => continue,
+					MapEntryType::Empty => {
+						let ((shard, idx), shard_pos) = match (insert_shard, insert_pos) {
+							(Some((s, i)), Some(p)) => ((s, i), p),
+							(None, Some(p)) => ((shard, shard_idx), p),
+							(None, None) => ((shard, shard_idx), entry_idx),
+							_ => unreachable!()
+						};
+						return Ok(Entry::Vacant(VacantEntry {
+							_key: key,
+							shard,
+							shard_pos,
+							key_pos: (shard_size * idx) + shard_pos,
+							bucket_arr: &self.bucket_arr,
+						}))
+					},
+					MapEntryType::Tombstone => {
+						if insert_pos.is_none() {
+							insert_pos = Some(entry_idx);
+							inserted = true;
+						}
+					},
+					MapEntryType::Occupied => {
+						let cand_key = unsafe { shard.keys[entry_idx].val.assume_init_ref() };
+						if *cand_key == key {
+							let bucket_pos = shard.idxs[entry_idx].next_checked().unwrap();
+							return Ok(Entry::Occupied(OccupiedEntry {
+								shard,
+								shard_pos: entry_idx,
+								bucket_pos,
+								bucket_arr: &self.bucket_arr,
+								key_pos: (shard_size * shard_idx) + entry_idx,
+							}));
+						}	
+					}
+				} 
+			}
+			if inserted {
+				insert_shard = Some((shard, shard_idx));
+			} else {
+				shards.push(shard);
+			}
+		}
+		
+		if let (Some((shard, idx)), Some(shard_pos)) = (insert_shard, insert_pos) {
+			Ok(Entry::Vacant(VacantEntry {
+				_key: key,
+				shard,
+				shard_pos,
+				key_pos: (shard_size * idx) + shard_pos,
+				bucket_arr: &self.bucket_arr,
+			}))
+		} else {
+			Err(FullError{})
+		}
+	}
+	
+    /// Get number of buckets in map (the length of the bucket array).
+    pub fn get_num_buckets(&self) -> usize {
+        self.bucket_arr.len()
+    }
+
+    /// Write-lock every shard, mark all dictionary slots empty, then reset the bucket array.
+    pub fn clear(&mut self) {
+		// All shard locks are taken up front and held until the function returns.
+		let mut guards: Vec<_> = self.dict_shards.iter().map(|lock| lock.write()).collect();
+
+        for guard in &mut guards {
+			// Only the tags are reset; stored keys are left in place.
+			guard.keys.iter_mut().for_each(|slot| slot.tag = EntryTag::Empty);
+			guard.idxs.iter_mut().for_each(|slot| *slot = BucketIdx::INVALID);
+		}
+
+        self.bucket_arr.clear();
+    }
+}
+ 
--- a/libs/neon-shmem/src/hash/entry.rs
+++ b/libs/neon-shmem/src/hash/entry.rs
@@ -0,0 +1,81 @@
+//! Equivalent of [`std::collections::hash_map::Entry`] for this hashmap.
+
+use crate::hash::{
+	core::{DictShard, EntryTag},
+	bucket::{BucketArray, BucketIdx}
+};
+use crate::sync::{RwLockWriteGuard, ValueWriteGuard};
+
+use std::hash::Hash;
+/// Result of looking up a key: either the slot that already holds it or the slot where it can be inserted.
+pub enum Entry<'a, K, V> {
+    Occupied(OccupiedEntry<'a, K, V>),
+    Vacant(VacantEntry<'a, K, V>),
+}
+/// A view into a slot of the map that already holds the looked-up key.
+pub struct OccupiedEntry<'a, K, V> {
+    /// Write guard on the shard that contains this entry's key slot.
+    pub(crate) shard: RwLockWriteGuard<'a, DictShard<'a, K>>,
+    /// Index of the entry's slot within the shard's `keys` / `idxs`.
+    pub(crate) shard_pos: usize,
+	/// True logical position of the entry in the map.
+	pub(crate) key_pos: usize,
+	/// Shared reference to the bucket array that stores the entry's value.
+	pub(crate) bucket_arr: &'a BucketArray<'a, V>,
+    /// Index of the entry's bucket within `bucket_arr`.
+    pub(crate) bucket_pos: usize,
+}
+
+impl<K, V> OccupiedEntry<'_, K, V> {
+    pub fn get(&self) -> &V {
+		self.bucket_arr[self.bucket_pos].as_ref()
+    }
+
+    pub fn get_mut(&mut self) -> &mut V {
+		self.bucket_arr.get_mut(self.bucket_pos).as_mut()
+    }
+
+    /// Replaces the value stored in the entry's bucket with `value`, returning the previous value.
+    pub fn insert(&mut self, value: V) -> V {
+        self.bucket_arr.get_mut(self.bucket_pos).replace(value)
+    }
+    /// Removes the entry: the key slot is tagged `Tombstone`, its index is invalidated and the bucket is deallocated; returns the value.
+    /// NOTE(review): takes `&mut self`, so the entry stays usable afterwards; a second call would dealloc the same bucket again.
+    pub fn remove(&mut self) -> V {
+		self.shard.idxs[self.shard_pos] = BucketIdx::INVALID;
+		self.shard.keys[self.shard_pos].tag = EntryTag::Tombstone;
+        self.bucket_arr.dealloc_bucket(self.bucket_pos)
+    }
+}
+
+/// A view into a free slot of the map where the looked-up key can be inserted.
+pub struct VacantEntry<'a, K, V> {
+    /// The key that `insert` will store in the slot.
+    pub(crate) _key: K,
+    /// Write guard on the shard that contains the free slot.
+    pub(crate) shard: RwLockWriteGuard<'a, DictShard<'a, K>>,
+    /// Index of the free slot within the shard's `keys` / `idxs`.
+    pub(crate) shard_pos: usize,
+	/// True logical position of the entry in the map; passed to `alloc_bucket` on insert.
+	pub(crate) key_pos: usize,
+	/// Shared reference to the bucket array the value will be allocated in.
+	pub(crate) bucket_arr: &'a BucketArray<'a, V>,
+}
+
+impl<'a, K: Clone + Hash + Eq, V> VacantEntry<'a, K, V> {
+    /// Allocates a bucket for `value`, tags the key slot occupied, stores the key and the bucket index, and returns a write guard to the value. Panics if `alloc_bucket` fails.
+    pub fn insert(mut self, value: V) -> ValueWriteGuard<'a, V> {
+		let pos = self.bucket_arr.alloc_bucket(value, self.key_pos)
+			.expect("bucket is available if entry is");
+		self.shard.keys[self.shard_pos].tag = EntryTag::Occupied;
+		self.shard.keys[self.shard_pos].val.write(self._key);
+		let idx = pos.next_checked().expect("position is valid");
+		self.shard.idxs[self.shard_pos] = pos;
+        // Turn the shard write guard into a guard that exposes the freshly stored value.
+        RwLockWriteGuard::map(self.shard, |_| {
+            self.bucket_arr.get_mut(idx).as_mut()
+        })
+    }
+}
+	
+
--- a/libs/neon-shmem/src/hash/tests.rs
+++ b/libs/neon-shmem/src/hash/tests.rs
@@ -0,0 +1,428 @@
+use std::collections::BTreeMap;
+use std::collections::HashSet;
+use std::fmt::Debug;
+use std::mem::MaybeUninit;
+
+use crate::hash::Entry;
+use crate::hash::HashMapAccess;
+use crate::hash::HashMapInit;
+use crate::hash::core::FullError;
+
+use rand::seq::SliceRandom;
+use rand::{Rng, RngCore};
+use rand_distr::Zipf;
+
+/// Width of a test key in bytes (the size of a `u128`).
+const TEST_KEY_LEN: usize = 16;
+/// Fixed-width key used by the tests; convertible to and from a big-endian `u128`.
+#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+struct TestKey([u8; TEST_KEY_LEN]);
+impl From<&TestKey> for u128 {
+    fn from(key: &TestKey) -> Self {
+        let TestKey(raw) = *key;
+        Self::from_be_bytes(raw)
+    }
+}
+impl From<u128> for TestKey {
+    fn from(number: u128) -> Self {
+        Self(number.to_be_bytes())
+    }
+}
+impl From<&[u8]> for TestKey {
+    /// Panics unless `bytes` is exactly `TEST_KEY_LEN` long.
+    fn from(bytes: &[u8]) -> Self {
+        Self(<[u8; TEST_KEY_LEN]>::try_from(bytes).unwrap())
+    }
+}
+
+/// Inserts every key with its slice index as the value (overwriting on repeats),
+/// then checks that each key reads back its index.
+fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
+    let init = HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, 100, "test_inserts");
+    let w = init.attach_writer();
+
+    // Write phase.
+    for (idx, k) in keys.iter().copied().enumerate() {
+        match w.entry(k.into()).unwrap() {
+            Entry::Occupied(mut occupied) => {
+                occupied.insert(idx);
+            }
+            Entry::Vacant(vacant) => drop(vacant.insert(idx)),
+        }
+    }
+
+    // Read-back phase.
+    for (idx, k) in keys.iter().copied().enumerate() {
+        let found = w.get(&k.into());
+        assert_eq!(found.as_deref().copied(), Some(idx));
+    }
+}
+
+#[test]
+fn dense() {
+    // Smoke test: a few consecutive small keys plus one key further away
+    let keys: &[u128] = &[0, 1, 2, 3, 256];
+    test_inserts(keys);
+
+    // Dense keys
+    let mut keys: Vec<u128> = (0..10000).collect();
+    test_inserts(&keys);
+
+    // Insert the same keys again in nine different random orders (each call builds a fresh map)
+    for _ in 1..10 {
+        keys.shuffle(&mut rand::rng());
+        test_inserts(&keys);
+    }
+}
+
+/// Inserts and reads back 10000 distinct keys drawn at random from the whole
+/// `u128` range.
+#[test]
+fn sparse() {
+    // Collect the keys first, filtering duplicates through a set.
+    let mut seen = HashSet::new();
+    let mut keys: Vec<TestKey> = Vec::new();
+    while keys.len() < 10000 {
+        let candidate = rand::random::<u128>();
+        // `HashSet::insert` returns `false` for a value that is already
+        // present; such repeats are skipped and another candidate is drawn.
+        if seen.insert(candidate) {
+            keys.push(candidate.into());
+        }
+    }
+
+    test_inserts(&keys);
+}
+/// A single map operation on key `.0`: `Some(v)` stores `v`, `None` removes the key.
+#[derive(Clone, Debug)]
+struct TestOp(TestKey, Option<usize>);
+
+/// Applies `op` to both the shadow `BTreeMap` and the hash map under test.
+///
+/// Panics if the two maps disagree about the value previously stored for the key.
+fn apply_op(
+    op: &TestOp,
+    map: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    // The shadow map goes first; it yields the expected previous value.
+    let shadow_existing = match op.1 {
+        Some(v) => shadow.insert(op.0, v),
+        None => shadow.remove(&op.0),
+    };
+
+    // Then the same operation on the hash map: `Some` stores (insert or
+    // overwrite), `None` removes the key if it is present.
+    let hash_existing = match (op.1, map.entry(op.0).unwrap()) {
+        (Some(new), Entry::Occupied(mut e)) => Some(e.insert(new)),
+        (Some(new), Entry::Vacant(e)) => {
+            _ = e.insert(new);
+            None
+        }
+        (None, Entry::Occupied(mut e)) => Some(e.remove()),
+        (None, Entry::Vacant(_)) => None,
+    };
+
+    // Both maps must report the same previous value.
+    assert_eq!(shadow_existing, hash_existing);
+}
+
+/// Applies `num_ops` random operations on keys drawn from `0..size` to both maps.
+///
+/// NOTE(review): despite its name, `del_prob` is the probability that an operation
+/// is a *store* (`Some(i)`); a removal happens with probability `1 - del_prob`.
+fn do_random_ops(
+    num_ops: usize,
+    size: u32,
+    del_prob: f64,
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+    rng: &mut rand::rngs::ThreadRng,
+) {
+    for i in 0..num_ops {
+        // Draw the key first, then decide between store and removal, so the
+        // random draws happen in a fixed order.
+        let raw_key = rng.next_u32() % size;
+        let key: TestKey = (raw_key as u128).into();
+        let new_value = rng.random_bool(del_prob).then_some(i);
+        apply_op(&TestOp(key, new_value), writer, shadow);
+    }
+}
+
+/// Removes the `num_ops` smallest keys of `shadow` from both maps; panics if `shadow` runs out of entries.
+fn do_deletes(
+    num_ops: usize,
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    (0..num_ops).for_each(|_| {
+        writer.remove(&shadow.pop_first().unwrap().0);
+    });
+}
+
+/// Shrinks the map to `to` buckets, removing the smallest shadow keys until at most `to` buckets are in use.
+fn do_shrink(
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+    to: usize,
+) {
+    assert!(writer.shrink_goal().is_none());
+    writer.begin_shrink(to);
+    assert_eq!(writer.shrink_goal(), Some(to));
+    while writer.get_num_buckets_in_use() > to {
+        match writer.entry(shadow.pop_first().unwrap().0).unwrap() {
+            Entry::Occupied(mut e) => drop(e.remove()),
+            Entry::Vacant(_) => {}
+        }
+    }
+    let in_use_before = writer.get_num_buckets_in_use();
+    writer.finish_shrink().unwrap();
+    assert!(writer.shrink_goal().is_none());
+    assert_eq!(writer.get_num_buckets_in_use(), in_use_before);
+}
+
+/// Runs 100000 operations on Zipf-distributed keys against the map and a shadow
+/// `BTreeMap`; each operation stores with probability 0.75 and removes otherwise.
+#[test]
+fn random_ops() {
+    let init = HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, 10, "test_random");
+    let mut writer = init.attach_writer();
+    let mut shadow = BTreeMap::<TestKey, usize>::new();
+
+    let zipf = Zipf::new(u128::MAX as f64, 1.1).unwrap();
+    let mut rng = rand::rng();
+    for i in 0..100000 {
+        // Key first, then the store/remove coin flip.
+        let sampled = rng.sample(zipf) as u128;
+        let new_value = rng.random_bool(0.75).then_some(i);
+        apply_op(&TestOp(sampled.into(), new_value), &mut writer, &mut shadow);
+    }
+}
+
+// #[test]
+// fn test_shuffle() {
+//     let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, 10, "test_shuf")
+//         .attach_writer();
+//     let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+//     let mut rng = rand::rng();
+
+//     do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
+//     writer.shuffle();
+//     do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
+// }
+
+// #[test]
+// fn test_grow() {
+//     let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 2000, 10, "test_grow")
+//         .attach_writer();
+//     let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+//     let mut rng = rand::rng();
+
+//     do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
+//     let old_usage = writer.get_num_buckets_in_use();
+//     writer.grow(1500).unwrap();
+//     assert_eq!(writer.get_num_buckets_in_use(), old_usage);
+//     assert_eq!(writer.get_num_buckets(), 1500);
+//     do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+// }
+/// NOTE(review): the whole body below is commented out, so this test currently only constructs the map.
+#[test]
+fn test_clear() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, 10, "test_clear")
+        .attach_writer();
+    // let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    // let mut rng = rand::rng();
+    // do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+    // writer.clear();
+    // assert_eq!(writer.get_num_buckets_in_use(), 0);
+    // assert_eq!(writer.get_num_buckets(), 1500);
+    // while let Some((key, _)) = shadow.pop_first() {
+    //     assert!(writer.get(&key).is_none());
+    // }
+    // do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+    // for i in 0..(1500 - writer.get_num_buckets_in_use()) {
+    //     writer.insert((1500 + i as u128).into(), 0).unwrap();
+    // }
+    // assert_eq!(writer.insert(5000.into(), 0), Err(FullError {}));
+    // writer.clear();
+    // assert!(writer.insert(5000.into(), 0).is_ok());
+}
+
+// #[test]
+// fn test_idx_remove() {
+//     let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, 10, "test_clear")
+//         .attach_writer();
+//     let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+//     let mut rng = rand::rng();
+//     do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
+//     for _ in 0..100 {
+//         let idx = (rng.next_u32() % 1500) as usize;
+//         if let Some(e) = writer.entry_at_bucket(idx) {
+//             shadow.remove(&e._key);
+//             e.remove();
+//         }
+//     }
+//     while let Some((key, val)) = shadow.pop_first() {
+//         assert_eq!(*writer.get(&key).unwrap(), val);
+//     }
+// }
+
+// #[test]
+// fn test_idx_get() {
+//     let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_clear")
+//         .attach_writer();
+//     let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+//     let mut rng = rand::rng();
+//     do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
+//     for _ in 0..100 {
+//         let idx = (rng.next_u32() % 1500) as usize;
+//         if let Some(pair) = writer.get_at_bucket(idx) {
+//             {
+//                 let v: *const usize = &pair.1;
+//                 assert_eq!(writer.get_bucket_for_value(v), idx);
+//             }
+//             {
+//                 let v: *const usize = &pair.1;
+//                 assert_eq!(writer.get_bucket_for_value(v), idx);
+//             }
+//         }
+//     }
+// }
+
+// #[test]
+// fn test_shrink() {
+//     let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink")
+//         .attach_writer();
+//     let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+//     let mut rng = rand::rng();
+
+//     do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+//     do_shrink(&mut writer, &mut shadow, 1000);
+//     assert_eq!(writer.get_num_buckets(), 1000);
+//     do_deletes(500, &mut writer, &mut shadow);
+//     do_random_ops(10000, 500, 0.75, &mut writer, &mut shadow, &mut rng);
+//     assert!(writer.get_num_buckets_in_use() <= 1000);
+// }
+
+// #[test]
+// fn test_shrink_grow_seq() {
+//     let mut writer =
+//         HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 20000, "test_grow_seq")
+//             .attach_writer();
+//     let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+//     let mut rng = rand::rng();
+
+//     do_random_ops(500, 1000, 0.1, &mut writer, &mut shadow, &mut rng);
+//     eprintln!("Shrinking to 750");
+//     do_shrink(&mut writer, &mut shadow, 750);
+//     do_random_ops(200, 1000, 0.5, &mut writer, &mut shadow, &mut rng);
+//     eprintln!("Growing to 1500");
+//     writer.grow(1500).unwrap();
+//     do_random_ops(600, 1500, 0.1, &mut writer, &mut shadow, &mut rng);
+//     eprintln!("Shrinking to 200");
+//     while shadow.len() > 100 {
+//         do_deletes(1, &mut writer, &mut shadow);
+//     }
+//     do_shrink(&mut writer, &mut shadow, 200);
+//     do_random_ops(50, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
+//     eprintln!("Growing to 10k");
+//     writer.grow(10000).unwrap();
+//     do_random_ops(10000, 5000, 0.25, &mut writer, &mut shadow, &mut rng);
+// }
+
+/// Stores one key, then checks bucket accounting, lookup by key, lookup by bucket position, and removal.
+#[test]
+fn test_bucket_ops() {
+    let writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, 10, "test_bucket_ops")
+        .attach_writer();
+    match writer.entry(1.into()).unwrap() {
+        Entry::Occupied(mut e) => {
+            e.insert(2);
+        }
+        Entry::Vacant(e) => {
+            _ = e.insert(2);
+        }
+    }
+    assert_eq!(writer.get_num_buckets_in_use(), 1);
+    assert_eq!(writer.get_num_buckets(), 1000);
+    assert_eq!(*writer.get(&1.into()).unwrap(), 2);
+    let pos = match writer.entry(1.into()).unwrap() {
+        Entry::Occupied(e) => {
+            // `OccupiedEntry` has no `_key` field; check the stored value instead.
+            assert_eq!(*e.get(), 2);
+            e.bucket_pos
+        }
+        Entry::Vacant(_) => {
+            panic!("Insert didn't affect entry");
+        }
+    };
+    assert_eq!(unsafe { writer.get_at_bucket(pos).unwrap() }, &2);
+    {
+        let ptr: *const usize = &*writer.get(&1.into()).unwrap();
+        assert_eq!(writer.get_bucket_for_value(ptr), pos);
+    }
+    writer.remove(&1.into());
+    assert!(writer.get(&1.into()).is_none());
+}
+
+// #[test]
+// fn test_shrink_zero() {
+//     let mut writer =
+//         HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink_zero")
+//             .attach_writer();
+//     writer.begin_shrink(0);
+//     for i in 0..1500 {
+//         writer.entry_at_bucket(i).map(|x| x.remove());
+//     }
+//     writer.finish_shrink().unwrap();
+//     assert_eq!(writer.get_num_buckets_in_use(), 0);
+//     let entry = writer.entry(1.into());
+//     if let Entry::Vacant(v) = entry {
+//         assert!(v.insert(2).is_err());
+//     } else {
+//         panic!("Somehow got non-vacant entry in empty map.")
+//     }
+//     writer.grow(50).unwrap();
+//     let entry = writer.entry(1.into());
+//     if let Entry::Vacant(v) = entry {
+//         assert!(v.insert(2).is_ok());
+//     } else {
+//         panic!("Somehow got non-vacant entry in empty map.")
+//     }
+//     assert_eq!(writer.get_num_buckets_in_use(), 1);
+// }
+
+// #[test]
+// #[should_panic]
+// fn test_grow_oom() {
+//     let writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_grow_oom")
+//         .attach_writer();
+//     writer.grow(20000).unwrap();
+// }
+
+// #[test]
+// #[should_panic]
+// fn test_shrink_bigger() {
+//     let mut writer =
+//         HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2500, "test_shrink_bigger")
+//             .attach_writer();
+//     writer.begin_shrink(2000);
+// }
+
+// #[test]
+// #[should_panic]
+// fn test_shrink_early_finish() {
+//     let writer =
+//         HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2500, "test_shrink_early_finish")
+//             .attach_writer();
+//     writer.finish_shrink().unwrap();
+// }
+
+// #[test]
+// #[should_panic]
+// fn test_shrink_fixed_size() {
+//     let mut area = [MaybeUninit::uninit(); 10000];
+//     let init_struct = HashMapInit::<TestKey, usize>::with_fixed(3, &mut area);
+//     let mut writer = init_struct.attach_writer();
+//     writer.begin_shrink(1);
+// }
--- a/libs/neon-shmem/src/lib.rs
+++ b/libs/neon-shmem/src/lib.rs
@@ -1,418 +1,5 @@
 //! Shared memory utilities for neon communicator

-use std::num::NonZeroUsize;
-use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
-use std::ptr::NonNull;
-use std::sync::atomic::{AtomicUsize, Ordering};
-
-use nix::errno::Errno;
-use nix::sys::mman::MapFlags;
-use nix::sys::mman::ProtFlags;
-use nix::sys::mman::mmap as nix_mmap;
-use nix::sys::mman::munmap as nix_munmap;
-use nix::unistd::ftruncate as nix_ftruncate;
-
-/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
-/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
-/// specified at creation.
-///
-/// The area is backed by an anonymous file created with memfd_create(). The full address space for
-/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
-/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
-/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
-/// future.
-pub struct ShmemHandle {
-    /// memfd file descriptor
-    fd: OwnedFd,
-
-    max_size: usize,
-
-    // Pointer to the beginning of the shared memory area. The header is stored there.
-    shared_ptr: NonNull<SharedStruct>,
-
-    // Pointer to the beginning of the user data
-    pub data_ptr: NonNull<u8>,
-}
-
-/// This is stored at the beginning in the shared memory area.
-struct SharedStruct {
-    max_size: usize,
-
-    /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
-    current_size: AtomicUsize,
-}
-
-const RESIZE_IN_PROGRESS: usize = 1 << 63;
-
-const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
-
-/// Error type returned by the ShmemHandle functions.
-#[derive(thiserror::Error, Debug)]
-#[error("{msg}: {errno}")]
-pub struct Error {
-    pub msg: String,
-    pub errno: Errno,
-}
-
-impl Error {
-    fn new(msg: &str, errno: Errno) -> Error {
-        Error {
-            msg: msg.to_string(),
-            errno,
-        }
-    }
-}
-
-impl ShmemHandle {
-    /// Create a new shared memory area. To communicate between processes, the processes need to be
-    /// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
-    ///
-    /// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
-    /// processes can continue using it, however.
-    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
-        // create the backing anonymous file.
-        let fd = create_backing_file(name)?;
-
-        Self::new_with_fd(fd, initial_size, max_size)
-    }
-
-    fn new_with_fd(
-        fd: OwnedFd,
-        initial_size: usize,
-        max_size: usize,
-    ) -> Result<ShmemHandle, Error> {
-        // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
-        // is a little larger than this because of the SharedStruct header. Make the upper limit
-        // somewhat smaller than that, because with anything close to that, you'll run out of
-        // memory anyway.
-        if max_size >= 1 << 48 {
-            panic!("max size {max_size} too large");
-        }
-        if initial_size > max_size {
-            panic!("initial size {initial_size} larger than max size {max_size}");
-        }
-
-        // The actual initial / max size is the one given by the caller, plus the size of
-        // 'SharedStruct'.
-        let initial_size = HEADER_SIZE + initial_size;
-        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
-
-        // Reserve address space for it with mmap
-        //
-        // TODO: Use MAP_HUGETLB if possible
-        let start_ptr = unsafe {
-            nix_mmap(
-                None,
-                max_size,
-                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
-                MapFlags::MAP_SHARED,
-                &fd,
-                0,
-            )
-        }
-        .map_err(|e| Error::new("mmap failed: {e}", e))?;
-
-        // Reserve space for the initial size
-        enlarge_file(fd.as_fd(), initial_size as u64)?;
-
-        // Initialize the header
-        let shared: NonNull<SharedStruct> = start_ptr.cast();
-        unsafe {
-            shared.write(SharedStruct {
-                max_size: max_size.into(),
-                current_size: AtomicUsize::new(initial_size),
-            })
-        };
-
-        // The user data begins after the header
-        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
-
-        Ok(ShmemHandle {
-            fd,
-            max_size: max_size.into(),
-            shared_ptr: shared,
-            data_ptr,
-        })
-    }
-
-    // return reference to the header
-    fn shared(&self) -> &SharedStruct {
-        unsafe { self.shared_ptr.as_ref() }
-    }
-
-    /// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
-    /// when creating the area.
-    ///
-    /// This may only be called from one process/thread concurrently. We detect that case
-    /// and return an Error.
-    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
-        let new_size = new_size + HEADER_SIZE;
-        let shared = self.shared();
-
-        if new_size > self.max_size {
-            panic!(
-                "new size ({} is greater than max size ({})",
-                new_size, self.max_size
-            );
-        }
-        assert_eq!(self.max_size, shared.max_size);
-
-        // Lock the area by setting the bit in 'current_size'
-        //
-        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
-        // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
-        // since this is not performance-critical, better safe than sorry .
-        let mut old_size = shared.current_size.load(Ordering::Acquire);
-        loop {
-            if (old_size & RESIZE_IN_PROGRESS) != 0 {
-                return Err(Error::new(
-                    "concurrent resize detected",
-                    Errno::UnknownErrno,
-                ));
-            }
-            match shared.current_size.compare_exchange(
-                old_size,
-                new_size,
-                Ordering::Acquire,
-                Ordering::Relaxed,
-            ) {
-                Ok(_) => break,
-                Err(x) => old_size = x,
-            }
-        }
-
-        // Ok, we got the lock.
-        //
-        // NB: If anything goes wrong, we *must* clear the bit!
-        let result = {
-            use std::cmp::Ordering::{Equal, Greater, Less};
-            match new_size.cmp(&old_size) {
-                Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
-                    Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
-                }),
-                Equal => Ok(()),
-                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
-            }
-        };
-
-        // Unlock
-        shared.current_size.store(
-            if result.is_ok() { new_size } else { old_size },
-            Ordering::Release,
-        );
-
-        result
-    }
-
-    /// Returns the current user-visible size of the shared memory segment.
-    ///
-    /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
-    /// responsibility not to access the area beyond the current size.
-    pub fn current_size(&self) -> usize {
-        let total_current_size =
-            self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
-        total_current_size - HEADER_SIZE
-    }
-}
-
-impl Drop for ShmemHandle {
-    fn drop(&mut self) {
-        // SAFETY: The pointer was obtained from mmap() with the given size.
-        // We unmap the entire region.
-        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
-        // The fd is dropped automatically by OwnedFd.
-    }
-}
-
-/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
-/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
-/// development and testing, but in production we want the file to stay in memory.
-///
-/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
-#[allow(unused_variables)]
-fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
-    #[cfg(not(target_os = "macos"))]
-    {
-        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
-            .map_err(|e| Error::new("memfd_create failed: {e}", e))
-    }
-    #[cfg(target_os = "macos")]
-    {
-        let file = tempfile::tempfile().map_err(|e| {
-            Error::new(
-                "could not create temporary file to back shmem area: {e}",
-                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
-            )
-        })?;
-        Ok(OwnedFd::from(file))
-    }
-}
-
-fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
-    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
-    // we don't get a segfault later when trying to actually use it.
-    #[cfg(not(target_os = "macos"))]
-    {
-        nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
-            Error::new(
-                "could not grow shmem segment, posix_fallocate failed: {e}",
-                e,
-            )
-        })
-    }
-    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
-    #[cfg(target_os = "macos")]
-    {
-        nix::unistd::ftruncate(fd, size as i64)
-            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    use nix::unistd::ForkResult;
-    use std::ops::Range;
-
-    /// check that all bytes in given range have the expected value.
-    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
-        for i in range {
-            let b = unsafe { *(ptr.add(i)) };
-            assert_eq!(expected, b, "unexpected byte at offset {i}");
-        }
-    }
-
-    /// Write 'b' to all bytes in the given range
-    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
-        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
-    }
-
-    // simple single-process test of growing and shrinking
-    #[test]
-    fn test_shmem_resize() -> Result<(), Error> {
-        let max_size = 1024 * 1024;
-        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
-
-        assert_eq!(init_struct.current_size(), 0);
-
-        // Initial grow
-        let size1 = 10000;
-        init_struct.set_size(size1).unwrap();
-        assert_eq!(init_struct.current_size(), size1);
-
-        // Write some data
-        let data_ptr = init_struct.data_ptr.as_ptr();
-        write_range(data_ptr, 0xAA, 0..size1);
-        assert_range(data_ptr, 0xAA, 0..size1);
-
-        // Shrink
-        let size2 = 5000;
-        init_struct.set_size(size2).unwrap();
-        assert_eq!(init_struct.current_size(), size2);
-
-        // Grow again
-        let size3 = 20000;
-        init_struct.set_size(size3).unwrap();
-        assert_eq!(init_struct.current_size(), size3);
-
-        // Try to read it. The area that was shrunk and grown again should read as all zeros now
-        assert_range(data_ptr, 0xAA, 0..5000);
-        assert_range(data_ptr, 0, 5000..size1);
-
-        // Try to grow beyond max_size
-        //let size4 = max_size + 1;
-        //assert!(init_struct.set_size(size4).is_err());
-
-        // Dropping init_struct should unmap the memory
-        drop(init_struct);
-
-        Ok(())
-    }
-
-    /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
-    /// but is stored in the shared memory area and works across processes. It's implemented by
-    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
-    struct SimpleBarrier {
-        num_procs: usize,
-        count: AtomicUsize,
-    }
-
-    impl SimpleBarrier {
-        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
-            unsafe {
-                *ptr = SimpleBarrier {
-                    num_procs,
-                    count: AtomicUsize::new(0),
-                }
-            }
-        }
-
-        pub fn wait(&self) {
-            let old = self.count.fetch_add(1, Ordering::Relaxed);
-
-            let generation = old / self.num_procs;
-
-            let mut current = old + 1;
-            while current < (generation + 1) * self.num_procs {
-                std::thread::sleep(std::time::Duration::from_millis(10));
-                current = self.count.load(Ordering::Relaxed);
-            }
-        }
-    }
-
-    #[test]
-    fn test_multi_process() {
-        // Initialize
-        let max_size = 1_000_000_000_000;
-        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
-        let ptr = init_struct.data_ptr.as_ptr();
-
-        // Store the SimpleBarrier in the first 1k of the area.
-        init_struct.set_size(10000).unwrap();
-        let barrier_ptr: *mut SimpleBarrier = unsafe {
-            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
-                .cast()
-        };
-        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
-        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
-
-        // Fork another test process. The code after this runs in both processes concurrently.
-        let fork_result = unsafe { nix::unistd::fork().unwrap() };
-
-        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
-        if fork_result.is_parent() {
-            write_range(ptr, 0xAA, 1000..2000);
-        } else {
-            write_range(ptr, 0xBB, 2000..3000);
-        }
-        barrier.wait();
-        // Verify the contents. (in both processes)
-        assert_range(ptr, 0xAA, 1000..2000);
-        assert_range(ptr, 0xBB, 2000..3000);
-
-        // Grow, from the child this time
-        let size = 10_000_000;
-        if !fork_result.is_parent() {
-            init_struct.set_size(size).unwrap();
-        }
-        barrier.wait();
-
-        // make some writes at the end
-        if fork_result.is_parent() {
-            write_range(ptr, 0xAA, (size - 10)..size);
-        } else {
-            write_range(ptr, 0xBB, (size - 20)..(size - 10));
-        }
-        barrier.wait();
-
-        // Verify the contents. (This runs in both processes)
-        assert_range(ptr, 0, (size - 1000)..(size - 20));
-        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
-        assert_range(ptr, 0xAA, (size - 10)..size);
-
-        if let ForkResult::Parent { child } = fork_result {
-            nix::sys::wait::waitpid(child, None).unwrap();
-        }
-    }
-}
+pub mod hash;
+pub mod shmem;
+pub mod sync;
--- a/libs/neon-shmem/src/shmem.rs
+++ b/libs/neon-shmem/src/shmem.rs
@@ -0,0 +1,409 @@
+//! Dynamically resizable contiguous chunk of shared memory
+
+use std::num::NonZeroUsize;
+use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use nix::errno::Errno;
+use nix::sys::mman::MapFlags;
+use nix::sys::mman::ProtFlags;
+use nix::sys::mman::mmap as nix_mmap;
+use nix::sys::mman::munmap as nix_munmap;
+use nix::unistd::ftruncate as nix_ftruncate;
+
+/// `ShmemHandle` represents a shared memory area that can be shared by processes over `fork()`.
+/// Unlike shared memory allocated by Postgres, this area is resizable, up to `max_size` that's
+/// specified at creation.
+///
+/// The area is backed by an anonymous file created with `memfd_create()`. The full address space for
+/// `max_size` is reserved up-front with `mmap()`, but whenever you call [`ShmemHandle::set_size`],
+/// the underlying file is resized. Do not access the area beyond the current size. That is not
+/// enforced today, though we might use `mprotect()` etc. for that in the future. NOTE(review):
+/// mmap(2) documents SIGBUS for pages past the end of the file, not file growth - confirm.
+pub struct ShmemHandle {
+    /// Backing file descriptor: a memfd, or a temporary file on macos
+    fd: OwnedFd,
+    /// Size of the whole mapping in bytes: the caller's `max_size` plus `HEADER_SIZE`
+    max_size: usize,
+
+    // Pointer to the beginning of the shared memory area. The header is stored there.
+    shared_ptr: NonNull<SharedStruct>,
+
+    // Pointer to the beginning of the user data
+    pub data_ptr: NonNull<u8>,
+}
+
+/// This is stored at the beginning in the shared memory area.
+struct SharedStruct {
+    max_size: usize,
+
+    /// Current size of the backing file. The high-order bit is used for the [`RESIZE_IN_PROGRESS`] flag.
+    current_size: AtomicUsize,
+}
+
+const RESIZE_IN_PROGRESS: usize = 1 << 63;
+
+const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
+
+/// Error type returned by the [`ShmemHandle`] functions.
+#[derive(thiserror::Error, Debug)]
+#[error("{msg}: {errno}")]
+pub struct Error {
+    pub msg: String,
+    pub errno: Errno,
+}
+
+impl Error {
+    fn new(msg: &str, errno: Errno) -> Self {
+        Self {
+            msg: msg.to_string(),
+            errno,
+        }
+    }
+}
+
+impl ShmemHandle {
+    /// Create a new shared memory area. To communicate between processes, the processes need to be
+    /// `fork()`'d after calling this, so that the `ShmemHandle` is inherited by all processes.
+    ///
+    /// If the `ShmemHandle` is dropped, the memory is unmapped from the current process. Other
+    /// processes can continue using it, however.
+    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<Self, Error> {
+        // create the backing anonymous file.
+        let fd = create_backing_file(name)?;
+
+        Self::new_with_fd(fd, initial_size, max_size)
+    }
+
+    fn new_with_fd(fd: OwnedFd, initial_size: usize, max_size: usize) -> Result<Self, Error> {
+        // Sizes the backing file `fd`, maps it, and writes the `SharedStruct` header at the start.
+        //
+        // We reserve the high-order bit for the `RESIZE_IN_PROGRESS` flag, and the actual size
+        // is a little larger because of the header. Keep the upper limit well below that; with
+        // anything close to it, you'll run out of memory anyway.
+        assert!(max_size < 1 << 48, "max size {max_size} too large");
+        assert!(
+            initial_size <= max_size,
+            "initial size {initial_size} larger than max size {max_size}"
+        );
+
+        // The actual initial / max size is the one given by the caller, plus the size of
+        // 'SharedStruct'.
+        let initial_size = HEADER_SIZE + initial_size;
+        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
+
+        // Reserve space for the initial size. This must happen before mmap(): if it fails we
+        // return early, and a mapping made earlier would never be unmapped because `Drop` only
+        // runs for a fully constructed `ShmemHandle`.
+        enlarge_file(fd.as_fd(), initial_size as u64)?;
+
+        // Reserve address space for the full `max_size`. TODO: Use MAP_HUGETLB if possible
+        let start_ptr = unsafe {
+            nix_mmap(
+                None,
+                max_size,
+                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
+                MapFlags::MAP_SHARED,
+                &fd,
+                0,
+            )
+        }
+        .map_err(|e| Error::new("mmap failed", e))?;
+
+        // Initialize the header
+        let shared: NonNull<SharedStruct> = start_ptr.cast();
+        unsafe {
+            shared.write(SharedStruct {
+                max_size: max_size.into(),
+                current_size: AtomicUsize::new(initial_size),
+            });
+        }
+
+        // The user data begins after the header
+        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
+
+        Ok(Self {
+            fd,
+            max_size: max_size.into(),
+            shared_ptr: shared,
+            data_ptr,
+        })
+    }
+
+    // return reference to the header
+    fn shared(&self) -> &SharedStruct {
+        unsafe { self.shared_ptr.as_ref() }
+    }
+
+    /// Resize the shared memory area. `new_size` must not be larger than the `max_size` specified
+    /// when creating the area; a larger value panics.
+    ///
+    /// Only one process/thread may resize at a time. A call that finds another resize in progress
+    /// returns an [`shmem::Error`](Error) and changes nothing.
+    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
+        let new_size = new_size.checked_add(HEADER_SIZE).expect("new_size overflows usize");
+        let shared = self.shared();
+
+        assert!(
+            new_size <= self.max_size,
+            "new size ({new_size}) is greater than max size ({})",
+            self.max_size
+        );
+
+        assert_eq!(self.max_size, shared.max_size);
+
+        // Lock the area by setting the `RESIZE_IN_PROGRESS` bit; the size bits keep the old value.
+        //
+        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
+        // and the `posix_fallocate`/`ftruncate` call is surely a synchronization point anyway. But
+        // since this is not performance-critical, better safe than sorry.
+        let mut old_size = shared.current_size.load(Ordering::Acquire);
+        loop {
+            if (old_size & RESIZE_IN_PROGRESS) != 0 {
+                return Err(Error::new(
+                    "concurrent resize detected",
+                    Errno::UnknownErrno,
+                ));
+            }
+            match shared.current_size.compare_exchange(
+                old_size,
+                old_size | RESIZE_IN_PROGRESS,
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => break,
+                Err(x) => old_size = x,
+            }
+        }
+
+        // Ok, we got the lock.
+        //
+        // NB: If anything goes wrong, we *must* clear the bit!
+        let result = {
+            use std::cmp::Ordering::{Equal, Greater, Less};
+            match new_size.cmp(&old_size) {
+                Less => nix_ftruncate(&self.fd, new_size as i64)
+                    .map_err(|e| Error::new("could not shrink shmem segment, ftruncate failed", e)),
+                Equal => Ok(()),
+                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
+            }
+        };
+
+        // Unlock: store the new size on success or the old size on failure; both clear the bit.
+        shared.current_size.store(
+            if result.is_ok() { new_size } else { old_size },
+            Ordering::Release,
+        );
+
+        result
+    }
+
+    /// Returns the current user-visible size of the shared memory segment.
+    ///
+    /// NOTE: a concurrent [`ShmemHandle::set_size()`] call can change the size at any time.
+    /// It is the caller's responsibility not to access the area beyond the current size.
+    pub fn current_size(&self) -> usize {
+        let total_current_size =
+            self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
+        total_current_size - HEADER_SIZE
+    }
+}
+
+impl Drop for ShmemHandle {
+    fn drop(&mut self) {
+        // SAFETY: The pointer was obtained from mmap() with the given size.
+        // We unmap the entire region.
+        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
+        // The fd is dropped automatically by OwnedFd.
+    }
+}
+
+/// Create a "backing file" for the shared memory area. On Linux, use `memfd_create()`, to create an
+/// anonymous in-memory file. On macos, fall back to a regular file. That's good enough for
+/// development and testing, but in production we want the file to stay in memory.
+///
+/// Disable unused variables warnings because `name` is unused in the macos path.
+#[allow(unused_variables)]
+fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
+            .map_err(|e| Error::new("memfd_create failed", e))
+    }
+    #[cfg(target_os = "macos")]
+    {
+        let file = tempfile::tempfile().map_err(|e| {
+            Error::new(
+                "could not create temporary file to back shmem area",
+                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
+            )
+        })?;
+        Ok(OwnedFd::from(file))
+    }
+}
+
+fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
+    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
+    // we don't get a segfault later when trying to actually use it.
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::fcntl::posix_fallocate(fd, 0, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, posix_fallocate failed", e))
+    }
+    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'ftruncate'
+    #[cfg(target_os = "macos")]
+    {
+        nix::unistd::ftruncate(fd, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed", e))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use nix::unistd::ForkResult;
+    use std::ops::Range;
+
+    /// check that all bytes in given range have the expected value.
+    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
+        for i in range {
+            let b = unsafe { *(ptr.add(i)) };
+            assert_eq!(expected, b, "unexpected byte at offset {i}");
+        }
+    }
+
+    /// Write 'b' to all bytes in the given range
+    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
+        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
+    }
+
+    // simple single-process test of growing and shrinking
+    #[test]
+    fn test_shmem_resize() -> Result<(), Error> {
+        let max_size = 1024 * 1024;
+        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
+
+        assert_eq!(init_struct.current_size(), 0);
+
+        // Initial grow
+        let size1 = 10000;
+        init_struct.set_size(size1).unwrap();
+        assert_eq!(init_struct.current_size(), size1);
+
+        // Write some data
+        let data_ptr = init_struct.data_ptr.as_ptr();
+        write_range(data_ptr, 0xAA, 0..size1);
+        assert_range(data_ptr, 0xAA, 0..size1);
+
+        // Shrink
+        let size2 = 5000;
+        init_struct.set_size(size2).unwrap();
+        assert_eq!(init_struct.current_size(), size2);
+
+        // Grow again
+        let size3 = 20000;
+        init_struct.set_size(size3).unwrap();
+        assert_eq!(init_struct.current_size(), size3);
+
+        // Try to read it. The area that was shrunk and grown again should read as all zeros now
+        assert_range(data_ptr, 0xAA, 0..5000);
+        assert_range(data_ptr, 0, 5000..size1);
+
+        // Try to grow beyond max_size
+        //let size4 = max_size + 1;
+        //assert!(init_struct.set_size(size4).is_err());
+
+        // Dropping init_struct should unmap the memory
+        drop(init_struct);
+
+        Ok(())
+    }
+
+    /// This is used in tests to coordinate between test processes. It's like `std::sync::Barrier`,
+    /// but is stored in the shared memory area and works across processes. It's implemented by
+    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
+    struct SimpleBarrier {
+        num_procs: usize,
+        count: AtomicUsize,
+    }
+
+    impl SimpleBarrier {
+        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
+            unsafe {
+                *ptr = SimpleBarrier {
+                    num_procs,
+                    count: AtomicUsize::new(0),
+                }
+            }
+        }
+
+        pub fn wait(&self) {
+            // AcqRel/Acquire: writes made before the barrier must be visible to everyone after it.
+            let old = self.count.fetch_add(1, Ordering::AcqRel);
+            let generation = old / self.num_procs;
+            // Poll until all `num_procs` participants of this generation have arrived.
+            let mut current = old + 1;
+            while current < (generation + 1) * self.num_procs {
+                std::thread::sleep(std::time::Duration::from_millis(10));
+                current = self.count.load(Ordering::Acquire);
+            }
+        }
+    }
+
+    #[test]
+    fn test_multi_process() {
+        // Initialize
+        let max_size = 1_000_000_000_000;
+        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
+        let ptr = init_struct.data_ptr.as_ptr();
+
+        // Store the SimpleBarrier in the first 1k of the area.
+        init_struct.set_size(10000).unwrap();
+        let barrier_ptr: *mut SimpleBarrier = unsafe {
+            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
+                .cast()
+        };
+        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
+        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
+
+        // Fork another test process. The code after this runs in both processes concurrently.
+        let fork_result = unsafe { nix::unistd::fork().unwrap() };
+
+        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, 1000..2000);
+        } else {
+            write_range(ptr, 0xBB, 2000..3000);
+        }
+        barrier.wait();
+        // Verify the contents. (in both processes)
+        assert_range(ptr, 0xAA, 1000..2000);
+        assert_range(ptr, 0xBB, 2000..3000);
+
+        // Grow, from the child this time
+        let size = 10_000_000;
+        if !fork_result.is_parent() {
+            init_struct.set_size(size).unwrap();
+        }
+        barrier.wait();
+
+        // make some writes at the end
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, (size - 10)..size);
+        } else {
+            write_range(ptr, 0xBB, (size - 20)..(size - 10));
+        }
+        barrier.wait();
+
+        // Verify the contents. (This runs in both processes)
+        assert_range(ptr, 0, (size - 1000)..(size - 20));
+        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
+        assert_range(ptr, 0xAA, (size - 10)..size);
+
+        if let ForkResult::Parent { child } = fork_result {
+            nix::sys::wait::waitpid(child, None).unwrap();
+        }
+    }
+}
--- a/libs/neon-shmem/src/sync.rs
+++ b/libs/neon-shmem/src/sync.rs
@@ -0,0 +1,169 @@
+//! Simple utilities akin to what's in [`std::sync`] but designed to work with shared memory.
+
+use std::mem::MaybeUninit;
+use std::ptr::NonNull;
+
+use nix::errno::Errno;
+
+pub type RwLock<T> = lock_api::RwLock<PthreadRwLock, T>;
+pub type Mutex<T> = lock_api::Mutex<PthreadMutex, T>;
+pub(crate) type RwLockReadGuard<'a, T> = lock_api::RwLockReadGuard<'a, PthreadRwLock, T>;
+pub type RwLockWriteGuard<'a, T> = lock_api::RwLockWriteGuard<'a, PthreadRwLock, T>;
+pub type ValueReadGuard<'a, T> = lock_api::MappedRwLockReadGuard<'a, PthreadRwLock, T>;
+pub type ValueWriteGuard<'a, T> = lock_api::MappedRwLockWriteGuard<'a, PthreadRwLock, T>;
+
+/// Wrapper around a pointer to a [`libc::pthread_rwlock_t`].
+///
+/// `PthreadRwLock(None)` is an invalid state for this type. It only exists because the
+/// [`lock_api::RawRwLock`] trait has a mandatory `INIT` const member to allow for static
+/// initialization of the lock. Unfortunately, pthread seemingly does not support any way
+/// to statically initialize a `pthread_rwlock_t` with `PTHREAD_PROCESS_SHARED` set. However,
+/// `lock_api` allows manual construction and seemingly doesn't use `INIT` itself so for
+/// now it's set to this invalid value to satisfy the trait constraints.
+pub struct PthreadRwLock(Option<NonNull<libc::pthread_rwlock_t>>);
+
+impl PthreadRwLock {
+	pub fn new(lock: NonNull<libc::pthread_rwlock_t>) -> Self {
+		unsafe {
+			let mut attrs = MaybeUninit::uninit();
+			// Ignoring return value here - only possible error is OOM.
+			libc::pthread_rwlockattr_init(attrs.as_mut_ptr());
+			libc::pthread_rwlockattr_setpshared(
+				attrs.as_mut_ptr(),
+				libc::PTHREAD_PROCESS_SHARED
+			);
+			// TODO(quantumish): worth making this function fallible?
+			libc::pthread_rwlock_init(lock.as_ptr(), attrs.as_mut_ptr());
+			// Safety: POSIX specifies that "any function affecting the attributes
+			// object (including destruction) shall not affect any previously
+			// initialized read-write locks". 
+			libc::pthread_rwlockattr_destroy(attrs.as_mut_ptr());
+			Self(Some(lock))
+		}
+	}
+	
+	fn inner(&self) -> NonNull<libc::pthread_rwlock_t> {
+		self.0.unwrap_or_else(
+			|| panic!("PthreadRwLock constructed badly - something likely used RawRwLock::INIT")
+		)
+	}
+
+	fn unlock(&self) {
+		unsafe {
+			let res = libc::pthread_rwlock_unlock(self.inner().as_ptr());
+			assert!(res == 0, "unlock failed with {}", Errno::from_raw(res));
+		}
+	}
+}
+
+unsafe impl lock_api::RawRwLock for PthreadRwLock {
+	type GuardMarker = lock_api::GuardSend;
+
+	/// *DO NOT USE THIS.* See [`PthreadRwLock`] for the full explanation.
+	const INIT: Self = Self(None);	
+	
+	fn lock_shared(&self) {
+		unsafe {
+			let res = libc::pthread_rwlock_rdlock(self.inner().as_ptr());
+			assert!(res == 0, "rdlock failed with {}", Errno::from_raw(res));
+		}
+	}
+
+	fn try_lock_shared(&self) -> bool {
+		unsafe {
+			// EBUSY: held or awaited by a writer. EAGAIN: maximum number of read locks reached.
+			match libc::pthread_rwlock_tryrdlock(self.inner().as_ptr()) {
+				0 => true,
+				libc::EBUSY | libc::EAGAIN => false,
+				o => panic!("try_rdlock failed with {}", Errno::from_raw(o)),
+			}
+		}
+	}
+
+	fn lock_exclusive(&self) {
+		unsafe {
+			let res = libc::pthread_rwlock_wrlock(self.inner().as_ptr());
+			assert!(res == 0, "wrlock failed with {}", Errno::from_raw(res));
+		}
+	}
+
+	fn try_lock_exclusive(&self) -> bool {
+		unsafe {
+			// EBUSY: the lock is held for reading or writing. That is contention, not an error.
+			match libc::pthread_rwlock_trywrlock(self.inner().as_ptr()) {
+				0 => true,
+				libc::EBUSY | libc::EAGAIN => false,
+				o => panic!("try_wrlock failed with {}", Errno::from_raw(o)),
+			}
+		}
+	}
+
+	unsafe fn unlock_exclusive(&self) {
+		self.unlock();
+	}
+
+	unsafe fn unlock_shared(&self) {
+		self.unlock();
+	}
+}
+
+pub struct PthreadMutex(Option<NonNull<libc::pthread_mutex_t>>);
+
+impl PthreadMutex {
+	pub fn new(lock: NonNull<libc::pthread_mutex_t>) -> Self {
+		unsafe {
+			let mut attrs = MaybeUninit::uninit();
+			// Ignoring return value here - only possible error is OOM.
+			libc::pthread_mutexattr_init(attrs.as_mut_ptr());
+			libc::pthread_mutexattr_setpshared(
+				attrs.as_mut_ptr(),
+				libc::PTHREAD_PROCESS_SHARED
+			);
+			libc::pthread_mutex_init(lock.as_ptr(), attrs.as_mut_ptr());
+			// Safety: POSIX specifies that "any function affecting the attributes
+			// object (including destruction) shall not affect any previously
+			// initialized mutexes".
+			libc::pthread_mutexattr_destroy(attrs.as_mut_ptr());
+			Self(Some(lock))
+		}
+	}
+
+	fn inner(&self) -> NonNull<libc::pthread_mutex_t> {
+		self.0.unwrap_or_else(
+			|| panic!("PthreadMutex constructed badly - something likely used RawMutex::INIT")
+		)
+	}
+
+}
+
+unsafe impl lock_api::RawMutex for PthreadMutex {
+	type GuardMarker = lock_api::GuardSend;
+
+	/// *DO NOT USE THIS.* See [`PthreadRwLock`] for the full explanation.
+	const INIT: Self = Self(None);	
+
+	fn lock(&self) {
+		unsafe {
+			let res = libc::pthread_mutex_lock(self.inner().as_ptr());
+			assert!(res == 0, "lock failed with {}", Errno::from_raw(res));
+		}
+	}
+
+	fn try_lock(&self) -> bool {
+		unsafe {
+			// EBUSY: the mutex is already locked. That is contention, not an error.
+			match libc::pthread_mutex_trylock(self.inner().as_ptr()) {
+				0 => true,
+				libc::EBUSY | libc::EAGAIN => false,
+				o => panic!("try_lock failed with {}", Errno::from_raw(o)),
+			}
+		}
+	}
+
+	unsafe fn unlock(&self) {
+		unsafe {
+			let res = libc::pthread_mutex_unlock(self.inner().as_ptr());
+			assert!(res == 0, "unlock failed with {}", Errno::from_raw(res));
+		}
+	}
+}
--- a/libs/neonart/Cargo.toml
+++ b/libs/neonart/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "neonart"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+crossbeam-utils.workspace = true
+spin.workspace = true
+tracing.workspace = true
+
+[dev-dependencies]
+rand = "0.9.1"
+rand_distr = "0.5.1"
--- a/libs/neonart/src/algorithm.rs
+++ b/libs/neonart/src/algorithm.rs
@@ -0,0 +1,599 @@
+mod lock_and_version;
+pub(crate) mod node_ptr;
+mod node_ref;
+
+use std::vec::Vec;
+
+use crate::algorithm::lock_and_version::ConcurrentUpdateError;
+use crate::algorithm::node_ptr::MAX_PREFIX_LEN;
+use crate::algorithm::node_ref::{NewNodeRef, NodeRef, ReadLockedNodeRef, WriteLockedNodeRef};
+use crate::allocator::OutOfMemoryError;
+
+use crate::TreeWriteGuard;
+use crate::UpdateAction;
+use crate::allocator::ArtAllocator;
+use crate::epoch::EpochPin;
+use crate::{Key, Value};
+
+pub(crate) type RootPtr<V> = node_ptr::NodePtr<V>;
+
+#[derive(Debug)]
+pub enum ArtError {
+    ConcurrentUpdate, // need to retry
+    OutOfMemory,
+}
+
+impl From<ConcurrentUpdateError> for ArtError {
+    fn from(_: ConcurrentUpdateError) -> ArtError {
+        ArtError::ConcurrentUpdate
+    }
+}
+
+impl From<OutOfMemoryError> for ArtError {
+    fn from(_: OutOfMemoryError) -> ArtError {
+        ArtError::OutOfMemory
+    }
+}
+
+pub fn new_root<V: Value>(
+    allocator: &impl ArtAllocator<V>,
+) -> Result<RootPtr<V>, OutOfMemoryError> {
+    node_ptr::new_root(allocator)
+}
+
+pub(crate) fn search<'e, K: Key, V: Value>(
+    key: &K,
+    root: RootPtr<V>,
+    epoch_pin: &'e EpochPin,
+) -> Option<&'e V> {
+    loop {
+        let root_ref = NodeRef::from_root_ptr(root);
+        if let Ok(result) = lookup_recurse(key.as_bytes(), root_ref, None, epoch_pin) {
+            break result;
+        }
+        // retry
+    }
+}
+
+pub(crate) fn iter_next<'e, V: Value>(
+    key: &[u8],
+    root: RootPtr<V>,
+    epoch_pin: &'e EpochPin,
+) -> Option<(Vec<u8>, &'e V)> {
+    loop {
+        let mut path = Vec::new();
+        let root_ref = NodeRef::from_root_ptr(root);
+
+        match next_recurse(key, &mut path, root_ref, epoch_pin) {
+            Ok(Some(v)) => {
+                assert_eq!(path.len(), key.len());
+                break Some((path, v));
+            }
+            Ok(None) => break None,
+            Err(ConcurrentUpdateError()) => {
+                // retry
+                continue;
+            }
+        }
+    }
+}
+
+pub(crate) fn update_fn<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>, F>(
+    key: &K,
+    value_fn: F,
+    root: RootPtr<V>,
+    guard: &'g mut TreeWriteGuard<'e, K, V, A>,
+) -> Result<(), OutOfMemoryError>
+where
+    F: FnOnce(Option<&V>) -> UpdateAction<V>,
+{
+    let value_fn_cell = std::cell::Cell::new(Some(value_fn));
+    loop {
+        let root_ref = NodeRef::from_root_ptr(root);
+        let this_value_fn = |arg: Option<&V>| value_fn_cell.take().unwrap()(arg);
+        let key_bytes = key.as_bytes();
+
+        match update_recurse(
+            key_bytes,
+            this_value_fn,
+            root_ref,
+            None,
+            None,
+            guard,
+            0,
+            key_bytes,
+        ) {
+            Ok(()) => break Ok(()),
+            Err(ArtError::ConcurrentUpdate) => {
+                continue; // retry
+            }
+            Err(ArtError::OutOfMemory) => break Err(OutOfMemoryError()),
+        }
+    }
+}
+
+// Error means you must retry.
+//
+// This corresponds to the 'lookupOpt' function in the paper
+#[allow(clippy::only_used_in_recursion)]
+fn lookup_recurse<'e, V: Value>(
+    key: &[u8],
+    node: NodeRef<'e, V>,
+    parent: Option<ReadLockedNodeRef<V>>,
+    epoch_pin: &'e EpochPin,
+) -> Result<Option<&'e V>, ConcurrentUpdateError> {
+    let rnode = node.read_lock_or_restart()?;
+    if let Some(parent) = parent {
+        parent.read_unlock_or_restart()?;
+    }
+
+    // check if the prefix matches, may increment level
+    let prefix_len = if let Some(prefix_len) = rnode.prefix_matches(key) {
+        prefix_len
+    } else {
+        rnode.read_unlock_or_restart()?;
+        return Ok(None);
+    };
+
+    if rnode.is_leaf() {
+        assert_eq!(key.len(), prefix_len);
+        let vptr = rnode.get_leaf_value_ptr()?;
+        // safety: It's OK to return a ref of the pointer because we checked the version
+        // and the lifetime of 'epoch_pin' enforces that the reference is only accessible
+        // as long as the epoch is pinned.
+        let v = unsafe { vptr.as_ref().unwrap() };
+        return Ok(Some(v));
+    }
+
+    let key = &key[prefix_len..];
+
+    // find child (or leaf value)
+    let next_node = rnode.find_child_or_restart(key[0])?;
+
+    match next_node {
+        None => Ok(None), // key not found
+        Some(child) => lookup_recurse(&key[1..], child, Some(rnode), epoch_pin),
+    }
+}
+
+#[allow(clippy::only_used_in_recursion)]
+fn next_recurse<'e, V: Value>(
+    min_key: &[u8],
+    path: &mut Vec<u8>,
+    node: NodeRef<'e, V>,
+    epoch_pin: &'e EpochPin,
+) -> Result<Option<&'e V>, ConcurrentUpdateError> {
+    let rnode = node.read_lock_or_restart()?;
+    let prefix = rnode.get_prefix();
+    if !prefix.is_empty() {
+        path.extend_from_slice(prefix);
+    }
+
+    use std::cmp::Ordering;
+    let comparison = path.as_slice().cmp(&min_key[0..path.len()]);
+    if comparison == Ordering::Less {
+        rnode.read_unlock_or_restart()?;
+        return Ok(None);
+    }
+
+    if rnode.is_leaf() {
+        assert_eq!(path.len(), min_key.len());
+        let vptr = rnode.get_leaf_value_ptr()?;
+        // safety: It's OK to return a ref of the pointer because we checked the version
+        // and the lifetime of 'epoch_pin' enforces that the reference is only accessible
+        // as long as the epoch is pinned.
+        let v = unsafe { vptr.as_ref().unwrap() };
+        return Ok(Some(v));
+    }
+
+    let mut min_key_byte = match comparison {
+        Ordering::Less => unreachable!(), // checked this above already
+        Ordering::Equal => min_key[path.len()],
+        Ordering::Greater => 0,
+    };
+
+    loop {
+        match rnode.find_next_child_or_restart(min_key_byte)? {
+            None => {
+                return Ok(None);
+            }
+            Some((key_byte, child_ref)) => {
+                let path_len = path.len();
+                path.push(key_byte);
+                let result = next_recurse(min_key, path, child_ref, epoch_pin)?;
+                if result.is_some() {
+                    return Ok(result);
+                }
+                if key_byte == u8::MAX {
+                    return Ok(None);
+                }
+                path.truncate(path_len);
+                min_key_byte = key_byte + 1;
+            }
+        }
+    }
+}
+
+// This corresponds to the 'insertOpt' function in the paper
+#[allow(clippy::only_used_in_recursion)]
+#[allow(clippy::too_many_arguments)]
+pub(crate) fn update_recurse<'e, K: Key, V: Value, A: ArtAllocator<V>, F>(
+    key: &[u8],
+    value_fn: F,
+    node: NodeRef<'e, V>,
+    rparent: Option<(ReadLockedNodeRef<V>, u8)>,
+    rgrandparent: Option<(ReadLockedNodeRef<V>, u8)>,
+    guard: &'_ mut TreeWriteGuard<'e, K, V, A>,
+    level: usize,
+    orig_key: &[u8],
+) -> Result<(), ArtError>
+where
+    F: FnOnce(Option<&V>) -> UpdateAction<V>,
+{
+    let rnode = node.read_lock_or_restart()?;
+
+    let prefix_match_len = rnode.prefix_matches(key);
+    if prefix_match_len.is_none() {
+        let (rparent, parent_key) = rparent.expect("direct children of the root have no prefix");
+        let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
+        let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
+
+        match value_fn(None) {
+            UpdateAction::Nothing => {}
+            UpdateAction::Insert(new_value) => {
+                insert_split_prefix(key, new_value, &mut wnode, &mut wparent, parent_key, guard)?;
+            }
+            UpdateAction::Remove => {
+                panic!("unexpected Remove action on insertion");
+            }
+        }
+        wnode.write_unlock();
+        wparent.write_unlock();
+        return Ok(());
+    }
+    let prefix_match_len = prefix_match_len.unwrap();
+    let key = &key[prefix_match_len..];
+    let level = level + prefix_match_len;
+
+    if rnode.is_leaf() {
+        assert_eq!(key.len(), 0);
+        let (rparent, parent_key) = rparent.expect("root cannot be leaf");
+        let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
+        let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
+
+        // safety: Now that we have acquired the write lock, we have exclusive access to the
+        // value. XXX: There might be concurrent reads though?
+        let value_mut = wnode.get_leaf_value_mut();
+
+        match value_fn(Some(value_mut)) {
+            UpdateAction::Nothing => {
+                wparent.write_unlock();
+                wnode.write_unlock();
+            }
+            UpdateAction::Insert(_) => panic!("cannot insert over existing value"),
+            UpdateAction::Remove => {
+                guard.remember_obsolete_node(wnode.as_ptr());
+                wparent.delete_child(parent_key);
+                wnode.write_unlock_obsolete();
+
+                if let Some(rgrandparent) = rgrandparent {
+                    // FIXME: Ignore concurrency error. It doesn't lead to
+                    // corruption, but it means we might leak something. Until
+                    // another update cleans it up.
+                    let _ = cleanup_parent(wparent, rgrandparent, guard);
+                }
+            }
+        }
+
+        return Ok(());
+    }
+
+    let next_node = rnode.find_child_or_restart(key[0])?;
+
+    if next_node.is_none() {
+        if rnode.is_full() {
+            let (rparent, parent_key) = rparent.expect("root node cannot become full");
+            let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
+            let wnode = rnode.upgrade_to_write_lock_or_restart()?;
+
+            match value_fn(None) {
+                UpdateAction::Nothing => {
+                    wnode.write_unlock();
+                    wparent.write_unlock();
+                }
+                UpdateAction::Insert(new_value) => {
+                    insert_and_grow(key, new_value, wnode, &mut wparent, parent_key, guard)?;
+                    wparent.write_unlock();
+                }
+                UpdateAction::Remove => {
+                    panic!("unexpected Remove action on insertion");
+                }
+            };
+        } else {
+            let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
+            if let Some((rparent, _)) = rparent {
+                rparent.read_unlock_or_restart()?;
+            }
+            match value_fn(None) {
+                UpdateAction::Nothing => {}
+                UpdateAction::Insert(new_value) => {
+                    insert_to_node(&mut wnode, key, new_value, guard)?;
+                }
+                UpdateAction::Remove => {
+                    panic!("unexpected Remove action on insertion");
+                }
+            };
+            wnode.write_unlock();
+        }
+        Ok(())
+    } else {
+        let next_child = next_node.unwrap(); // checked above it's not None
+        if let Some((ref rparent, _)) = rparent {
+            rparent.check_or_restart()?;
+        }
+
+        // recurse to next level
+        update_recurse(
+            &key[1..],
+            value_fn,
+            next_child,
+            Some((rnode, key[0])),
+            rparent,
+            guard,
+            level + 1,
+            orig_key,
+        )
+    }
+}
+
+/// One step on the path from the root to a node, as collected by 'dump_recurse'.
+#[derive(Clone)]
+enum PathElement {
+    Prefix(Vec<u8>), // the prefix bytes stored in a node
+    KeyByte(u8),     // the key byte under which a child was found
+}
+impl std::fmt::Debug for PathElement {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match *self {
+            Self::Prefix(ref prefix) => write!(fmt, "{prefix:?}"),
+            Self::KeyByte(key_byte) => write!(fmt, "{key_byte}"),
+        }
+    }
+}
+
+pub(crate) fn dump_tree<V: Value + std::fmt::Debug>(
+    root: RootPtr<V>,
+    epoch_pin: &'_ EpochPin,
+    dst: &mut dyn std::io::Write,
+) { // Debug aid: writes a textual dump of the tree below 'root' to 'dst'.
+    let root_ref = NodeRef::from_root_ptr(root);
+    // Walk from level 0 with an empty path; an Err (concurrent update) is silently discarded.
+    let _ = dump_recurse(&[], root_ref, epoch_pin, 0, dst);
+}
+
+// Dumps 'node' and its subtree to 'dst'. TODO: return an Err if writeln!() fails, not unwrap
+#[allow(clippy::only_used_in_recursion)]
+fn dump_recurse<'e, V: Value + std::fmt::Debug>(
+    path: &[PathElement],
+    node: NodeRef<'e, V>,
+    epoch_pin: &'e EpochPin,
+    level: usize,
+    dst: &mut dyn std::io::Write,
+) -> Result<(), ConcurrentUpdateError> {
+    let indent = str::repeat(" ", level); // one space per tree level
+    // 'path' collects the prefixes and key bytes on the way from the root down to this node.
+    let rnode = node.read_lock_or_restart()?;
+    let mut path = Vec::from(path);
+    let prefix = rnode.get_prefix();
+    if !prefix.is_empty() {
+        path.push(PathElement::Prefix(Vec::from(prefix)));
+    }
+
+    if rnode.is_leaf() {
+        let vptr = rnode.get_leaf_value_ptr()?;
+        // safety: It's OK to return a ref of the pointer because we checked the version
+        // and the lifetime of 'epoch_pin' enforces that the reference is only accessible
+        // as long as the epoch is pinned.
+        let val = unsafe { vptr.as_ref().unwrap() };
+        writeln!(dst, "{indent} {path:?}: {val:?}").unwrap();
+        return Ok(());
+    }
+    // Internal node: probe every possible key byte, in ascending order.
+    for key_byte in 0..=u8::MAX {
+        match rnode.find_child_or_restart(key_byte)? {
+            None => continue,
+            Some(child_ref) => {
+                let rchild = child_ref.read_lock_or_restart()?; // only used to print the prefix
+                writeln!(
+                    dst,
+                    "{} {:?}, {}: prefix {:?}",
+                    indent,
+                    &path,
+                    key_byte,
+                    rchild.get_prefix()
+                )
+                .unwrap();
+
+                let mut child_path = path.clone();
+                child_path.push(PathElement::KeyByte(key_byte));
+
+                dump_recurse(&child_path, child_ref, epoch_pin, level + 1, dst)?;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+///```text
+///        [fooba]r -> value
+/// becomes, after inserting a key that starts with "fooelse":
+/// [foo]b -> [a]r  -> value
+///      e -> [ls]e -> value
+///```
+fn insert_split_prefix<K: Key, V: Value, A: ArtAllocator<V>>(
+    key: &[u8],
+    value: V,
+    node: &mut WriteLockedNodeRef<V>,
+    parent: &mut WriteLockedNodeRef<V>,
+    parent_key: u8,
+    guard: &'_ TreeWriteGuard<K, V, A>,
+) -> Result<(), OutOfMemoryError> {
+    let old_node = node;
+    let old_prefix = old_node.get_prefix();
+    let common_prefix_len = common_prefix(key, old_prefix);
+    // key[common_prefix_len] and old_prefix[common_prefix_len] are the first differing bytes.
+    // Allocate a node for the new value.
+    let new_value_node = allocate_node_for_value(
+        &key[common_prefix_len + 1..],
+        value,
+        guard.tree_writer.allocator,
+    )?;
+
+    // Allocate a new internal node with the common prefix
+    // FIXME: deallocate 'new_value_node' on OOM (NOTE(review): its Drop impl may already do so)
+    let mut prefix_node =
+        node_ref::new_internal(&key[..common_prefix_len], guard.tree_writer.allocator)?;
+
+    // Add the old node and the new nodes to the new internal node
+    prefix_node.insert_old_child(old_prefix[common_prefix_len], old_node);
+    prefix_node.insert_new_child(key[common_prefix_len], new_value_node);
+
+    // Modify the prefix of the old child in place
+    old_node.truncate_prefix(old_prefix.len() - common_prefix_len - 1);
+
+    // replace the pointer in the parent
+    parent.replace_child(parent_key, prefix_node.into_ptr());
+
+    Ok(())
+}
+
+fn insert_to_node<K: Key, V: Value, A: ArtAllocator<V>>(
+    wnode: &mut WriteLockedNodeRef<V>,
+    key: &[u8],
+    value: V,
+    guard: &'_ TreeWriteGuard<K, V, A>,
+) -> Result<(), OutOfMemoryError> {
+    // Build the node(s) holding 'value' for the rest of the key, then link them under key[0].
+    let new_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator);
+    new_child.map(|child| wnode.insert_child(key[0], child.into_ptr()))
+}
+
+// On entry: 'parent' and 'node' are locked. 'wnode' is replaced by the result of grow().
+fn insert_and_grow<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
+    key: &[u8],
+    value: V,
+    wnode: WriteLockedNodeRef<V>,
+    parent: &mut WriteLockedNodeRef<V>,
+    parent_key_byte: u8,
+    guard: &'g mut TreeWriteGuard<'e, K, V, A>,
+) -> Result<(), ArtError> {
+    let mut bigger_node = wnode.grow(guard.tree_writer.allocator)?;
+
+    // FIXME: deallocate 'bigger_node' on OOM
+    let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
+    bigger_node.insert_new_child(key[0], value_child);
+
+    // Replace the pointer in the parent
+    parent.replace_child(parent_key_byte, bigger_node.into_ptr());
+    // The old node is no longer linked: hand it to the guard, then unlock it as obsolete.
+    guard.remember_obsolete_node(wnode.as_ptr());
+    wnode.write_unlock_obsolete();
+
+    Ok(())
+}
+
+fn cleanup_parent<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
+    wparent: WriteLockedNodeRef<V>,
+    rgrandparent: (ReadLockedNodeRef<V>, u8),
+    guard: &'g mut TreeWriteGuard<'e, K, V, A>,
+) -> Result<(), ArtError> {
+    let (rgrandparent, grandparent_key_byte) = rgrandparent;
+    // 'wparent' arrives write-locked; the grandparent is only read-locked until upgraded below.
+    // If the parent becomes completely empty after the deletion, remove the parent from the
+    // grandparent. (This case is possible because we reserve only 8 bytes for the prefix.)
+    // TODO: not implemented.
+
+    // If the parent has only one child, replace the parent with the remaining child. (This is not
+    // possible if the child's prefix field cannot absorb the parent's)
+    if wparent.num_children() == 1 {
+        // Try to lock the remaining child. This can fail if the child is updated
+        // concurrently.
+        let (key_byte, remaining_child) = wparent.find_remaining_child();
+
+        let mut wremaining_child = remaining_child.write_lock_or_restart()?; // unlocked on drop
+
+        if 1 + wremaining_child.get_prefix().len() + wparent.get_prefix().len() <= MAX_PREFIX_LEN {
+            let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
+
+            // Ok, we have locked the leaf, the parent, the grandparent, and the parent's only
+            // remaining leaf. Proceed with the updates.
+
+            // Update the prefix on the remaining leaf
+            wremaining_child.prepend_prefix(wparent.get_prefix(), key_byte);
+
+            // Replace the pointer in the grandparent to point directly to the remaining leaf
+            wgrandparent.replace_child(grandparent_key_byte, wremaining_child.as_ptr());
+
+            // Mark the parent as deleted.
+            guard.remember_obsolete_node(wparent.as_ptr());
+            wparent.write_unlock_obsolete();
+            return Ok(());
+        }
+    }
+
+    // If the parent's children would fit on a smaller node type after the deletion, replace it with
+    // a smaller node.
+    if wparent.can_shrink() {
+        let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
+        let smaller_node = wparent.shrink(guard.tree_writer.allocator)?;
+
+        // Replace the pointer in the grandparent
+        wgrandparent.replace_child(grandparent_key_byte, smaller_node.into_ptr());
+
+        guard.remember_obsolete_node(wparent.as_ptr());
+        wparent.write_unlock_obsolete();
+        return Ok(());
+    }
+
+    // nothing to do
+    wparent.write_unlock();
+    Ok(())
+}
+
+// Allocate a new leaf node to hold 'value'. If the key is long, we
+// may need to allocate new internal nodes to hold it too
+fn allocate_node_for_value<'a, V: Value, A: ArtAllocator<V>>(
+    key: &[u8],
+    value: V,
+    allocator: &'a A,
+) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError> {
+    let mut prefix_off = key.len().saturating_sub(MAX_PREFIX_LEN);
+    // The leaf's prefix is the tail of the key, at most MAX_PREFIX_LEN bytes long.
+    let leaf_node = node_ref::new_leaf(&key[prefix_off..key.len()], value, allocator)?;
+    // Wrap it in internal nodes, innermost first, until the front of the key is used up.
+    let mut node = leaf_node;
+    while prefix_off > 0 {
+        // Need another internal node
+        let remain_prefix = &key[0..prefix_off];
+
+        prefix_off = remain_prefix.len().saturating_sub(MAX_PREFIX_LEN + 1); // +1: child key byte
+        let mut internal_node = node_ref::new_internal(
+            &remain_prefix[prefix_off..remain_prefix.len() - 1],
+            allocator,
+        )?;
+        internal_node.insert_new_child(*remain_prefix.last().unwrap(), node);
+        node = internal_node;
+    }
+
+    Ok(node)
+}
+
+// Returns the index of the first byte where 'a' and 'b' differ. Only their overlap (capped at
+// MAX_PREFIX_LEN) is compared, so a short slice cannot cause an out-of-bounds index; panics
+// with "prefixes are equal" if no difference is found.
+fn common_prefix(a: &[u8], b: &[u8]) -> usize {
+    let limit = a.len().min(b.len()).min(MAX_PREFIX_LEN);
+    let first_diff = (0..limit).find(|&i| a[i] != b[i]);
+    first_diff.unwrap_or_else(|| panic!("prefixes are equal"))
+}
--- a/libs/neonart/src/algorithm/lock_and_version.rs
+++ b/libs/neonart/src/algorithm/lock_and_version.rs
@@ -0,0 +1,117 @@
+//! Each node in the tree contains one atomic word that stores three things:
+//!
+//! Bit 0: set if the node is "obsolete". An obsolete node has been removed from the tree,
+//!        but might still be accessed by concurrent readers until the epoch expires.
+//! Bit 1: set if the node is currently write-locked. Used as a spinlock.
+//! Bits 2-63: Version number, incremented every time the node is modified.
+//!
+//! AtomicLockAndVersion represents that.
+
+use std::sync::atomic::{AtomicU64, Ordering};
+
+pub(crate) struct ConcurrentUpdateError(); // returned by the *_or_restart functions on conflict
+// The lock word of a node: obsolete bit, locked bit and version packed into one AtomicU64.
+pub(crate) struct AtomicLockAndVersion {
+    inner: AtomicU64,
+}
+// A new lock word is zero: not obsolete, not locked, version 0.
+impl AtomicLockAndVersion {
+    pub(crate) fn new() -> AtomicLockAndVersion {
+        AtomicLockAndVersion {
+            inner: AtomicU64::new(0),
+        }
+    }
+}
+
+impl AtomicLockAndVersion {
+    /// Waits until the node is not write-locked and returns the lock word that was observed.
+    /// Fails if the node is obsolete. The returned value is what check_or_restart() and
+    /// read_unlock_or_restart() later compare against.
+    pub(crate) fn read_lock_or_restart(&self) -> Result<u64, ConcurrentUpdateError> {
+        let version = self.await_node_unlocked();
+        if is_obsolete(version) {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(version)
+    }
+
+    /// Same validation as read_unlock_or_restart().
+    pub(crate) fn check_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
+        self.read_unlock_or_restart(version)
+    }
+
+    /// Succeeds only if the lock word still equals 'version', i.e. the node has not been
+    /// locked, modified or marked obsolete since 'version' was read.
+    pub(crate) fn read_unlock_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
+        // An acquire load only orders the operations that come after it, but the reads being
+        // validated come *before* this load. The acquire fence keeps them from being
+        // reordered past the version check (the usual seqlock validation pattern).
+        std::sync::atomic::fence(Ordering::Acquire);
+        if self.inner.load(Ordering::Acquire) != version {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(())
+    }
+
+    /// Takes the write lock with a single compare-exchange, which succeeds only if the lock
+    /// word still equals 'version'. Never waits.
+    pub(crate) fn upgrade_to_write_lock_or_restart(
+        &self,
+        version: u64,
+    ) -> Result<(), ConcurrentUpdateError> {
+        let locked = set_locked_bit(version);
+        self.inner
+            .compare_exchange(version, locked, Ordering::Acquire, Ordering::Relaxed)
+            .map(|_| ())
+            .map_err(|_| ConcurrentUpdateError())
+    }
+
+    /// Tries once to take the write lock. Fails, without waiting, if the node is obsolete or
+    /// already locked, or if the lock word changes before the compare-exchange.
+    pub(crate) fn write_lock_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
+        let old = self.inner.load(Ordering::Relaxed);
+        if is_obsolete(old) || is_locked(old) {
+            return Err(ConcurrentUpdateError());
+        }
+        // Same compare-exchange as an upgrade from the value just read.
+        self.upgrade_to_write_lock_or_restart(old)
+    }
+
+    /// Releases the write lock and increments the version. Must only be called by the holder
+    /// of the write lock: the addition relies on the locked bit being set.
+    pub(crate) fn write_unlock(&self) {
+        // reset locked bit and overflow into version
+        self.inner.fetch_add(2, Ordering::Release);
+    }
+
+    /// Like write_unlock(), but also marks the node obsolete.
+    pub(crate) fn write_unlock_obsolete(&self) {
+        // set obsolete, reset locked, overflow into version
+        self.inner.fetch_add(3, Ordering::Release);
+    }
+
+    // Helper functions
+
+    /// Spins, yielding the thread, until the locked bit is clear. Returns the lock word seen.
+    fn await_node_unlocked(&self) -> u64 {
+        let mut version = self.inner.load(Ordering::Acquire);
+        while is_locked(version) {
+            // spinlock
+            std::thread::yield_now();
+            version = self.inner.load(Ordering::Acquire)
+        }
+        version
+    }
+}
+
+fn set_locked_bit(version: u64) -> u64 {
+    version + 0b10 // bit 1 is the locked bit; callers pass a word in which it is clear
+}
+
+fn is_obsolete(version: u64) -> bool {
+    (version & 0b01) != 0 // bit 0 is the obsolete bit
+}
+
+fn is_locked(version: u64) -> bool {
+    (version & 0b10) != 0 // bit 1 is the locked bit
+}
--- a/libs/neonart/src/algorithm/node_ptr.rs
+++ b/libs/neonart/src/algorithm/node_ptr.rs
--- a/libs/neonart/src/algorithm/node_ref.rs
+++ b/libs/neonart/src/algorithm/node_ref.rs
@@ -0,0 +1,349 @@
+use std::fmt::Debug;
+use std::marker::PhantomData;
+
+use super::node_ptr;
+use super::node_ptr::NodePtr;
+use crate::EpochPin;
+use crate::Value;
+use crate::algorithm::lock_and_version::AtomicLockAndVersion;
+use crate::algorithm::lock_and_version::ConcurrentUpdateError;
+use crate::allocator::ArtAllocator;
+use crate::allocator::OutOfMemoryError;
+
+pub struct NodeRef<'e, V> {
+    ptr: NodePtr<V>,
+    // Zero-sized marker tying this reference to the lifetime 'e of an EpochPin.
+    phantom: PhantomData<&'e EpochPin<'e>>,
+}
+// Debug output is the Debug output of the wrapped NodePtr.
+impl<'e, V> Debug for NodeRef<'e, V> {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        write!(fmt, "{:?}", self.ptr)
+    }
+}
+
+impl<'e, V: Value> NodeRef<'e, V> {
+    pub(crate) fn from_root_ptr(root_ptr: NodePtr<V>) -> NodeRef<'e, V> {
+        NodeRef {
+            ptr: root_ptr,
+            phantom: PhantomData,
+        }
+    }
+    // Optimistic read lock: remembers the lock word so that later reads can be validated.
+    pub(crate) fn read_lock_or_restart(
+        &self,
+    ) -> Result<ReadLockedNodeRef<'e, V>, ConcurrentUpdateError> {
+        let version = self.lockword().read_lock_or_restart()?;
+        Ok(ReadLockedNodeRef {
+            ptr: self.ptr,
+            version,
+            phantom: self.phantom,
+        })
+    }
+    // Takes the write lock directly; the returned reference unlocks the node when dropped.
+    pub(crate) fn write_lock_or_restart(
+        &self,
+    ) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
+        self.lockword().write_lock_or_restart()?;
+        Ok(WriteLockedNodeRef {
+            ptr: self.ptr,
+            phantom: self.phantom,
+        })
+    }
+    // The lock word of the referenced node.
+    fn lockword(&self) -> &AtomicLockAndVersion {
+        self.ptr.lockword()
+    }
+}
+
+/// A reference to a node that has been optimistically read-locked. The functions re-check
+/// the version after each read.
+pub struct ReadLockedNodeRef<'e, V> {
+    ptr: NodePtr<V>,
+    version: u64, // lock word observed when the read lock was taken
+
+    phantom: PhantomData<&'e EpochPin<'e>>,
+}
+
+impl<'e, V: Value> ReadLockedNodeRef<'e, V> {
+    pub(crate) fn is_leaf(&self) -> bool {
+        self.ptr.is_leaf()
+    }
+
+    pub(crate) fn is_full(&self) -> bool {
+        self.ptr.is_full()
+    }
+    // Unvalidated read: no version check is made here, so the caller has to validate.
+    pub(crate) fn get_prefix(&self) -> &[u8] {
+        self.ptr.get_prefix()
+    }
+
+    /// Note: because we're only holding a read lock, the prefix can change concurrently.
+    /// You must be prepared to restart, if read_unlock() returns error later.
+    ///
+    /// Returns the length of the prefix, or None if it's not a match
+    pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
+        self.ptr.prefix_matches(key)
+    }
+    // Looks up the child for 'key_byte' and validates the version after the read.
+    pub(crate) fn find_child_or_restart(
+        &self,
+        key_byte: u8,
+    ) -> Result<Option<NodeRef<'e, V>>, ConcurrentUpdateError> {
+        let child_or_value = self.ptr.find_child(key_byte);
+        self.ptr.lockword().check_or_restart(self.version)?;
+
+        match child_or_value {
+            None => Ok(None),
+            Some(child_ptr) => Ok(Some(NodeRef {
+                ptr: child_ptr,
+                phantom: self.phantom,
+            })),
+        }
+    }
+    // Like find_child_or_restart(), via find_next_child(); also returns the child's key byte.
+    pub(crate) fn find_next_child_or_restart(
+        &self,
+        min_key_byte: u8,
+    ) -> Result<Option<(u8, NodeRef<'e, V>)>, ConcurrentUpdateError> {
+        let child_or_value = self.ptr.find_next_child(min_key_byte);
+        self.ptr.lockword().check_or_restart(self.version)?;
+
+        match child_or_value {
+            None => Ok(None),
+            Some((k, child_ptr)) => Ok(Some((
+                k,
+                NodeRef {
+                    ptr: child_ptr,
+                    phantom: self.phantom,
+                },
+            ))),
+        }
+    }
+    // Returns a raw pointer to the leaf value, after validating the version.
+    pub(crate) fn get_leaf_value_ptr(&self) -> Result<*const V, ConcurrentUpdateError> {
+        let result = self.ptr.get_leaf_value();
+        self.ptr.lockword().check_or_restart(self.version)?;
+
+        // Extend the lifetime.
+        let result = std::ptr::from_ref(result);
+
+        Ok(result)
+    }
+    // Consumes the read lock; fails if the lock word changed since the read lock was taken.
+    pub(crate) fn upgrade_to_write_lock_or_restart(
+        self,
+    ) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
+        self.ptr
+            .lockword()
+            .upgrade_to_write_lock_or_restart(self.version)?;
+
+        Ok(WriteLockedNodeRef {
+            ptr: self.ptr,
+            phantom: self.phantom,
+        })
+    }
+    // Ends the read, validating the version one last time.
+    pub(crate) fn read_unlock_or_restart(self) -> Result<(), ConcurrentUpdateError> {
+        self.ptr.lockword().check_or_restart(self.version)?;
+        Ok(())
+    }
+    // Validates the version without giving up the read lock.
+    pub(crate) fn check_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
+        self.ptr.lockword().check_or_restart(self.version)?;
+        Ok(())
+    }
+}
+
+/// A reference to a node that is write-locked. The lock is released by write_unlock() or
+/// write_unlock_obsolete(), or automatically when the reference is dropped.
+pub struct WriteLockedNodeRef<'e, V> {
+    ptr: NodePtr<V>,
+    phantom: PhantomData<&'e EpochPin<'e>>,
+}
+
+impl<'e, V: Value> WriteLockedNodeRef<'e, V> {
+    pub(crate) fn can_shrink(&self) -> bool {
+        self.ptr.can_shrink()
+    }
+
+    pub(crate) fn num_children(&self) -> usize {
+        self.ptr.num_children()
+    }
+    // Explicit unlock. Nulling 'ptr' keeps the Drop impl from unlocking a second time.
+    pub(crate) fn write_unlock(mut self) {
+        self.ptr.lockword().write_unlock();
+        self.ptr = NodePtr::null();
+    }
+    // Unlocks and marks the node obsolete; 'ptr' is nulled for the same reason.
+    pub(crate) fn write_unlock_obsolete(mut self) {
+        self.ptr.lockword().write_unlock_obsolete();
+        self.ptr = NodePtr::null();
+    }
+
+    pub(crate) fn get_prefix(&self) -> &[u8] {
+        self.ptr.get_prefix()
+    }
+
+    pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
+        self.ptr.truncate_prefix(new_prefix_len)
+    }
+
+    pub(crate) fn prepend_prefix(&mut self, prefix: &[u8], prefix_byte: u8) {
+        self.ptr.prepend_prefix(prefix, prefix_byte)
+    }
+
+    pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
+        self.ptr.insert_child(key_byte, child)
+    }
+
+    pub(crate) fn get_leaf_value_mut(&mut self) -> &mut V {
+        self.ptr.get_leaf_value_mut()
+    }
+    // Wraps the node returned by ptr.grow() in a NewNodeRef; this node itself stays locked.
+    pub(crate) fn grow<'a, A>(
+        &self,
+        allocator: &'a A,
+    ) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+    where
+        A: ArtAllocator<V>,
+    {
+        let new_node = self.ptr.grow(allocator)?;
+        Ok(NewNodeRef {
+            ptr: new_node,
+            allocator,
+            extra_nodes: Vec::new(),
+        })
+    }
+    // Same as grow(), but with ptr.shrink().
+    pub(crate) fn shrink<'a, A>(
+        &self,
+        allocator: &'a A,
+    ) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+    where
+        A: ArtAllocator<V>,
+    {
+        let new_node = self.ptr.shrink(allocator)?;
+        Ok(NewNodeRef {
+            ptr: new_node,
+            allocator,
+            extra_nodes: Vec::new(),
+        })
+    }
+
+    pub(crate) fn as_ptr(&self) -> NodePtr<V> {
+        self.ptr
+    }
+
+    pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
+        self.ptr.replace_child(key_byte, replacement);
+    }
+
+    pub(crate) fn delete_child(&mut self, key_byte: u8) {
+        self.ptr.delete_child(key_byte);
+    }
+    // Returns the only child and its key byte. Asserts that there is exactly one child.
+    pub(crate) fn find_remaining_child(&self) -> (u8, NodeRef<'e, V>) {
+        assert_eq!(self.num_children(), 1);
+        let child_or_value = self.ptr.find_next_child(0);
+
+        match child_or_value {
+            None => panic!("could not find only child in node"),
+            Some((k, child_ptr)) => (
+                k,
+                NodeRef {
+                    ptr: child_ptr,
+                    phantom: self.phantom,
+                },
+            ),
+        }
+    }
+}
+
+impl<'e, V> Drop for WriteLockedNodeRef<'e, V> {
+    fn drop(&mut self) {
+        if !self.ptr.is_null() { // null means one of the explicit unlock functions already ran
+            self.ptr.lockword().write_unlock();
+        }
+    }
+}
+
+pub(crate) struct NewNodeRef<'a, V, A>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    ptr: NodePtr<V>,
+    allocator: &'a A,
+    // Nodes linked in with insert_new_child(); deallocated together with 'ptr' on drop.
+    extra_nodes: Vec<NodePtr<V>>,
+}
+
+impl<'a, V, A> NewNodeRef<'a, V, A>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    /// Links an existing, write-locked node in. It is not tracked for deallocation.
+    pub(crate) fn insert_old_child(&mut self, key_byte: u8, child: &WriteLockedNodeRef<V>) {
+        self.ptr.insert_child(key_byte, child.as_ptr())
+    }
+    /// Releases ownership: the node is no longer deallocated when this reference is dropped.
+    pub(crate) fn into_ptr(mut self) -> NodePtr<V> {
+        std::mem::replace(&mut self.ptr, NodePtr::null())
+    }
+    /// Links a new node in and adopts it, together with the nodes it already owns.
+    pub(crate) fn insert_new_child(&mut self, key_byte: u8, mut child: NewNodeRef<'a, V, A>) {
+        self.extra_nodes.append(&mut child.extra_nodes); // into_ptr() would forget (leak) them
+        let child_ptr = child.into_ptr();
+        self.ptr.insert_child(key_byte, child_ptr);
+        self.extra_nodes.push(child_ptr);
+    }
+}
+
+impl<'a, V, A> Drop for NewNodeRef<'a, V, A>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    /// This drop implementation deallocates the newly allocated node, if into_ptr() was not called.
+    fn drop(&mut self) {
+        if !self.ptr.is_null() {
+            self.ptr.deallocate(self.allocator);
+            for p in self.extra_nodes.iter() { // nodes recorded by insert_new_child()
+                p.deallocate(self.allocator);
+            }
+        }
+    }
+}
+
+// Wraps the node created by node_ptr::new_internal(). Unless into_ptr() is called, the node
+// is deallocated again when the returned reference is dropped.
+pub(crate) fn new_internal<'a, V, A>(
+    prefix: &[u8],
+    allocator: &'a A,
+) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    let ptr = node_ptr::new_internal(prefix, allocator)?;
+    let extra_nodes = Vec::new();
+    Ok(NewNodeRef { ptr, allocator, extra_nodes })
+}
+
+// Wraps the node created by node_ptr::new_leaf(). Unless into_ptr() is called, the node
+// is deallocated again when the returned reference is dropped.
+pub(crate) fn new_leaf<'a, V, A>(
+    prefix: &[u8],
+    value: V,
+    allocator: &'a A,
+) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    let ptr = node_ptr::new_leaf(prefix, value, allocator)?;
+    let extra_nodes = Vec::new();
+    Ok(NewNodeRef { ptr, allocator, extra_nodes })
+}
--- a/libs/neonart/src/allocator.rs
+++ b/libs/neonart/src/allocator.rs
@@ -0,0 +1,156 @@
+pub mod block;
+mod multislab;
+mod slab;
+pub mod r#static;
+
+use std::alloc::Layout;
+use std::marker::PhantomData;
+use std::mem::MaybeUninit;
+use std::sync::atomic::Ordering;
+
+use crate::allocator::multislab::MultiSlabAllocator;
+use crate::allocator::r#static::alloc_from_slice;
+
+use spin;
+
+use crate::Tree;
+pub use crate::algorithm::node_ptr::{
+    NodeInternal4, NodeInternal16, NodeInternal48, NodeInternal256, NodeLeaf,
+};
+
+#[derive(Debug)]
+pub struct OutOfMemoryError();
+
+pub trait ArtAllocator<V: crate::Value> {
+    fn alloc_tree(&self) -> *mut Tree<V>;
+    // One allocation function per node type. NOTE(review): how failure is reported — confirm.
+    fn alloc_node_internal4(&self) -> *mut NodeInternal4<V>;
+    fn alloc_node_internal16(&self) -> *mut NodeInternal16<V>;
+    fn alloc_node_internal48(&self) -> *mut NodeInternal48<V>;
+    fn alloc_node_internal256(&self) -> *mut NodeInternal256<V>;
+    fn alloc_node_leaf(&self) -> *mut NodeLeaf<V>;
+    // The matching deallocation functions, again one per node type.
+    fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>);
+    fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>);
+    fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>);
+    fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>);
+    fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>);
+}
+
+pub struct ArtMultiSlabAllocator<'t, V>
+where
+    V: crate::Value,
+{
+    tree_area: spin::Mutex<Option<&'t mut MaybeUninit<Tree<V>>>>, // taken once by alloc_tree()
+    // Five slabs, one per entry of LAYOUTS.
+    pub(crate) inner: MultiSlabAllocator<'t, 5>,
+
+    phantom_val: PhantomData<V>,
+}
+
+impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
+    const LAYOUTS: [Layout; 5] = [
+        Layout::new::<NodeInternal4<V>>(),
+        Layout::new::<NodeInternal16<V>>(),
+        Layout::new::<NodeInternal48<V>>(),
+        Layout::new::<NodeInternal256<V>>(),
+        Layout::new::<NodeLeaf<V>>(),
+    ];
+    // Carves the allocator itself, then the tree, out of 'area'; the rest goes to the slabs.
+    pub fn new(area: &'t mut [MaybeUninit<u8>]) -> &'t mut ArtMultiSlabAllocator<'t, V> {
+        let (allocator_area, remain) = alloc_from_slice::<ArtMultiSlabAllocator<V>>(area);
+        let (tree_area, remain) = alloc_from_slice::<Tree<V>>(remain);
+
+        allocator_area.write(ArtMultiSlabAllocator {
+            tree_area: spin::Mutex::new(Some(tree_area)),
+            inner: MultiSlabAllocator::new(remain, &Self::LAYOUTS),
+            phantom_val: PhantomData,
+        })
+    }
+}
+
+impl<'t, V: crate::Value> ArtAllocator<V> for ArtMultiSlabAllocator<'t, V> {
+    fn alloc_tree(&self) -> *mut Tree<V> {
+        let mut t = self.tree_area.lock();
+        if let Some(tree_area) = t.take() { // take() leaves None, so this succeeds only once
+            return tree_area.as_mut_ptr().cast();
+        }
+        panic!("cannot allocate more than one tree");
+    }
+    // Each node type uses the slab whose index is the position of its layout in LAYOUTS.
+    fn alloc_node_internal4(&self) -> *mut NodeInternal4<V> {
+        self.inner.alloc_slab(0).cast()
+    }
+    fn alloc_node_internal16(&self) -> *mut NodeInternal16<V> {
+        self.inner.alloc_slab(1).cast()
+    }
+    fn alloc_node_internal48(&self) -> *mut NodeInternal48<V> {
+        self.inner.alloc_slab(2).cast()
+    }
+    fn alloc_node_internal256(&self) -> *mut NodeInternal256<V> {
+        self.inner.alloc_slab(3).cast()
+    }
+    fn alloc_node_leaf(&self) -> *mut NodeLeaf<V> {
+        self.inner.alloc_slab(4).cast()
+    }
+    // Deallocation hands the pointer back to the slab with the same index.
+    fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>) {
+        self.inner.dealloc_slab(0, ptr.cast())
+    }
+
+    fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>) {
+        self.inner.dealloc_slab(1, ptr.cast())
+    }
+    fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>) {
+        self.inner.dealloc_slab(2, ptr.cast())
+    }
+    fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>) {
+        self.inner.dealloc_slab(3, ptr.cast())
+    }
+    fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>) {
+        self.inner.dealloc_slab(4, ptr.cast())
+    }
+}
+
+impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
+    /// Snapshot of each slab's num_allocated and num_blocks counters. Every counter is read
+    /// with its own relaxed load, so the values need not be mutually consistent.
+    pub(crate) fn get_statistics(&self) -> ArtMultiSlabStats {
+        // Slab indexes follow LAYOUTS: 0..=3 are the internal node sizes, 4 is the leaf.
+        let allocated = |slab: usize| {
+            let desc = &self.inner.slab_descs[slab];
+            desc.num_allocated.load(Ordering::Relaxed)
+        };
+        let blocks = |slab: usize| {
+            let desc = &self.inner.slab_descs[slab];
+            desc.num_blocks.load(Ordering::Relaxed)
+        };
+        ArtMultiSlabStats {
+            num_internal4: allocated(0),
+            num_internal16: allocated(1),
+            num_internal48: allocated(2),
+            num_internal256: allocated(3),
+            num_leaf: allocated(4),
+            num_blocks_internal4: blocks(0),
+            num_blocks_internal16: blocks(1),
+            num_blocks_internal48: blocks(2),
+            num_blocks_internal256: blocks(3),
+            num_blocks_leaf: blocks(4),
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct ArtMultiSlabStats {
+    pub num_internal4: u64, // the num_* fields are the slabs' num_allocated counters
+    pub num_internal16: u64,
+    pub num_internal48: u64,
+    pub num_internal256: u64,
+    pub num_leaf: u64,
+    // The slabs' num_blocks counters, in the same order.
+    pub num_blocks_internal4: u64,
+    pub num_blocks_internal16: u64,
+    pub num_blocks_internal48: u64,
+    pub num_blocks_internal256: u64,
+    pub num_blocks_leaf: u64,
+}
--- a/libs/neonart/src/allocator/block.rs
+++ b/libs/neonart/src/allocator/block.rs
@@ -0,0 +1,191 @@
+//! Simple allocator of fixed-size blocks
+
+use std::mem::MaybeUninit;
+use std::sync::atomic::{AtomicU64, Ordering};
+
+use spin;
+
+pub const BLOCK_SIZE: usize = 16 * 1024;
+
+const INVALID_BLOCK: u64 = u64::MAX;
+
+pub(crate) struct BlockAllocator<'t> {
+    blocks_ptr: &'t [MaybeUninit<u8>], // the BLOCK_SIZE-aligned part of the area given to new()
+    num_blocks: u64,
+    num_initialized: AtomicU64, // number of blocks handed out so far from the never-used part
+    // Block number of the first freelist block, or INVALID_BLOCK if the freelist is empty.
+    freelist_head: spin::Mutex<u64>,
+}
+
+struct FreeListBlock {
+    inner: spin::Mutex<FreeListBlockInner>,
+}
+// Contents of a freelist block: a link to the next freelist block and a stack of free blocks.
+struct FreeListBlockInner {
+    next: u64, // block number of the next freelist block, or INVALID_BLOCK
+
+    num_free_blocks: u64, // number of valid entries at the start of 'free_blocks'
+    free_blocks: [u64; 100], // FIXME: fill the rest of the block
+}
+
+impl<'t> BlockAllocator<'t> {
+    pub(crate) fn new(area: &'t mut [MaybeUninit<u8>]) -> Self {
+        // Use all the space for the blocks
+        let padding = area.as_ptr().align_offset(BLOCK_SIZE);
+        let remain = &mut area[padding..];
+        // Everything before the first BLOCK_SIZE-aligned address is skipped.
+        let num_blocks = (remain.len() / BLOCK_SIZE) as u64;
+
+        BlockAllocator {
+            blocks_ptr: remain,
+            num_blocks,
+            num_initialized: AtomicU64::new(0),
+            freelist_head: spin::Mutex::new(INVALID_BLOCK),
+        }
+    }
+
+    /// safety: you must hold a lock on the pointer to this block, otherwise it might get
+    /// reused for another kind of block
+    fn read_freelist_block(&self, blkno: u64) -> &FreeListBlock {
+        let ptr: *const FreeListBlock = self.get_block_ptr(blkno).cast();
+        unsafe { ptr.as_ref().unwrap() }
+    }
+    // Address of block 'blkno'. Panics if 'blkno' is out of range.
+    fn get_block_ptr(&self, blkno: u64) -> *mut u8 {
+        assert!(blkno < self.num_blocks);
+        unsafe {
+            self.blocks_ptr
+                .as_ptr()
+                .byte_offset(blkno as isize * BLOCK_SIZE as isize)
+        }
+        .cast_mut()
+        .cast()
+    }
+
+    #[allow(clippy::mut_from_ref)]
+    pub(crate) fn alloc_block(&self) -> &mut [MaybeUninit<u8>] {
+        // FIXME: handle OOM
+        let blkno = self.alloc_block_internal();
+        if blkno == INVALID_BLOCK {
+            panic!("out of memory");
+        }
+
+        let ptr: *mut MaybeUninit<u8> = self.get_block_ptr(blkno).cast();
+        unsafe { std::slice::from_raw_parts_mut(ptr, BLOCK_SIZE) }
+    }
+    // Returns a block number, or INVALID_BLOCK if there are no blocks left.
+    fn alloc_block_internal(&self) -> u64 {
+        //  check the free list.
+        {
+            let mut freelist_head = self.freelist_head.lock();
+            if *freelist_head != INVALID_BLOCK {
+                let freelist_block = self.read_freelist_block(*freelist_head);
+
+                // acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
+                let mut g = freelist_block.inner.lock();
+
+                if g.num_free_blocks > 0 {
+                    g.num_free_blocks -= 1;
+                    let result = g.free_blocks[g.num_free_blocks as usize];
+                    return result;
+                } else {
+                    // consume the freelist block itself
+                    let result = *freelist_head;
+                    *freelist_head = g.next;
+                    // This freelist block is now unlinked and can be repurposed
+                    drop(g);
+                    return result;
+                }
+            }
+        }
+
+        // If there are some blocks left that we've never used, pick next such block
+        let mut next_uninitialized = self.num_initialized.load(Ordering::Relaxed);
+        while next_uninitialized < self.num_blocks {
+            match self.num_initialized.compare_exchange(
+                next_uninitialized,
+                next_uninitialized + 1,
+                Ordering::Relaxed,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => {
+                    return next_uninitialized;
+                }
+                Err(old) => {
+                    next_uninitialized = old;
+                    continue;
+                }
+            }
+        }
+
+        // out of blocks
+        INVALID_BLOCK
+    }
+
+    // TODO: this is currently unused. The slab allocator never releases blocks
+    #[allow(dead_code)]
+    pub(crate) fn release_block(&self, block_ptr: *mut u8) {
+        let blockno = unsafe { block_ptr.byte_offset_from(self.blocks_ptr) / BLOCK_SIZE as isize };
+        self.release_block_internal(blockno as u64);
+    }
+
+    fn release_block_internal(&self, blockno: u64) {
+        let mut freelist_head = self.freelist_head.lock();
+        if *freelist_head != INVALID_BLOCK {
+            let freelist_block = self.read_freelist_block(*freelist_head);
+
+            // acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
+            let mut g = freelist_block.inner.lock();
+
+            let num_free_blocks = g.num_free_blocks;
+            if num_free_blocks < g.free_blocks.len() as u64 {
+                g.free_blocks[num_free_blocks as usize] = blockno;
+                g.num_free_blocks += 1;
+                return;
+            }
+        }
+        // No freelist block yet, or the head block is full: this block becomes the new head.
+        // Convert the block into a new freelist block
+        let block_ptr: *mut FreeListBlock = self.get_block_ptr(blockno).cast();
+        let init = FreeListBlock {
+            inner: spin::Mutex::new(FreeListBlockInner {
+                next: *freelist_head,
+                num_free_blocks: 0,
+                free_blocks: [INVALID_BLOCK; 100],
+            }),
+        };
+        unsafe { (*block_ptr) = init };
+        *freelist_head = blockno;
+    }
+
+    // for debugging
+    pub(crate) fn get_statistics(&self) -> BlockAllocatorStats {
+        let mut num_free_blocks = 0;
+        // Walk the freelist, locking each block before the previous lock is released.
+        let mut _prev_lock = None;
+        let head_lock = self.freelist_head.lock();
+        let mut next_blk = *head_lock;
+        let mut _head_lock = Some(head_lock);
+        while next_blk != INVALID_BLOCK {
+            let freelist_block = self.read_freelist_block(next_blk);
+            let lock = freelist_block.inner.lock();
+            num_free_blocks += lock.num_free_blocks; // NOTE(review): freelist blocks themselves are not counted
+            next_blk = lock.next;
+            _prev_lock = Some(lock); // hold the lock until we've read the next block
+            _head_lock = None;
+        }
+
+        BlockAllocatorStats {
+            num_blocks: self.num_blocks,
+            num_initialized: self.num_initialized.load(Ordering::Relaxed),
+            num_free_blocks,
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct BlockAllocatorStats {
+    pub num_blocks: u64, // total number of blocks in the area
+    pub num_initialized: u64, // value of the allocator's num_initialized counter
+    pub num_free_blocks: u64, // sum of the free-block entries recorded in the freelist blocks
+}
--- a/libs/neonart/src/allocator/multislab.rs
+++ b/libs/neonart/src/allocator/multislab.rs
@@ -0,0 +1,33 @@
+use std::alloc::Layout;
+use std::mem::MaybeUninit;
+
+use crate::allocator::block::BlockAllocator;
+use crate::allocator::slab::SlabDesc;
+
+/// A set of `N` slab allocators that all draw their blocks from one shared
+/// [`BlockAllocator`].
+pub struct MultiSlabAllocator<'t, const N: usize> {
+    /// Source of the blocks that the slabs carve chunks from.
+    pub(crate) block_allocator: BlockAllocator<'t>,
+
+    /// One slab descriptor per layout passed to `new()`, in the same order.
+    pub(crate) slab_descs: [SlabDesc; N],
+}
+
+impl<'t, const N: usize> MultiSlabAllocator<'t, N> {
+    /// Creates an allocator over `area`, with one slab per entry in `layouts`.
+    ///
+    /// Slab `i` hands out chunks described by `layouts[i]`.
+    pub(crate) fn new(
+        area: &'t mut [MaybeUninit<u8>],
+        layouts: &[Layout; N],
+    ) -> MultiSlabAllocator<'t, N> {
+        Self {
+            block_allocator: BlockAllocator::new(area),
+            slab_descs: std::array::from_fn(|idx| {
+                let layout = &layouts[idx];
+                SlabDesc::new(layout)
+            }),
+        }
+    }
+
+    /// Allocates one chunk from the slab at index `slab_idx`.
+    ///
+    /// Panics if `slab_idx` is out of range.
+    pub(crate) fn alloc_slab(&self, slab_idx: usize) -> *mut u8 {
+        let desc = &self.slab_descs[slab_idx];
+        desc.alloc_chunk(&self.block_allocator)
+    }
+
+    /// Returns the chunk `ptr` to the slab at index `slab_idx`.
+    ///
+    /// Panics if `slab_idx` is out of range.
+    pub(crate) fn dealloc_slab(&self, slab_idx: usize, ptr: *mut u8) {
+        let desc = &self.slab_descs[slab_idx];
+        desc.dealloc_chunk(ptr, &self.block_allocator)
+    }
+}
--- a/libs/neonart/src/allocator/slab.rs
+++ b/libs/neonart/src/allocator/slab.rs
@@ -0,0 +1,433 @@
+//! A slab allocator that carves out fixed-size chunks from larger blocks.
+//!
+//!
+
+use std::alloc::Layout;
+use std::mem::MaybeUninit;
+use std::ops::Deref;
+use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
+
+use spin;
+
+use super::alloc_from_slice;
+use super::block::BlockAllocator;
+
+use crate::allocator::block::BLOCK_SIZE;
+
+/// Descriptor of one slab: a pool of equally sized chunks carved out of blocks.
+pub(crate) struct SlabDesc {
+    /// Size and alignment of the chunks handed out by this slab.
+    pub(crate) layout: Layout,
+
+    /// The blocks owned by this slab, split into full and non-full lists.
+    block_lists: spin::RwLock<BlockLists>,
+
+    /// Statistics: incremented each time `alloc_chunk()` allocates a new block.
+    pub(crate) num_blocks: AtomicU64,
+    /// Statistics: incremented by `alloc_chunk()`, decremented by `dealloc_chunk()`.
+    pub(crate) num_allocated: AtomicU64,
+}
+
+// FIXME: Not sure if SlabDesc is really Sync or Send. It probably is when it's empty, but
+// 'block_lists' contains pointers when it's not empty. In the current use as part of the
+// ART tree, SlabDescs are only moved during initialization.
+unsafe impl Sync for SlabDesc {}
+unsafe impl Send for SlabDesc {}
+
+/// The two intrusive lists that together hold every block linked into a slab.
+#[derive(Default, Debug)]
+struct BlockLists {
+    /// Blocks that `alloc_chunk()` found to have no free chunks.
+    full_blocks: BlockList,
+    /// Blocks that may still have free chunks. A block that has just become full stays
+    /// here until `alloc_chunk()` notices and moves it.
+    nonfull_blocks: BlockList,
+}
+
+impl BlockLists {
+    /// Removes `elem` from whichever of the two lists currently contains it.
+    ///
+    /// The owning list only has to be identified when `elem` sits at an end of it, because
+    /// only then does the list's own `head` or `tail` pointer need fixing up. For an
+    /// interior node no list is passed on.
+    ///
+    /// # Safety
+    ///
+    /// `elem` must point to a valid block header that is linked into one of the two lists.
+    unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
+        let (at_tail, at_head) = unsafe { ((*elem).next.is_null(), (*elem).prev.is_null()) };
+
+        let owner = if at_tail {
+            let in_full = self.full_blocks.tail == elem;
+            Some(if in_full {
+                &mut self.full_blocks
+            } else {
+                &mut self.nonfull_blocks
+            })
+        } else if at_head {
+            let in_full = self.full_blocks.head == elem;
+            Some(if in_full {
+                &mut self.full_blocks
+            } else {
+                &mut self.nonfull_blocks
+            })
+        } else {
+            None
+        };
+
+        unsafe { unlink_slab_block(owner, elem) };
+    }
+}
+
+/// Unlinks `elem` from the doubly-linked list it is part of.
+///
+/// `list` is only consulted when `elem` is the first or last node, to update the list's
+/// `head` / `tail`; it may be `None` for an interior node. Panics if `list` is `None` in a
+/// case where it is needed, or if the neighbouring links are inconsistent with `elem`.
+/// The `prev` / `next` fields of `elem` itself are left untouched.
+///
+/// # Safety
+///
+/// `elem` and its non-null neighbours must point to valid block headers.
+unsafe fn unlink_slab_block(mut list: Option<&mut BlockList>, elem: *mut SlabBlockHeader) {
+    unsafe {
+        let before = (*elem).prev;
+        let after = (*elem).next;
+
+        // Fix up the backward link that pointed at `elem`.
+        if after.is_null() {
+            let owner = list.as_mut().unwrap();
+            assert_eq!(owner.tail, elem);
+            owner.tail = before;
+        } else {
+            assert_eq!((*after).prev, elem);
+            (*after).prev = before;
+        }
+
+        // Fix up the forward link that pointed at `elem`.
+        if before.is_null() {
+            let owner = list.as_mut().unwrap();
+            assert_eq!(owner.head, elem);
+            owner.head = after;
+        } else {
+            assert_eq!((*before).next, elem);
+            (*before).next = after;
+        }
+    }
+}
+
+/// Intrusive doubly-linked list of slab blocks, linked through the `prev` / `next`
+/// fields of [`SlabBlockHeader`]. Both pointers are null when the list is empty.
+#[derive(Debug)]
+struct BlockList {
+    /// First block in the list, or null.
+    head: *mut SlabBlockHeader,
+    /// Last block in the list, or null.
+    tail: *mut SlabBlockHeader,
+}
+
+impl Default for BlockList {
+    /// Returns an empty list: both `head` and `tail` are null.
+    fn default() -> Self {
+        let null = std::ptr::null_mut();
+        Self {
+            head: null,
+            tail: null,
+        }
+    }
+}
+
+impl BlockList {
+    /// Inserts `elem` at the front of the list, overwriting its `prev` and `next` links.
+    ///
+    /// # Safety
+    ///
+    /// `elem` must point to a valid block header that is not currently linked into a list.
+    unsafe fn push_head(&mut self, elem: *mut SlabBlockHeader) {
+        unsafe {
+            let old_head = self.head;
+            (*elem).next = old_head;
+            if old_head.is_null() {
+                // The list was empty, so the new node is also the tail.
+                self.tail = elem;
+            } else {
+                (*old_head).prev = elem;
+            }
+            (*elem).prev = std::ptr::null_mut();
+            self.head = elem;
+        }
+    }
+
+    /// Returns true if the list has no blocks.
+    fn is_empty(&self) -> bool {
+        self.head.is_null()
+    }
+
+    /// Removes `elem` from this list.
+    ///
+    /// # Safety
+    ///
+    /// `elem` must point to a valid block header that is linked into this list.
+    unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
+        unsafe { unlink_slab_block(Some(self), elem) }
+    }
+
+    /// Prints one line per block to stderr, from head to tail.
+    #[cfg(test)]
+    fn dump(&self) {
+        let mut cursor = self.head;
+
+        while let Some(blk) = unsafe { cursor.as_ref() } {
+            eprintln!(
+                "  blk {:?} (free {}/{})",
+                cursor,
+                blk.num_free_chunks.load(Ordering::Relaxed),
+                blk.num_chunks
+            );
+            cursor = blk.next;
+        }
+    }
+}
+
+impl SlabDesc {
+    /// Creates an empty slab for chunks of the given `layout`. No blocks are allocated
+    /// until the first call to `alloc_chunk()`.
+    pub(crate) fn new(layout: &Layout) -> SlabDesc {
+        let empty_lists = BlockLists::default();
+        Self {
+            layout: *layout,
+            block_lists: spin::RwLock::new(empty_lists),
+            num_blocks: AtomicU64::new(0),
+            num_allocated: AtomicU64::new(0),
+        }
+    }
+}
+
+/// Header stored at the start of every block owned by a slab. The chunks follow it.
+#[derive(Debug)]
+struct SlabBlockHeader {
+    /// Head of the singly-linked list of free chunks in this block; null when the block
+    /// has no free chunks.
+    free_chunks_head: spin::Mutex<*mut FreeChunk>,
+    /// Number of entries in the free chunk list. It is modified while holding the
+    /// `free_chunks_head` lock.
+    num_free_chunks: AtomicU32,
+    num_chunks: u32, // this is really a constant for a given Layout
+
+    // these fields are protected by the lock on the BlockLists
+    prev: *mut SlabBlockHeader,
+    next: *mut SlabBlockHeader,
+}
+
+/// Link node written into the memory of a chunk while the chunk is free.
+struct FreeChunk {
+    /// Next free chunk in the same block, or null at the end of the list.
+    next: *mut FreeChunk,
+}
+
+/// Holds either a read guard or a write guard of a `spin::RwLock`, so that code which
+/// upgrades its lock mode on retry can keep the guard in a single variable.
+enum ReadOrWriteGuard<'a, T> {
+    Read(spin::RwLockReadGuard<'a, T>),
+    Write(spin::RwLockWriteGuard<'a, T>),
+}
+
+impl<'a, T> Deref for ReadOrWriteGuard<'a, T> {
+    type Target = T;
+
+    /// Gives shared access to the protected value, whichever kind of guard is held.
+    fn deref(&self) -> &T {
+        match self {
+            Self::Read(guard) => &**guard,
+            Self::Write(guard) => &**guard,
+        }
+    }
+}
+
+impl SlabDesc {
+    /// Allocates one chunk and returns a pointer to it.
+    ///
+    /// The fast path takes the read lock on the block lists and pops a chunk from the
+    /// block at the head of the non-full list. If that block has no free chunks, the
+    /// attempt is restarted with the write lock, under which exhausted blocks are moved
+    /// to the full list until a block with a free chunk is found. If the non-full list is
+    /// empty, a new block is obtained from `block_allocator`, its first chunk is returned
+    /// and the block is pushed onto the non-full list.
+    ///
+    /// The contents of the returned chunk are not initialized here.
+    pub fn alloc_chunk(&self, block_allocator: &BlockAllocator) -> *mut u8 {
+        // Are there any free chunks?
+        let mut acquire_write = false;
+        'outer: loop {
+            // Each iteration of 'outer drops the previous guard and takes a fresh one.
+            let mut block_lists_guard = if acquire_write {
+                ReadOrWriteGuard::Write(self.block_lists.write())
+            } else {
+                ReadOrWriteGuard::Read(self.block_lists.read())
+            };
+            'inner: loop {
+                let block_ptr = block_lists_guard.nonfull_blocks.head;
+                if block_ptr.is_null() {
+                    // No candidate blocks left: fall through to allocating a new block.
+                    break 'outer;
+                }
+                unsafe {
+                    // The per-block mutex serializes pops and pushes on the chunk list.
+                    let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
+                    if !(*free_chunks_head).is_null() {
+                        let result = *free_chunks_head;
+                        (*free_chunks_head) = (*result).next;
+                        let _old = (*block_ptr).num_free_chunks.fetch_sub(1, Ordering::Relaxed);
+
+                        self.num_allocated.fetch_add(1, Ordering::Relaxed);
+                        return result.cast();
+                    }
+                }
+
+                // The block at the head of the list was full. Grab write lock and retry
+                match block_lists_guard {
+                    ReadOrWriteGuard::Read(_) => {
+                        acquire_write = true;
+                        continue 'outer;
+                    }
+                    ReadOrWriteGuard::Write(ref mut g) => {
+                        // move the node to the list of full blocks
+                        unsafe {
+                            g.nonfull_blocks.unlink(block_ptr);
+                            g.full_blocks.push_head(block_ptr);
+                        };
+                        continue 'inner;
+                    }
+                }
+            }
+        }
+
+        // no free chunks. Allocate a new block (and the chunk from that)
+        let (new_block, new_chunk) = self.alloc_block_and_chunk(block_allocator);
+        self.num_blocks.fetch_add(1, Ordering::Relaxed);
+
+        // Add the block to the list in the SlabDesc
+        unsafe {
+            let mut block_lists_guard = self.block_lists.write();
+            block_lists_guard.nonfull_blocks.push_head(new_block);
+        }
+        self.num_allocated.fetch_add(1, Ordering::Relaxed);
+        new_chunk
+    }
+
+    /// Returns a chunk to the slab.
+    ///
+    /// The chunk is pushed onto the free chunk list of the block that contains it. If
+    /// this makes the block go from zero to one free chunk, the block is re-linked at the
+    /// head of the non-full list. Blocks that become completely empty are not yet
+    /// released back to the block allocator, which is why `_block_allocator` is unused.
+    ///
+    /// `chunk_ptr` must be a pointer previously returned by `alloc_chunk()` on this slab.
+    pub fn dealloc_chunk(&self, chunk_ptr: *mut u8, _block_allocator: &BlockAllocator) {
+        // Find the block it belongs to. You can find the block from the address. (And knowing the
+        // layout, you could calculate the chunk number too.)
+        // NOTE(review): rounding the address down relies on every block starting at an
+        // address that is a multiple of BLOCK_SIZE -- confirm in BlockAllocator.
+        let block_ptr: *mut SlabBlockHeader = {
+            let block_addr = (chunk_ptr.addr() / BLOCK_SIZE) * BLOCK_SIZE;
+            chunk_ptr.with_addr(block_addr).cast()
+        };
+        let chunk_ptr: *mut FreeChunk = chunk_ptr.cast();
+
+        // Mark the chunk as free in 'freechunks' list
+        let num_chunks;
+        let num_free_chunks;
+        unsafe {
+            let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
+            (*chunk_ptr).next = *free_chunks_head;
+            *free_chunks_head = chunk_ptr;
+
+            // fetch_add returns the previous value, hence the + 1
+            num_free_chunks = (*block_ptr).num_free_chunks.fetch_add(1, Ordering::Relaxed) + 1;
+            num_chunks = (*block_ptr).num_chunks;
+        }
+
+        if num_free_chunks == 1 {
+            // If the block was full previously, add it to the nonfull blocks list. Note that
+            // we're not holding the lock anymore, so it can immediately become full again.
+            // That's harmless, it will be moved back to the full list again when a call
+            // to alloc_chunk() sees it.
+            let mut block_lists = self.block_lists.write();
+            unsafe {
+                // The block may be in either list at this point, so use the unlink
+                // variant that works out which list it is in.
+                block_lists.unlink(block_ptr);
+                block_lists.nonfull_blocks.push_head(block_ptr);
+            };
+        } else if num_free_chunks == num_chunks {
+            // If the block became completely empty, move it to the free list
+            // TODO
+            // FIXME: we're still holding the spinlock. It's not exactly safe to return it to
+            // the free blocks list, is it? Defer it as garbage to wait out concurrent updates?
+            // NOTE(review): as written, the free_chunks_head guard has already been dropped
+            // at the end of the unsafe block above, so the FIXME looks outdated -- confirm.
+            //block_allocator.release_block()
+        }
+
+        // update stats
+        self.num_allocated.fetch_sub(1, Ordering::Relaxed);
+    }
+
+    /// Allocates a fresh block, carves it into chunks and reserves the first chunk.
+    ///
+    /// Returns the initialized block header together with a pointer to the reserved
+    /// chunk. All remaining chunks are threaded onto the block's free chunk list. The
+    /// block is not linked into `block_lists`; that is left to the caller.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the slab's chunk size is smaller than a [`FreeChunk`] link, or if not
+    /// even one chunk fits into a block after the header. Either would otherwise corrupt
+    /// memory while the free list is built.
+    fn alloc_block_and_chunk(
+        &self,
+        block_allocator: &BlockAllocator,
+    ) -> (*mut SlabBlockHeader, *mut u8) {
+        let chunk_size = self.layout.size();
+
+        // A free chunk stores a `FreeChunk` in place. With smaller chunks the links of
+        // neighbouring chunks would overlap. This also rules out a zero chunk size, which
+        // would divide by zero below.
+        assert!(
+            chunk_size >= std::mem::size_of::<FreeChunk>(),
+            "slab chunk size {chunk_size} is too small to hold a free list link"
+        );
+
+        // fixme: handle OOM
+        let block_slice: &mut [MaybeUninit<u8>] = block_allocator.alloc_block();
+        let (block_header, remain) = alloc_from_slice::<SlabBlockHeader>(block_slice);
+
+        let padding = remain.as_ptr().align_offset(self.layout.align());
+
+        // saturating_sub: if the padding exceeds what is left, there is room for 0 chunks
+        let num_chunks = remain.len().saturating_sub(padding) / chunk_size;
+        // Without this check `num_chunks - 1` below would wrap around in release builds
+        // and the loop would write far beyond the block.
+        assert!(
+            num_chunks > 0,
+            "slab chunk size {chunk_size} does not fit in a block"
+        );
+
+        let first_chunk_ptr: *mut FreeChunk = remain[padding..].as_mut_ptr().cast();
+
+        unsafe {
+            // Link every chunk to its successor; the last one terminates the list.
+            let mut chunk_ptr = first_chunk_ptr;
+            for _ in 0..num_chunks - 1 {
+                let next_chunk_ptr = chunk_ptr.byte_add(chunk_size);
+                (*chunk_ptr).next = next_chunk_ptr;
+                chunk_ptr = next_chunk_ptr;
+            }
+            (*chunk_ptr).next = std::ptr::null_mut();
+
+            // The first chunk is handed to the caller, so the free list starts at the
+            // second chunk and the free count is one less than the total.
+            let result_chunk = first_chunk_ptr;
+
+            let block_header = block_header.write(SlabBlockHeader {
+                free_chunks_head: spin::Mutex::new((*first_chunk_ptr).next),
+                prev: std::ptr::null_mut(),
+                next: std::ptr::null_mut(),
+                num_chunks: num_chunks as u32,
+                num_free_chunks: AtomicU32::new(num_chunks as u32 - 1),
+            });
+
+            (block_header, result_chunk.cast())
+        }
+    }
+
+    /// Prints the slab's counters and the contents of both block lists to stderr.
+    #[cfg(test)]
+    fn dump(&self) {
+        let blocks = self.num_blocks.load(Ordering::Relaxed);
+        let allocated = self.num_allocated.load(Ordering::Relaxed);
+        eprintln!(
+            "slab dump ({} blocks, {} allocated chunks)",
+            blocks, allocated
+        );
+
+        // Hold the read lock while walking the lists.
+        let guard = self.block_lists.read();
+        let sections = [
+            ("nonfull blocks:", &guard.nonfull_blocks),
+            ("full blocks:", &guard.full_blocks),
+        ];
+        for (title, list) in sections {
+            eprintln!("{}", title);
+            list.dump();
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use rand::Rng;
+    use rand_distr::Zipf;
+
+    /// Test payload. The padding makes each object a bit larger than a quarter of a
+    /// block, so that only a few objects fit in one block.
+    struct TestObject {
+        val: usize,
+        _dummy: [u8; BLOCK_SIZE / 4],
+    }
+
+    /// A slab of `TestObject`s bundled with the block allocator backing it.
+    struct TestObjectSlab<'a>(SlabDesc, BlockAllocator<'a>);
+    impl<'a> TestObjectSlab<'a> {
+        fn new(block_allocator: BlockAllocator) -> TestObjectSlab {
+            TestObjectSlab(SlabDesc::new(&Layout::new::<TestObject>()), block_allocator)
+        }
+
+        /// Allocates an object and sets its `val` field; `_dummy` is left uninitialized.
+        fn alloc(&self, val: usize) -> *mut TestObject {
+            let obj: *mut TestObject = self.0.alloc_chunk(&self.1).cast();
+            unsafe { (*obj).val = val };
+            obj
+        }
+
+        fn dealloc(&self, obj: *mut TestObject) {
+            self.0.dealloc_chunk(obj.cast(), &self.1)
+        }
+    }
+
+    /// Allocates a handful of objects, then randomly frees and re-allocates them while
+    /// checking that each live object still holds the value it was created with.
+    #[test]
+    fn test_slab_alloc() {
+        const MEM_SIZE: usize = 100000000;
+        let mut area = Box::new_uninit_slice(MEM_SIZE);
+        let block_allocator = BlockAllocator::new(&mut area);
+
+        let slab = TestObjectSlab::new(block_allocator);
+
+        let mut all: Vec<*mut TestObject> = Vec::new();
+        for i in 0..11 {
+            all.push(slab.alloc(i));
+        }
+        #[allow(clippy::needless_range_loop)]
+        for i in 0..11 {
+            assert!(unsafe { (*all[i]).val == i });
+        }
+
+        // Toggle randomly chosen entries between allocated and freed. A null pointer in
+        // `all` marks an entry that is currently freed.
+        // NOTE(review): presumably Zipf::new(10.0, ..) samples values in 1..=10, so index 0
+        // is never toggled -- confirm against the rand_distr docs.
+        let distribution = Zipf::new(10.0, 1.1).unwrap();
+        let mut rng = rand::rng();
+        for _ in 0..100000 {
+            // NOTE(review): this dumps the whole slab to stderr on every one of the
+            // 100000 iterations; consider dumping only on failure.
+            slab.0.dump();
+            let idx = rng.sample(distribution) as usize;
+            let ptr: *mut TestObject = all[idx];
+            if !ptr.is_null() {
+                assert_eq!(unsafe { (*ptr).val }, idx);
+                slab.dealloc(ptr);
+                all[idx] = std::ptr::null_mut();
+            } else {
+                all[idx] = slab.alloc(idx);
+            }
+        }
+    }
+
+    /// Creates a heap-allocated, unlinked block header for the list tests. `i` is stored
+    /// in `num_chunks` and serves only to tell the headers apart.
+    fn new_test_blk(i: u32) -> *mut SlabBlockHeader {
+        Box::into_raw(Box::new(SlabBlockHeader {
+            free_chunks_head: spin::Mutex::new(std::ptr::null_mut()),
+            num_free_chunks: AtomicU32::new(0),
+            num_chunks: i,
+            prev: std::ptr::null_mut(),
+            next: std::ptr::null_mut(),
+        }))
+    }
+
+    /// Exercises `BlockList::push_head` and `BlockList::unlink` with one and two nodes.
+    #[test]
+    fn test_block_linked_list() {
+        // note: these are leaked, but that's OK for tests
+        let a = new_test_blk(0);
+        let b = new_test_blk(1);
+
+        let mut list = BlockList::default();
+        assert!(list.is_empty());
+
+        // single element: push and unlink
+        unsafe {
+            list.push_head(a);
+            assert!(!list.is_empty());
+            list.unlink(a);
+        }
+        assert!(list.is_empty());
+
+        // two elements: check the links, then unlink the head followed by the tail
+        unsafe {
+            list.push_head(b);
+            list.push_head(a);
+            assert_eq!(list.head, a);
+            assert_eq!((*a).next, b);
+            assert_eq!((*b).prev, a);
+            assert_eq!(list.tail, b);
+
+            list.unlink(a);
+            list.unlink(b);
+            assert!(list.is_empty());
+        }
+    }
+}
--- a/libs/neonart/src/allocator/static.rs
+++ b/libs/neonart/src/allocator/static.rs
@@ -0,0 +1,44 @@
+use std::mem::MaybeUninit;
+
+/// Carves space for one `T` off the front of `area`.
+///
+/// Skips as many leading bytes as needed to satisfy the alignment of `T`. Returns the
+/// (uninitialized) slot for the value and the remainder of `area` that follows it.
+///
+/// # Panics
+///
+/// Panics with "out of memory" if `area` is too small to hold the padding plus a `T`.
+pub fn alloc_from_slice<T>(
+    area: &mut [MaybeUninit<u8>],
+) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
+    let layout = std::alloc::Layout::new::<T>();
+
+    let area_start = area.as_mut_ptr();
+
+    // pad to satisfy alignment requirements
+    let padding = area_start.align_offset(layout.align());
+    // `align_offset` is allowed to return usize::MAX, so the sum must not be computed
+    // with a plain `+`.
+    let fits = padding
+        .checked_add(layout.size())
+        .is_some_and(|needed| needed <= area.len());
+    if !fits {
+        panic!("out of memory");
+    }
+    let area = &mut area[padding..];
+    let (result_area, remain) = area.split_at_mut(layout.size());
+
+    let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
+    // SAFETY: `result_ptr` is non-null, aligned for `T` thanks to the padding, and points
+    // to `size_of::<T>()` bytes that are exclusively borrowed through `result_area`.
+    let result = unsafe { &mut *result_ptr };
+
+    (result, remain)
+}
+
+/// Carves space for an array of `len` values of type `T` off the front of `area`.
+///
+/// Skips as many leading bytes as needed to satisfy the alignment of `T`. Returns the
+/// (uninitialized) array and the remainder of `area` that follows it.
+///
+/// # Panics
+///
+/// Panics with "out of memory" if the padding plus `len` elements do not fit in `area`,
+/// including when the requested size in bytes overflows `usize`.
+pub fn alloc_array_from_slice<T>(
+    area: &mut [MaybeUninit<u8>],
+    len: usize,
+) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
+    let layout = std::alloc::Layout::new::<T>();
+
+    let area_start = area.as_mut_ptr();
+
+    // pad to satisfy alignment requirements
+    let padding = area_start.align_offset(layout.align());
+
+    // Use checked arithmetic: a wrapped-around product could pass the bounds check and
+    // yield a slice reaching far beyond `area`.
+    let total_size = match layout.size().checked_mul(len) {
+        Some(bytes)
+            if padding
+                .checked_add(bytes)
+                .is_some_and(|needed| needed <= area.len()) =>
+        {
+            bytes
+        }
+        _ => panic!("out of memory"),
+    };
+    let area = &mut area[padding..];
+    let (result_area, remain) = area.split_at_mut(total_size);
+
+    let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
+    // SAFETY: `result_ptr` is non-null, aligned for `T` thanks to the padding, and
+    // `result_area` exclusively covers exactly `len * size_of::<T>()` bytes.
+    let result = unsafe { std::slice::from_raw_parts_mut(result_ptr, len) };
+
+    (result, remain)
+}
--- a/libs/neonart/src/epoch.rs
+++ b/libs/neonart/src/epoch.rs
@@ -0,0 +1,142 @@
+//! This is similar to crossbeam_epoch crate, but works in shared memory
+
+use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
+
+use crossbeam_utils::CachePadded;
+
+/// Number of participant slots in [`EpochShared`]. Each active pin occupies one slot.
+const NUM_SLOTS: usize = 1000;
+
+/// This is the struct that is stored in shmem
+///
+/// Each entry of `participants` packs two things into one `u64`:
+///
+/// bit 0: is it pinned or not?
+/// rest of the bits are the epoch counter.
+pub struct EpochShared {
+    /// The current epoch. Starts at 2 and advances in steps of 2, so bit 0 is never set.
+    global_epoch: AtomicU64,
+    /// The slots, in the format described above.
+    participants: [CachePadded<AtomicU64>; NUM_SLOTS],
+
+    /// Taken with `try_lock` by `broadcast()`, so that at most one broadcast runs at a time.
+    broadcast_lock: spin::Mutex<()>,
+}
+
+impl EpochShared {
+    /// Creates the shared state with the global epoch and every slot at epoch 2, unpinned.
+    pub fn new() -> EpochShared {
+        EpochShared {
+            global_epoch: AtomicU64::new(2),
+            participants: [const { CachePadded::new(AtomicU64::new(2)) }; NUM_SLOTS],
+            broadcast_lock: spin::Mutex::new(()),
+        }
+    }
+
+    /// Creates a handle through which a thread or process can pin the epoch.
+    pub fn register(&self) -> LocalHandle {
+        LocalHandle {
+            global: self,
+            last_slot: AtomicUsize::new(0), // todo: choose more intelligently
+        }
+    }
+
+    /// Releases `slot`. The slot is overwritten with the current global epoch, which
+    /// clears the pinned bit and brings the slot up to date in one store.
+    fn release_pin(&self, slot: usize, _epoch: u64) {
+        let global_epoch = self.global_epoch.load(Ordering::Relaxed);
+        self.participants[slot].store(global_epoch, Ordering::Relaxed);
+    }
+
+    /// Claims a free slot by setting its pinned bit, starting the search at `slot_hint`
+    /// and wrapping around. Spins until a free slot is found.
+    ///
+    /// Returns the slot index and the epoch that was stored in the slot while it was
+    /// free. That epoch can be older than the current global epoch.
+    fn pin_internal(&self, slot_hint: usize) -> (usize, u64) {
+        // pick a slot
+        let mut slot = slot_hint;
+        let epoch = loop {
+            let old = self.participants[slot].fetch_or(1, Ordering::Relaxed);
+            if old & 1 == 0 {
+                // Got this slot
+                break old;
+            }
+
+            // the slot was busy by another thread / process. try a different slot
+            slot += 1;
+            if slot == NUM_SLOTS {
+                slot = 0;
+            }
+            continue;
+        };
+        (slot, epoch)
+    }
+
+    /// Advances the global epoch by one step and returns the new epoch.
+    pub(crate) fn advance(&self) -> u64 {
+        // Advance the global epoch
+        let old_epoch = self.global_epoch.fetch_add(2, Ordering::Relaxed);
+        // Anyone that release their pin after this will update their slot.
+        old_epoch + 2
+    }
+
+    /// Brings all free (unpinned) slots up to the current global epoch.
+    ///
+    /// Does nothing if another broadcast is already in progress. Because a broadcast can
+    /// be skipped that way, a free slot may lag behind by more than one step, so every
+    /// free slot that is older than the current epoch is updated, not just those exactly
+    /// one step behind. Otherwise such a slot would never match again and would hold back
+    /// `get_oldest()` until somebody happened to pin and release it.
+    pub(crate) fn broadcast(&self) {
+        let Some(_guard) = self.broadcast_lock.try_lock() else {
+            return;
+        };
+
+        let epoch = self.global_epoch.load(Ordering::Relaxed);
+
+        // Update all free slots.
+        for slot in self.participants.iter() {
+            let mut current = slot.load(Ordering::Relaxed);
+            loop {
+                let is_pinned = current & 1 != 0;
+                let behind = epoch.wrapping_sub(current);
+                // Leave pinned slots alone, as well as slots that are already at `epoch`
+                // or newer (a concurrent release may have stored a later epoch).
+                if is_pinned || behind == 0 || behind > u64::MAX / 2 {
+                    break;
+                }
+                match slot.compare_exchange_weak(
+                    current,
+                    epoch,
+                    Ordering::Relaxed,
+                    Ordering::Relaxed,
+                ) {
+                    Ok(_) => break,
+                    // The slot changed under us (or the weak CAS failed spuriously):
+                    // re-evaluate with the value that is there now.
+                    Err(actual) => current = actual,
+                }
+            }
+        }
+
+        // FIXME: memory fence here, since we used Relaxed?
+    }
+
+    /// Returns the oldest value found in any slot, or the current global epoch if no slot
+    /// is older than that. Comparisons are wraparound-aware.
+    ///
+    /// The raw slot value is returned, so bit 0 is set if the oldest slot is pinned.
+    pub(crate) fn get_oldest(&self) -> u64 {
+        // Read all slots.
+        let now = self.global_epoch.load(Ordering::Relaxed);
+        let mut oldest = now;
+        for i in 0..NUM_SLOTS {
+            let this_epoch = self.participants[i].load(Ordering::Relaxed);
+            let delta = now.wrapping_sub(this_epoch);
+            if delta > u64::MAX / 2 {
+                // this is very recent
+            } else if delta > now.wrapping_sub(oldest) {
+                oldest = this_epoch;
+            }
+        }
+        oldest
+    }
+
+    /// Returns the current global epoch.
+    pub(crate) fn get_current(&self) -> u64 {
+        self.global_epoch.load(Ordering::Relaxed)
+    }
+}
+
+/// An active pin on the epoch. The slot is released when this is dropped.
+pub(crate) struct EpochPin<'e> {
+    /// Index of the participant slot held by this pin.
+    slot: usize,
+    /// The epoch that was stored in the slot when it was claimed.
+    pub(crate) epoch: u64,
+
+    /// The handle this pin was created from.
+    handle: &'e LocalHandle<'e>,
+}
+
+impl<'e> Drop for EpochPin<'e> {
+    /// Releases the participant slot held by this pin.
+    fn drop(&mut self) {
+        self.handle.global.release_pin(self.slot, self.epoch);
+    }
+}
+
+/// Handle for pinning the epoch, obtained from [`EpochShared::register`].
+pub struct LocalHandle<'g> {
+    /// The shared epoch state this handle operates on.
+    global: &'g EpochShared,
+
+    /// The slot used by the most recent pin. It is tried first by the next pin.
+    last_slot: AtomicUsize,
+}
+
+impl<'g> LocalHandle<'g> {
+    /// Pins the epoch. The pin lasts until the returned [`EpochPin`] is dropped.
+    ///
+    /// The slot search starts at the slot used by the previous pin, and the slot that was
+    /// actually claimed is remembered for next time.
+    pub fn pin(&self) -> EpochPin {
+        let hint = self.last_slot.load(Ordering::Relaxed);
+        let (slot, epoch) = self.global.pin_internal(hint);
+        self.last_slot.store(slot, Ordering::Relaxed);
+
+        EpochPin {
+            slot,
+            epoch,
+            handle: self,
+        }
+    }
+}
--- a/libs/neonart/src/lib.rs
+++ b/libs/neonart/src/lib.rs
@@ -0,0 +1,583 @@
+//! Adaptive Radix Tree (ART) implementation, with Optimistic Lock Coupling.
+//!
+//! The data structure is described in these two papers:
+//!
+//! [1] Leis, V. & Kemper, Alfons & Neumann, Thomas. (2013).
+//!     The adaptive radix tree: ARTful indexing for main-memory databases.
+//!     Proceedings - International Conference on Data Engineering. 38-49. 10.1109/ICDE.2013.6544812.
+//!     https://db.in.tum.de/~leis/papers/ART.pdf
+//!
+//! [2] Leis, Viktor & Scheibner, Florian & Kemper, Alfons & Neumann, Thomas. (2016).
+//!     The ART of practical synchronization.
+//!     1-8. 10.1145/2933349.2933352.
+//!     https://db.in.tum.de/~leis/papers/artsync.pdf
+//!
+//! [1] describes the base data structure, and [2] describes the Optimistic Lock Coupling that we
+//! use.
+//!
+//! The papers mention a few different variants. We have made the following choices in this
+//! implementation:
+//!
+//! - All keys have the same length
+//!
+//! - Single-value leaves.
+//!
+//! - For collapsing inner nodes, we use the Pessimistic approach, where each inner node stores a
+//!   variable length "prefix", which stores the keys of all the one-way nodes which have been
+//!   removed. However, similar to the "hybrid" approach described in the paper, each node only has
+//!   space for a constant-size prefix of 8 bytes. If a node would have a longer prefix, then we
+//!   create one-way nodes to store them. (There was no particular reason for this choice,
+//!   the "hybrid" approach described in the paper might be better.)
+//!
+//! - For concurrency, we use Optimistic Lock Coupling. The paper [2] also describes another method,
+//!   ROWEX, which generally performs better when there is contention, but that is not important
+//!   for us, and Optimistic Lock Coupling is simpler to implement.
+//!
+//! ## Requirements
+//!
+//! This data structure is currently used for the integrated LFC, relsize and last-written LSN cache
+//! in the compute communicator, part of the 'neon' Postgres extension. We have some unique
+//! requirements, which is why we had to write our own. Namely:
+//!
+//! - The data structure has to live in fixed-sized shared memory segment. That rules out any
+//!   built-in Rust collections and most crates. (Except possibly with the 'allocator_api' rust
+//!   feature, which is still nightly-only experimental as of this writing).
+//!
+//! - The data structure is accessed from multiple processes. Only one process updates the data
+//!   structure, but other processes perform reads. That rules out using built-in Rust locking
+//!   primitives like Mutex and RwLock, and most crates too.
+//!
+//! - Within the one process with write-access, multiple threads can perform updates concurrently.
+//!   That rules out using PostgreSQL LWLocks for the locking.
+//!
+//! The implementation is generic, and doesn't depend on any PostgreSQL specifics, but it has been
+//! written with that usage and the above constraints in mind. Some noteworthy assumptions:
+//!
+//! - Contention is assumed to be rare. In the integrated cache in PostgreSQL, there's higher level
+//!   locking in the PostgreSQL buffer manager, which ensures that two backends should not try to
+//!   read / write the same page at the same time. (Prefetching can conflict with actual reads,
+//!   however.)
+//!
+//!  - The keys in the integrated cache are 17 bytes long.
+//!
+//! ## Usage
+//!
+//! Because this is designed to be used as a Postgres shared memory data structure, initialization
+//! happens in four stages (numbered 0-3 below):
+//!
+//! 0. A fixed area of shared memory is allocated at postmaster startup.
+//!
+//! 1. TreeInitStruct::new() is called to initialize it, still in Postmaster process, before any
+//!    other process or thread is running. It returns a TreeInitStruct, which is inherited by all
+//!    the processes through fork().
+//!
+//! 2. One process may have write-access to the struct, by calling
+//!    [TreeInitStruct::attach_writer]. (That process is the communicator process.)
+//!
+//! 3. Other processes get read-access to the struct, by calling [TreeInitStruct::attach_reader]
+//!
+//! "Write access" means that you can insert / update / delete values in the tree.
+//!
+//! NOTE: The Values stored in the tree are sometimes moved, when a leaf node fills up and a new
+//! larger node needs to be allocated. The versioning and epoch-based allocator ensure that the data
+//! structure stays consistent, but if the Value has interior mutability, like atomic fields,
+//! updates to such fields might be lost if the leaf node is concurrently moved! If that becomes a
+//! problem, the version check could be passed up to the caller, so that the caller could detect the
+//! lost updates and retry the operation.
+//!
+//! ## Implementation
+//!
+//! node_ptr.rs: Provides low-level implementations of the four different node types (eight actually,
+//! since there is an Internal and Leaf variant of each)
+//!
+//! lock_and_version.rs: Provides an abstraction for the combined lock and version counter on each
+//! node.
+//!
+//! node_ref.rs: The code in node_ptr.rs deals with raw pointers. node_ref.rs provides more type-safe
+//!   abstractions on top.
+//!
+//! algorithm.rs: Contains the functions to implement lookups and updates in the tree
+//!
+//! allocator.rs: Provides a facility to allocate memory for the tree nodes. (We must provide our
+//!   own abstraction for that because we need the data structure to live in a pre-allocated shared
+//!   memory segment).
+//!
+//! epoch.rs: The data structure requires that when a node is removed from the tree, it is not
+//!   immediately deallocated, but stays around for as long as concurrent readers might still have
+//!   pointers to them. This is enforced by an epoch system. This is similar to
+//!   e.g. crossbeam_epoch, but we couldn't use that either because it has to work across processes
+//!   communicating over the shared memory segment.
+//!
+//! ## See also
+//!
+//! There are some existing Rust ART implementations out there, but none of them filled all
+//! the requirements:
+//!
+//! - https://github.com/XiangpengHao/congee
+//! - https://github.com/declanvk/blart
+//!
+//! ## TODO
+//!
+//! - Removing values has not been implemented
+
+mod algorithm;
+pub mod allocator;
+mod epoch;
+
+use algorithm::RootPtr;
+use algorithm::node_ptr::NodePtr;
+
+use std::collections::VecDeque;
+use std::fmt::Debug;
+use std::marker::PhantomData;
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicBool, Ordering};
+
+use crate::epoch::EpochPin;
+
+#[cfg(test)]
+mod tests;
+
+use allocator::ArtAllocator;
+pub use allocator::ArtMultiSlabAllocator;
+pub use allocator::OutOfMemoryError;
+
+/// Fixed-length key type.
+///
+pub trait Key: Debug {
+    /// Length of every key of this type, in bytes.
+    const KEY_LEN: usize;
+
+    /// Returns the key as a byte slice.
+    // NOTE(review): presumably the slice must be exactly KEY_LEN bytes long -- confirm
+    // against the users in algorithm.rs.
+    fn as_bytes(&self) -> &[u8];
+}
+
+/// Values stored in the tree
+///
+/// This is a marker trait without any required methods. Values do not need to be
+/// `Clone`. As described in the crate-level docs, a value is sometimes moved to a new
+/// node when a leaf node fills up and a larger node has to be allocated.
+pub trait Value {}
+
+/// Initial capacity reserved for the garbage queue, in number of nodes.
+// NOTE(review): the name suggests an upper limit, but within this part of the file it is
+// only used as a capacity hint -- confirm how the write path uses it.
+const MAX_GARBAGE: usize = 1024;
+
+/// The root of the tree, plus other tree-wide data. This is stored in the shared memory.
+pub struct Tree<V: Value> {
+    /// For simplicity, so that we never need to grow or shrink the root, the root node is always an
+    /// Internal256 node. Also, it never has a prefix (that's actually a bit wasteful, incurring one
+    /// indirection to every lookup)
+    root: RootPtr<V>,
+
+    /// Set when a writer attaches. Used to detect a second attempt to attach a writer.
+    writer_attached: AtomicBool,
+
+    /// Shared state of the epoch system that readers and the writer pin themselves in.
+    epoch: epoch::EpochShared,
+}
+
+unsafe impl<V: Value + Sync> Sync for Tree<V> {}
+unsafe impl<V: Value + Send> Send for Tree<V> {}
+
+/// FIFO queue of obsolete nodes, each paired with the epoch it was queued with. New
+/// entries are pushed at the front; the oldest entry sits at the back.
+struct GarbageQueue<V>(VecDeque<(NodePtr<V>, u64)>);
+
+unsafe impl<V: Value + Sync> Sync for GarbageQueue<V> {}
+unsafe impl<V: Value + Send> Send for GarbageQueue<V> {}
+
+impl<V> GarbageQueue<V> {
+    /// Creates an empty queue with room for `MAX_GARBAGE` entries preallocated.
+    fn new() -> GarbageQueue<V> {
+        let queue = VecDeque::with_capacity(MAX_GARBAGE);
+        Self(queue)
+    }
+
+    /// Queues `ptr` as garbage, tagged with `epoch`.
+    fn remember_obsolete_node(&mut self, ptr: NodePtr<V>, epoch: u64) {
+        let entry = (ptr, epoch);
+        self.0.push_front(entry);
+    }
+
+    /// Removes and returns the oldest queued node if its epoch is strictly less than
+    /// `cutoff_epoch`. Returns `None` if the queue is empty or the oldest entry is not
+    /// old enough yet.
+    fn next_obsolete(&mut self, cutoff_epoch: u64) -> Option<NodePtr<V>> {
+        match self.0.back() {
+            Some(&(_, epoch)) if epoch < cutoff_epoch => {
+                let (ptr, _) = self.0.pop_back().unwrap();
+                Some(ptr)
+            }
+            _ => None,
+        }
+    }
+}
+
+/// Struct created at postmaster startup
+///
+/// It is turned into a writer or a reader with `attach_writer()` or `attach_reader()`.
+pub struct TreeInitStruct<'t, K: Key, V: Value, A: ArtAllocator<V>> {
+    /// The tree-wide data, living in memory obtained from `allocator`.
+    tree: &'t Tree<V>,
+
+    /// The allocator that the tree was created with.
+    allocator: &'t A,
+
+    /// Ties the key type to the struct; no key is actually stored.
+    phantom_key: PhantomData<K>,
+}
+
+/// The worker process has a reference to this. The write operations are only safe
+/// from the worker process
+pub struct TreeWriteAccess<'t, K: Key, V: Value, A: ArtAllocator<V>>
+where
+    K: Key,
+    V: Value,
+{
+    /// The shared tree.
+    tree: &'t Tree<V>,
+
+    /// The allocator that the tree was created with.
+    pub allocator: &'t A,
+
+    /// Handle for pinning the epoch while an operation is in progress.
+    epoch_handle: epoch::LocalHandle<'t>,
+
+    /// Ties the key type to the struct; no key is actually stored.
+    phantom_key: PhantomData<K>,
+
+    /// Obsolete nodes that cannot be recycled until their epoch expires.
+    garbage: spin::Mutex<GarbageQueue<V>>,
+}
+
+/// The backends have a reference to this. It cannot be used to modify the tree
+pub struct TreeReadAccess<'t, K: Key, V: Value>
+where
+    K: Key,
+    V: Value,
+{
+    /// The shared tree.
+    tree: &'t Tree<V>,
+
+    /// Handle for pinning the epoch while a read is in progress.
+    epoch_handle: epoch::LocalHandle<'t>,
+
+    /// Ties the key type to the struct; no key is actually stored.
+    phantom_key: PhantomData<K>,
+}
+
+impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeInitStruct<'t, K, V, A> {
+    /// Allocates and initializes a new, empty tree using `allocator`.
+    ///
+    /// # Panics
+    ///
+    /// Panics with "out of memory" if the allocator cannot provide space for the tree
+    /// struct or for the root node.
+    pub fn new(allocator: &'t A) -> TreeInitStruct<'t, K, V, A> {
+        let tree_ptr = NonNull::new(allocator.alloc_tree()).expect("out of memory");
+        let root = algorithm::new_root(allocator).expect("out of memory");
+
+        unsafe {
+            tree_ptr.write(Tree {
+                root,
+                writer_attached: AtomicBool::new(false),
+                epoch: epoch::EpochShared::new(),
+            })
+        };
+        let tree = unsafe { tree_ptr.as_ref() };
+
+        Self {
+            tree,
+            allocator,
+            phantom_key: PhantomData,
+        }
+    }
+
+    /// Turns this into the handle with write access to the tree.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a writer has already been attached to the tree.
+    pub fn attach_writer(self) -> TreeWriteAccess<'t, K, V, A> {
+        if self.tree.writer_attached.swap(true, Ordering::Relaxed) {
+            panic!("writer already attached");
+        }
+
+        let epoch_handle = self.tree.epoch.register();
+        let garbage = spin::Mutex::new(GarbageQueue::new());
+        TreeWriteAccess {
+            tree: self.tree,
+            allocator: self.allocator,
+            epoch_handle,
+            phantom_key: PhantomData,
+            garbage,
+        }
+    }
+
+    /// Turns this into a handle with read-only access to the tree.
+    pub fn attach_reader(self) -> TreeReadAccess<'t, K, V> {
+        let epoch_handle = self.tree.epoch.register();
+        TreeReadAccess {
+            tree: self.tree,
+            epoch_handle,
+            phantom_key: PhantomData,
+        }
+    }
+}
+
+impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteAccess<'t, K, V, A> {
+    pub fn start_write<'g>(&'t self) -> TreeWriteGuard<'g, K, V, A>
+    where
+        't: 'g,
+    {
+        TreeWriteGuard {
+            tree_writer: self,
+            epoch_pin: self.epoch_handle.pin(),
+            phantom_key: PhantomData,
+            created_garbage: false,
+        }
+    }
+
+    pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
+        TreeReadGuard {
+            tree: self.tree,
+            epoch_pin: self.epoch_handle.pin(),
+            phantom_key: PhantomData,
+        }
+    }
+}
+
+impl<'t, K: Key, V: Value> TreeReadAccess<'t, K, V> {
+    pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
+        TreeReadGuard {
+            tree: self.tree,
+            epoch_pin: self.epoch_handle.pin(),
+            phantom_key: PhantomData,
+        }
+    }
+}
+
+pub struct TreeReadGuard<'e, K, V>
+where
+    K: Key,
+    V: Value,
+{
+    tree: &'e Tree<V>,
+
+    epoch_pin: EpochPin<'e>,
+    phantom_key: PhantomData<K>,
+}
+
+impl<'e, K: Key, V: Value> TreeReadGuard<'e, K, V> {
+    pub fn get(&'e self, key: &K) -> Option<&'e V> {
+        algorithm::search(key, self.tree.root, &self.epoch_pin)
+    }
+}
+
+pub struct TreeWriteGuard<'e, K, V, A>
+where
+    K: Key,
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    tree_writer: &'e TreeWriteAccess<'e, K, V, A>,
+
+    epoch_pin: EpochPin<'e>,
+    phantom_key: PhantomData<K>,
+
+    created_garbage: bool,
+}
+
+pub enum UpdateAction<V> {
+    Nothing,
+    Insert(V),
+    Remove,
+}
+
+impl<'e, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
+    /// Get a value
+    pub fn get(&'e mut self, key: &K) -> Option<&'e V> {
+        algorithm::search(key, self.tree_writer.tree.root, &self.epoch_pin)
+    }
+
+    /// Insert a value
+    pub fn insert(self, key: &K, value: V) -> Result<bool, OutOfMemoryError> {
+        let mut success = None;
+
+        self.update_with_fn(key, |existing| {
+            if existing.is_some() {
+                success = Some(false);
+                UpdateAction::Nothing
+            } else {
+                success = Some(true);
+                UpdateAction::Insert(value)
+            }
+        })?;
+        Ok(success.expect("value_fn not called"))
+    }
+
+    /// Remove value. Returns true if it existed
+    pub fn remove(self, key: &K) -> bool {
+        let mut result = false;
+        // FIXME: It's not clear if OOM is expected while removing. It seems
+        // not nice, but shrinking a node can OOM. Then again, we could opt
+        // to not shrink a node if we cannot allocate, to live a little longer.
+        self.update_with_fn(key, |existing| match existing {
+            Some(_) => {
+                result = true;
+                UpdateAction::Remove
+            }
+            None => UpdateAction::Nothing,
+        })
+        .expect("out of memory while removing");
+        result
+    }
+
+    /// Remove the value if present and return a clone of it; an absent key is left untouched.
+    pub fn remove_and_return(self, key: &K) -> Option<V>
+    where
+        V: Clone,
+    {
+        let mut old = None;
+        self.update_with_fn(key, |existing| {
+            old = existing.cloned(); // None when the key is absent; mirror `remove` in that case
+            existing.map_or(UpdateAction::Nothing, |_| UpdateAction::Remove)
+        })
+        .expect("out of memory while removing");
+        old
+    }
+
+    /// Update key using the given function. All the other modifying operations are based on this.
+    ///
+    /// The function is passed a reference to the existing value, if any, and returns an
+    /// [`UpdateAction`] that is handed to `algorithm::update_fn`: `Nothing`, `Insert(value)`
+    /// or `Remove`. If the update created garbage, a garbage collection pass is run
+    /// afterwards. An error from `algorithm::update_fn` is propagated to the caller.
+    pub fn update_with_fn<F>(mut self, key: &K, value_fn: F) -> Result<(), OutOfMemoryError>
+    where
+        F: FnOnce(Option<&V>) -> UpdateAction<V>,
+    {
+        algorithm::update_fn(key, value_fn, self.tree_writer.tree.root, &mut self)?;
+
+        if self.created_garbage {
+            let _ = self.collect_garbage();
+        }
+        Ok(())
+    }
+
+    fn remember_obsolete_node(&mut self, ptr: NodePtr<V>) {
+        self.tree_writer
+            .garbage
+            .lock()
+            .remember_obsolete_node(ptr, self.epoch_pin.epoch);
+        self.created_garbage = true;
+    }
+
+    // returns number of nodes recycled
+    fn collect_garbage(&self) -> usize {
+        self.tree_writer.tree.epoch.advance();
+        self.tree_writer.tree.epoch.broadcast();
+
+        let cutoff_epoch = self.tree_writer.tree.epoch.get_oldest();
+
+        let mut result = 0;
+        let mut garbage_queue = self.tree_writer.garbage.lock();
+        while let Some(ptr) = garbage_queue.next_obsolete(cutoff_epoch) {
+            ptr.deallocate(self.tree_writer.allocator);
+            result += 1;
+        }
+        result
+    }
+}
+
+pub struct TreeIterator<K>
+where
+    K: Key + for<'a> From<&'a [u8]>,
+{
+    done: bool,
+    pub next_key: Vec<u8>,
+    max_key: Option<Vec<u8>>,
+
+    phantom_key: PhantomData<K>,
+}
+
+impl<K> TreeIterator<K>
+where
+    K: Key + for<'a> From<&'a [u8]>,
+{
+    pub fn new_wrapping() -> TreeIterator<K> {
+        TreeIterator {
+            done: false,
+            next_key: vec![0; K::KEY_LEN],
+            max_key: None,
+            phantom_key: PhantomData,
+        }
+    }
+
+    pub fn new(range: &std::ops::Range<K>) -> TreeIterator<K> {
+        let result = TreeIterator {
+            done: false,
+            next_key: Vec::from(range.start.as_bytes()),
+            max_key: Some(Vec::from(range.end.as_bytes())),
+            phantom_key: PhantomData,
+        };
+        assert_eq!(result.next_key.len(), K::KEY_LEN);
+        assert_eq!(result.max_key.as_ref().unwrap().len(), K::KEY_LEN);
+
+        result
+    }
+
+    pub fn next<'g, V>(&mut self, read_guard: &'g TreeReadGuard<'g, K, V>) -> Option<(K, &'g V)>
+    where
+        V: Value,
+    {
+        if self.done {
+            return None;
+        }
+
+        let mut wrapped_around = false;
+        loop {
+            assert_eq!(self.next_key.len(), K::KEY_LEN);
+            if let Some((k, v)) =
+                algorithm::iter_next(&self.next_key, read_guard.tree.root, &read_guard.epoch_pin)
+            {
+                assert_eq!(k.len(), K::KEY_LEN);
+                assert_eq!(self.next_key.len(), K::KEY_LEN);
+
+                // Check if we reached the end of the range
+                if let Some(max_key) = &self.max_key {
+                    if k.as_slice() >= max_key.as_slice() {
+                        self.done = true;
+                        break None;
+                    }
+                }
+
+                // increment the key
+                self.next_key = k.clone();
+                increment_key(self.next_key.as_mut_slice());
+                let k = k.as_slice().into();
+
+                break Some((k, v));
+            } else {
+                if self.max_key.is_some() {
+                    self.done = true;
+                } else {
+                    // Start from beginning
+                    if !wrapped_around {
+                        for i in 0..K::KEY_LEN {
+                            self.next_key[i] = 0;
+                        }
+                        wrapped_around = true;
+                        continue;
+                    } else {
+                        // The tree is completely empty
+                        // FIXME: perhaps we should remember the starting point instead.
+                        // Currently this will scan some ranges twice.
+                        break None;
+                    }
+                }
+                break None;
+            }
+        }
+    }
+}
+
+fn increment_key(key: &mut [u8]) -> bool {
+    for i in (0..key.len()).rev() {
+        let (byte, overflow) = key[i].overflowing_add(1);
+        key[i] = byte;
+        if !overflow {
+            return false;
+        }
+    }
+    true
+}
+
+// Debugging functions
+impl<'e, K: Key, V: Value + Debug, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
+    pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
+        algorithm::dump_tree(self.tree_writer.tree.root, &self.epoch_pin, dst)
+    }
+}
+impl<'e, K: Key, V: Value + Debug> TreeReadGuard<'e, K, V> {
+    pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
+        algorithm::dump_tree(self.tree.root, &self.epoch_pin, dst)
+    }
+}
+impl<'e, K: Key, V: Value> TreeWriteAccess<'e, K, V, ArtMultiSlabAllocator<'e, V>> {
+    pub fn get_statistics(&self) -> ArtTreeStatistics {
+        let slabs = self.allocator.get_statistics(); // gather slab stats once and reuse below
+        ArtTreeStatistics {
+            blocks: self.allocator.inner.block_allocator.get_statistics(),
+            slabs,
+            epoch: self.tree.epoch.get_current(),
+            oldest_epoch: self.tree.epoch.get_oldest(),
+            num_garbage: self.garbage.lock().0.len() as u64, // nodes still queued for recycling
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct ArtTreeStatistics {
+    pub blocks: allocator::block::BlockAllocatorStats,
+    pub slabs: allocator::ArtMultiSlabStats,
+
+    pub epoch: u64,
+    pub oldest_epoch: u64,
+    pub num_garbage: u64,
+}
--- a/libs/neonart/src/tests.rs
+++ b/libs/neonart/src/tests.rs
@@ -0,0 +1,236 @@
+use std::collections::BTreeMap;
+use std::collections::HashSet;
+use std::fmt::{Debug, Formatter};
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use crate::ArtAllocator;
+use crate::ArtMultiSlabAllocator;
+use crate::TreeInitStruct;
+use crate::TreeIterator;
+use crate::TreeWriteAccess;
+use crate::UpdateAction;
+
+use crate::{Key, Value};
+
+use rand::Rng;
+use rand::seq::SliceRandom;
+use rand_distr::Zipf;
+
+const TEST_KEY_LEN: usize = 16;
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+struct TestKey([u8; TEST_KEY_LEN]);
+
+impl TestKey {
+    const MIN: TestKey = TestKey([0; TEST_KEY_LEN]);
+    const MAX: TestKey = TestKey([u8::MAX; TEST_KEY_LEN]);
+}
+
+impl Key for TestKey {
+    const KEY_LEN: usize = TEST_KEY_LEN;
+    fn as_bytes(&self) -> &[u8] {
+        &self.0
+    }
+}
+
+impl From<&TestKey> for u128 {
+    fn from(val: &TestKey) -> u128 {
+        u128::from_be_bytes(val.0)
+    }
+}
+
+impl From<u128> for TestKey {
+    fn from(val: u128) -> TestKey {
+        TestKey(val.to_be_bytes())
+    }
+}
+
+impl<'a> From<&'a [u8]> for TestKey {
+    fn from(bytes: &'a [u8]) -> TestKey {
+        TestKey(bytes.try_into().unwrap())
+    }
+}
+
+impl Value for usize {}
+
+fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
+    const MEM_SIZE: usize = 10000000;
+    let mut area = Box::new_uninit_slice(MEM_SIZE);
+
+    let allocator = ArtMultiSlabAllocator::new(&mut area);
+
+    let init_struct = TreeInitStruct::<TestKey, usize, _>::new(allocator);
+    let tree_writer = init_struct.attach_writer();
+
+    for (idx, k) in keys.iter().enumerate() {
+        let w = tree_writer.start_write();
+        let res = w.insert(&(*k).into(), idx);
+        assert!(res.is_ok());
+    }
+
+    for (idx, k) in keys.iter().enumerate() {
+        let r = tree_writer.start_read();
+        let value = r.get(&(*k).into());
+        assert_eq!(value, Some(idx).as_ref());
+    }
+
+    eprintln!("stats: {:?}", tree_writer.get_statistics());
+}
+
+#[test]
+fn dense() {
+    // This exercises splitting a node with prefix
+    let keys: &[u128] = &[0, 1, 2, 3, 256];
+    test_inserts(keys);
+
+    // Dense keys
+    let mut keys: Vec<u128> = (0..10000).collect();
+    test_inserts(&keys);
+
+    // Do the same in random orders
+    for _ in 1..10 {
+        keys.shuffle(&mut rand::rng());
+        test_inserts(&keys);
+    }
+}
+
+#[test]
+fn sparse() {
+    // sparse keys
+    let mut keys: Vec<TestKey> = Vec::new();
+    let mut used_keys = HashSet::new();
+    for _ in 0..10000 {
+        loop {
+            let key = rand::random::<u128>();
+            if used_keys.contains(&key) {
+                continue;
+            }
+            used_keys.insert(key);
+            keys.push(key.into());
+            break;
+        }
+    }
+    test_inserts(&keys);
+}
+
+struct TestValue(AtomicUsize);
+
+impl TestValue {
+    fn new(val: usize) -> TestValue {
+        TestValue(AtomicUsize::new(val))
+    }
+
+    fn load(&self) -> usize {
+        self.0.load(Ordering::Relaxed)
+    }
+}
+
+impl Value for TestValue {}
+
+impl Clone for TestValue {
+    fn clone(&self) -> TestValue {
+        TestValue::new(self.load())
+    }
+}
+
+impl Debug for TestValue {
+    fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
+        write!(fmt, "{:?}", self.load())
+    }
+}
+
+#[derive(Clone, Debug)]
+struct TestOp(TestKey, Option<usize>);
+
+fn apply_op<A: ArtAllocator<TestValue>>(
+    op: &TestOp,
+    tree: &TreeWriteAccess<TestKey, TestValue, A>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    eprintln!("applying op: {op:?}");
+
+    // apply the change to the shadow tree first
+    let shadow_existing = if let Some(v) = op.1 {
+        shadow.insert(op.0, v)
+    } else {
+        shadow.remove(&op.0)
+    };
+
+    // apply to Art tree
+    let w = tree.start_write();
+    w.update_with_fn(&op.0, |existing| {
+        assert_eq!(existing.map(TestValue::load), shadow_existing);
+
+        match (existing, op.1) {
+            (None, None) => UpdateAction::Nothing,
+            (None, Some(new_val)) => UpdateAction::Insert(TestValue::new(new_val)),
+            (Some(_old_val), None) => UpdateAction::Remove,
+            (Some(old_val), Some(new_val)) => {
+                old_val.0.store(new_val, Ordering::Relaxed);
+                UpdateAction::Nothing
+            }
+        }
+    })
+    .expect("out of memory");
+}
+
+fn test_iter<A: ArtAllocator<TestValue>>(
+    tree: &TreeWriteAccess<TestKey, TestValue, A>,
+    shadow: &BTreeMap<TestKey, usize>,
+) {
+    let mut shadow_iter = shadow.iter();
+    let mut iter = TreeIterator::new(&(TestKey::MIN..TestKey::MAX));
+
+    loop {
+        let shadow_item = shadow_iter.next().map(|(k, v)| (*k, *v));
+        let r = tree.start_read();
+        let item = iter.next(&r);
+
+        if shadow_item != item.map(|(k, v)| (k, v.load())) {
+            eprintln!("FAIL: iterator returned {item:?}, expected {shadow_item:?}");
+            tree.start_read().dump(&mut std::io::stderr());
+
+            eprintln!("SHADOW:");
+            for si in shadow {
+                eprintln!("key: {:?}, val: {}", si.0, si.1);
+            }
+            panic!("FAIL: iterator returned {item:?}, expected {shadow_item:?}");
+        }
+        if item.is_none() {
+            break;
+        }
+    }
+}
+
+#[test]
+fn random_ops() {
+    const MEM_SIZE: usize = 10000000;
+    let mut area = Box::new_uninit_slice(MEM_SIZE);
+
+    let allocator = ArtMultiSlabAllocator::new(&mut area);
+
+    let init_struct = TreeInitStruct::<TestKey, TestValue, _>::new(allocator);
+    let tree_writer = init_struct.attach_writer();
+
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+
+    let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
+    let mut rng = rand::rng();
+    for i in 0..100000 {
+        let mut key: TestKey = (rng.sample(distribution) as u128).into();
+
+        if rng.random_bool(0.10) {
+            key = TestKey::from(u128::from(&key) | 0xffffffff);
+        }
+
+        let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
+
+        apply_op(&op, &tree_writer, &mut shadow);
+
+        if i % 1000 == 0 {
+            eprintln!("{i} ops processed");
+            eprintln!("stats: {:?}", tree_writer.get_statistics());
+            test_iter(&tree_writer, &shadow);
+        }
+    }
+}
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -5,7 +5,6 @@ mod tests;

 use const_format::formatcp;
 use posthog_client_lite::PostHogClientConfig;
-use utils::serde_percent::Percent;
 pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
 pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
@@ -224,9 +223,8 @@ pub struct ConfigToml {
    pub metric_collection_bucket: Option<RemoteStorageConfig>,
    #[serde(with = "humantime_serde")]
    pub synthetic_size_calculation_interval: Duration,
-    pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
+    pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
    pub test_remote_failures: u64,
-    pub test_remote_failures_probability: u64,
    pub ondemand_download_behavior_treat_error_as_warn: bool,
    #[serde(with = "humantime_serde")]
    pub background_task_maximum_delay: Duration,
@@ -272,13 +270,9 @@ pub struct ConfigToml {
    pub timeline_import_config: TimelineImportConfig,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub basebackup_cache_config: Option<BasebackupCacheConfig>,
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub image_layer_generation_large_timeline_threshold: Option<u64>,
-    pub force_metric_collection_on_scrape: bool,
 }

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-#[serde(default)]
 pub struct DiskUsageEvictionTaskConfig {
    pub max_usage_pct: utils::serde_percent::Percent,
    pub min_avail_bytes: u64,
@@ -289,21 +283,6 @@ pub struct DiskUsageEvictionTaskConfig {
    /// Select sorting for evicted layers
    #[serde(default)]
    pub eviction_order: EvictionOrder,
-    pub enabled: bool,
-}
-
-impl Default for DiskUsageEvictionTaskConfig {
-    fn default() -> Self {
-        Self {
-            max_usage_pct: Percent::new(80).unwrap(),
-            min_avail_bytes: 2_000_000_000,
-            period: Duration::from_secs(60),
-            #[cfg(feature = "testing")]
-            mock_statvfs: None,
-            eviction_order: EvictionOrder::default(),
-            enabled: true,
-        }
-    }
 }

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -564,11 +543,6 @@ pub struct TenantConfigToml {
    pub gc_period: Duration,
    // Delta layer churn threshold to create L1 image layers.
    pub image_creation_threshold: usize,
-    // HADRON
-    // When the timeout is reached, PageServer will (1) force compact any remaining L0 deltas and
-    // (2) create image layers if there are any L1 deltas.
-    #[serde(with = "humantime_serde")]
-    pub image_layer_force_creation_period: Option<Duration>,
    // Determines how much history is retained, to allow
    // branching and read replicas at an older point in time.
    // The unit is time.
@@ -764,10 +738,9 @@ impl Default for ConfigToml {

            metric_collection_bucket: (None),

-            disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(),
+            disk_usage_based_eviction: (None),

            test_remote_failures: (0),
-            test_remote_failures_probability: (100),

            ondemand_download_behavior_treat_error_as_warn: (false),

@@ -831,8 +804,6 @@ impl Default for ConfigToml {
            },
            basebackup_cache_config: None,
            posthog_config: None,
-            image_layer_generation_large_timeline_threshold: Some(2 * 1024 * 1024 * 1024),
-            force_metric_collection_on_scrape: true,
        }
    }
 }
@@ -926,7 +897,6 @@ impl Default for TenantConfigToml {
            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
                .expect("cannot parse default gc period"),
            image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
-            image_layer_force_creation_period: None,
            pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
                .expect("cannot parse default PITR interval"),
            walreceiver_connect_timeout: humantime::parse_duration(
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -384,7 +384,7 @@ pub struct SafekeepersInfo {
    pub safekeepers: Vec<SafekeeperInfo>,
 }

-#[derive(Serialize, Deserialize, Clone, Debug)]
+#[derive(Serialize, Deserialize, Clone)]
 pub struct SafekeeperInfo {
    pub id: NodeId,
    pub hostname: String,
@@ -597,9 +597,6 @@ pub struct TenantConfigPatch {
    pub gc_period: FieldPatch<String>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub image_creation_threshold: FieldPatch<usize>,
-    // HADRON
-    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
-    pub image_layer_force_creation_period: FieldPatch<String>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub pitr_interval: FieldPatch<String>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
@@ -703,11 +700,6 @@ pub struct TenantConfig {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_creation_threshold: Option<usize>,

-    // HADRON
-    #[serde(skip_serializing_if = "Option::is_none")]
-    #[serde(with = "humantime_serde")]
-    pub image_layer_force_creation_period: Option<Duration>,
-
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    pub pitr_interval: Option<Duration>,
@@ -806,7 +798,6 @@ impl TenantConfig {
            mut gc_horizon,
            mut gc_period,
            mut image_creation_threshold,
-            mut image_layer_force_creation_period,
            mut pitr_interval,
            mut walreceiver_connect_timeout,
            mut lagging_wal_timeout,
@@ -870,11 +861,6 @@ impl TenantConfig {
        patch
            .image_creation_threshold
            .apply(&mut image_creation_threshold);
-        // HADRON
-        patch
-            .image_layer_force_creation_period
-            .map(|v| humantime::parse_duration(&v))?
-            .apply(&mut image_layer_force_creation_period);
        patch
            .pitr_interval
            .map(|v| humantime::parse_duration(&v))?
@@ -956,7 +942,6 @@ impl TenantConfig {
            gc_horizon,
            gc_period,
            image_creation_threshold,
-            image_layer_force_creation_period,
            pitr_interval,
            walreceiver_connect_timeout,
            lagging_wal_timeout,
@@ -1031,9 +1016,6 @@ impl TenantConfig {
            image_creation_threshold: self
                .image_creation_threshold
                .unwrap_or(global_conf.image_creation_threshold),
-            image_layer_force_creation_period: self
-                .image_layer_force_creation_period
-                .or(global_conf.image_layer_force_creation_period),
            pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval),
            walreceiver_connect_timeout: self
                .walreceiver_connect_timeout
--- a/libs/proxy/json/Cargo.toml
+++ b/libs/proxy/json/Cargo.toml
@@ -1,12 +0,0 @@
-[package]
-name = "json"
-version = "0.1.0"
-edition.workspace = true
-license.workspace = true
-
-[dependencies]
-ryu = "1"
-itoa = "1"
-
-[dev-dependencies]
-futures = "0.3"
--- a/libs/proxy/json/src/lib.rs
+++ b/libs/proxy/json/src/lib.rs
@@ -1,412 +0,0 @@
-//! A JSON serialization lib, designed for more flexibility than `serde_json` offers.
-//!
-//! Features:
-//!
-//! ## Dynamic construction
-//!
-//! Sometimes you have dynamic values you want to serialize, that are not already in a serde-aware model like a struct or a Vec etc.
-//! To achieve this with serde, you need to implement a lot of different traits on a lot of different new-types.
-//! Because of this, it's often easier to give-in and pull all the data into a serde-aware model (`serde_json::Value` or some intermediate struct),
-//! but that is often not very efficient.
-//!
-//! This crate allows full control over the JSON encoding without needing to implement any extra traits. Just call the
-//! relevant functions, and it will guarantee a correctly encoded JSON value.
-//!
-//! ## Async construction
-//!
-//! Similar to the above, sometimes the values arrive asynchronously. Often collecting those values in memory
-//! is more expensive than writing them as JSON, since the overheads of `Vec` and `String` is much higher, however
-//! there are exceptions.
-//!
-//! Serializing to JSON all in one go is also more CPU intensive and can cause lag spikes,
-//! whereas serializing values incrementally spreads out the CPU load and reduces lag.
-//!
-//! ## Examples
-//!
-//! To represent the following JSON as a compact string
-//!
-//! ```json
-//! {
-//!   "results": {
-//!     "rows": [
-//!       {
-//!         "id": 1,
-//!         "value": null
-//!       },
-//!       {
-//!         "id": 2,
-//!         "value": "hello"
-//!       }
-//!     ]
-//!   }
-//! }
-//! ```
-//!
-//! We can use the following code:
-//!
-//! ```
-//! // create the outer object
-//! let s = json::value_to_string!(|v| json::value_as_object!(|v| {
-//!     // create an entry with key "results" and start an object value associated with it.
-//!     let results = v.key("results");
-//!     json::value_as_object!(|results| {
-//!         // create an entry with key "rows" and start an list value associated with it.
-//!         let rows = results.key("rows");
-//!         json::value_as_list!(|rows| {
-//!             // create a list entry and start an object value associated with it.
-//!             let row = rows.entry();
-//!             json::value_as_object!(|row| {
-//!                 // add entry "id": 1
-//!                 row.entry("id", 1);
-//!                 // add entry "value": null
-//!                 row.entry("value", json::Null);
-//!             });
-//!
-//!             // create a list entry and start an object value associated with it.
-//!             let row = rows.entry();
-//!             json::value_as_object!(|row| {
-//!                 // add entry "id": 2
-//!                 row.entry("id", 2);
-//!                 // add entry "value": "hello"
-//!                 row.entry("value", "hello");
-//!             });
-//!         });
-//!     });
-//! }));
-//!
-//! assert_eq!(s, r#"{"results":{"rows":[{"id":1,"value":null},{"id":2,"value":"hello"}]}}"#);
-//! ```
-
-mod macros;
-mod str;
-mod value;
-
-pub use value::{Null, ValueEncoder};
-
-#[must_use]
-/// Serialize a single json value.
-pub struct ValueSer<'buf> {
-    buf: &'buf mut Vec<u8>,
-    start: usize,
-}
-
-impl<'buf> ValueSer<'buf> {
-    /// Create a new json value serializer.
-    pub fn new(buf: &'buf mut Vec<u8>) -> Self {
-        Self { buf, start: 0 }
-    }
-
-    /// Borrow the underlying buffer
-    pub fn as_buffer(&self) -> &[u8] {
-        self.buf
-    }
-
-    #[inline]
-    pub fn value(self, e: impl ValueEncoder) {
-        e.encode(self);
-    }
-
-    /// Write raw bytes to the buf. This must be already JSON encoded.
-    #[inline]
-    pub fn write_raw_json(self, data: &[u8]) {
-        self.buf.extend_from_slice(data);
-        self.finish();
-    }
-
-    /// Start a new object serializer.
-    #[inline]
-    pub fn object(self) -> ObjectSer<'buf> {
-        ObjectSer::new(self)
-    }
-
-    /// Start a new list serializer.
-    #[inline]
-    pub fn list(self) -> ListSer<'buf> {
-        ListSer::new(self)
-    }
-
-    /// Finish the value ser.
-    #[inline]
-    fn finish(self) {
-        // don't trigger the drop handler which triggers a rollback.
-        // this won't cause memory leaks because `ValueSet` owns no allocations.
-        std::mem::forget(self);
-    }
-}
-
-impl Drop for ValueSer<'_> {
-    fn drop(&mut self) {
-        self.buf.truncate(self.start);
-    }
-}
-
-#[must_use]
-/// Serialize a json object.
-pub struct ObjectSer<'buf> {
-    value: ValueSer<'buf>,
-    start: usize,
-}
-
-impl<'buf> ObjectSer<'buf> {
-    /// Start a new object serializer.
-    #[inline]
-    pub fn new(value: ValueSer<'buf>) -> Self {
-        value.buf.push(b'{');
-        let start = value.buf.len();
-        Self { value, start }
-    }
-
-    /// Borrow the underlying buffer
-    pub fn as_buffer(&self) -> &[u8] {
-        self.value.as_buffer()
-    }
-
-    /// Start a new object entry with the given string key, returning a [`ValueSer`] for the associated value.
-    #[inline]
-    pub fn key(&mut self, key: impl KeyEncoder) -> ValueSer<'_> {
-        key.write_key(self)
-    }
-
-    /// Write an entry (key-value pair) to the object.
-    #[inline]
-    pub fn entry(&mut self, key: impl KeyEncoder, val: impl ValueEncoder) {
-        self.key(key).value(val);
-    }
-
-    #[inline]
-    fn entry_inner(&mut self, f: impl FnOnce(&mut Vec<u8>)) -> ValueSer<'_> {
-        // track before the separator so we the value is rolled back it also removes the separator.
-        let start = self.value.buf.len();
-
-        // push separator if necessary
-        if self.value.buf.len() > self.start {
-            self.value.buf.push(b',');
-        }
-        // push key
-        f(self.value.buf);
-        // push value separator
-        self.value.buf.push(b':');
-
-        // return value writer.
-        ValueSer {
-            buf: self.value.buf,
-            start,
-        }
-    }
-
-    /// Reset the buffer back to before this object was started.
-    #[inline]
-    pub fn rollback(self) -> ValueSer<'buf> {
-        // Do not fully reset the value, only reset it to before the `{`.
-        // This ensures any `,` before this value are not clobbered.
-        self.value.buf.truncate(self.start - 1);
-        self.value
-    }
-
-    /// Finish the object ser.
-    #[inline]
-    pub fn finish(self) {
-        self.value.buf.push(b'}');
-        self.value.finish();
-    }
-}
-
-pub trait KeyEncoder {
-    fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a>;
-}
-
-#[must_use]
-/// Serialize a json object.
-pub struct ListSer<'buf> {
-    value: ValueSer<'buf>,
-    start: usize,
-}
-
-impl<'buf> ListSer<'buf> {
-    /// Start a new list serializer.
-    #[inline]
-    pub fn new(value: ValueSer<'buf>) -> Self {
-        value.buf.push(b'[');
-        let start = value.buf.len();
-        Self { value, start }
-    }
-
-    /// Borrow the underlying buffer
-    pub fn as_buffer(&self) -> &[u8] {
-        self.value.as_buffer()
-    }
-
-    /// Write an value to the list.
-    #[inline]
-    pub fn push(&mut self, val: impl ValueEncoder) {
-        self.entry().value(val);
-    }
-
-    /// Start a new value entry in this list.
-    #[inline]
-    pub fn entry(&mut self) -> ValueSer<'_> {
-        // track before the separator so we the value is rolled back it also removes the separator.
-        let start = self.value.buf.len();
-
-        // push separator if necessary
-        if self.value.buf.len() > self.start {
-            self.value.buf.push(b',');
-        }
-
-        // return value writer.
-        ValueSer {
-            buf: self.value.buf,
-            start,
-        }
-    }
-
-    /// Reset the buffer back to before this object was started.
-    #[inline]
-    pub fn rollback(self) -> ValueSer<'buf> {
-        // Do not fully reset the value, only reset it to before the `[`.
-        // This ensures any `,` before this value are not clobbered.
-        self.value.buf.truncate(self.start - 1);
-        self.value
-    }
-
-    /// Finish the object ser.
-    #[inline]
-    pub fn finish(self) {
-        self.value.buf.push(b']');
-        self.value.finish();
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use crate::{Null, ValueSer};
-
-    #[test]
-    fn object() {
-        let mut buf = vec![];
-        let mut object = ValueSer::new(&mut buf).object();
-        object.entry("foo", "bar");
-        object.entry("baz", Null);
-        object.finish();
-
-        assert_eq!(buf, br#"{"foo":"bar","baz":null}"#);
-    }
-
-    #[test]
-    fn list() {
-        let mut buf = vec![];
-        let mut list = ValueSer::new(&mut buf).list();
-        list.entry().value("bar");
-        list.entry().value(Null);
-        list.finish();
-
-        assert_eq!(buf, br#"["bar",null]"#);
-    }
-
-    #[test]
-    fn object_macro() {
-        let res = crate::value_to_string!(|obj| {
-            crate::value_as_object!(|obj| {
-                obj.entry("foo", "bar");
-                obj.entry("baz", Null);
-            })
-        });
-
-        assert_eq!(res, r#"{"foo":"bar","baz":null}"#);
-    }
-
-    #[test]
-    fn list_macro() {
-        let res = crate::value_to_string!(|list| {
-            crate::value_as_list!(|list| {
-                list.entry().value("bar");
-                list.entry().value(Null);
-            })
-        });
-
-        assert_eq!(res, r#"["bar",null]"#);
-    }
-
-    #[test]
-    fn rollback_on_drop() {
-        let res = crate::value_to_string!(|list| {
-            crate::value_as_list!(|list| {
-                list.entry().value("bar");
-
-                'cancel: {
-                    let nested_list = list.entry();
-                    crate::value_as_list!(|nested_list| {
-                        nested_list.entry().value(1);
-
-                        assert_eq!(nested_list.as_buffer(), br#"["bar",[1"#);
-                        if true {
-                            break 'cancel;
-                        }
-                    })
-                }
-
-                assert_eq!(list.as_buffer(), br#"["bar""#);
-
-                list.entry().value(Null);
-            })
-        });
-
-        assert_eq!(res, r#"["bar",null]"#);
-    }
-
-    #[test]
-    fn rollback_object() {
-        let res = crate::value_to_string!(|obj| {
-            crate::value_as_object!(|obj| {
-                let entry = obj.key("1");
-                entry.value(1_i32);
-
-                let entry = obj.key("2");
-                let entry = {
-                    let mut nested_obj = entry.object();
-                    nested_obj.entry("foo", "bar");
-                    nested_obj.rollback()
-                };
-
-                entry.value(2_i32);
-            })
-        });
-
-        assert_eq!(res, r#"{"1":1,"2":2}"#);
-    }
-
-    #[test]
-    fn rollback_list() {
-        let res = crate::value_to_string!(|list| {
-            crate::value_as_list!(|list| {
-                let entry = list.entry();
-                entry.value(1_i32);
-
-                let entry = list.entry();
-                let entry = {
-                    let mut nested_list = entry.list();
-                    nested_list.push("foo");
-                    nested_list.rollback()
-                };
-
-                entry.value(2_i32);
-            })
-        });
-
-        assert_eq!(res, r#"[1,2]"#);
-    }
-
-    #[test]
-    fn string_escaping() {
-        let mut buf = vec![];
-        let mut object = ValueSer::new(&mut buf).object();
-
-        let key = "hello";
-        let value = "\n world";
-
-        object.entry(format_args!("{key:?}"), value);
-        object.finish();
-
-        assert_eq!(buf, br#"{"\"hello\"":"\n world"}"#);
-    }
-}
--- a/libs/proxy/json/src/macros.rs
+++ b/libs/proxy/json/src/macros.rs
@@ -1,86 +0,0 @@
-//! # Examples
-//!
-//! ```
-//! use futures::{StreamExt, TryStream, TryStreamExt};
-//!
-//! async fn stream_to_json_list<S, T, E>(mut s: S) -> Result<String, E>
-//! where
-//!     S: TryStream<Ok = T, Error = E> + Unpin,
-//!     T: json::ValueEncoder
-//! {
-//!     Ok(json::value_to_string!(|val| json::value_as_list!(|val| {
-//!         // note how we can use `.await` and `?` in here.
-//!         while let Some(value) = s.try_next().await? {
-//!             val.push(value);
-//!         }
-//!     })))
-//! }
-//!
-//! let stream = futures::stream::iter([1, 2, 3]).map(Ok::<i32, ()>);
-//! let json_string = futures::executor::block_on(stream_to_json_list(stream)).unwrap();
-//! assert_eq!(json_string, "[1,2,3]");
-//! ```
-
-/// A helper to create a new JSON vec.
-///
-/// Implemented as a macro to preserve all control flow.
-#[macro_export]
-macro_rules! value_to_vec {
-    (|$val:ident| $body:expr) => {{
-        let mut buf = vec![];
-        let $val = $crate::ValueSer::new(&mut buf);
-        let _: () = $body;
-        buf
-    }};
-}
-
-/// A helper to create a new JSON string.
-///
-/// Implemented as a macro to preserve all control flow.
-#[macro_export]
-macro_rules! value_to_string {
-    (|$val:ident| $body:expr) => {{
-        ::std::string::String::from_utf8($crate::value_to_vec!(|$val| $body))
-            .expect("json should be valid utf8")
-    }};
-}
-
-/// A helper that ensures the [`ObjectSer::finish`](crate::ObjectSer::finish) method is called on completion.
-///
-/// Consumes `$val` and assigns it as an [`ObjectSer`](crate::ObjectSer) serializer.
-/// The serializer is only 'finished' if the body completes.
-/// The serializer is rolled back if `break`/`return` escapes the body.
-///
-/// Implemented as a macro to preserve all control flow.
-#[macro_export]
-macro_rules! value_as_object {
-    (|$val:ident| $body:expr) => {{
-        let mut obj = $crate::ObjectSer::new($val);
-
-        let $val = &mut obj;
-        let res = $body;
-
-        obj.finish();
-        res
-    }};
-}
-
-/// A helper that ensures the [`ListSer::finish`](crate::ListSer::finish) method is called on completion.
-///
-/// Consumes `$val` and assigns it as an [`ListSer`](crate::ListSer) serializer.
-/// The serializer is only 'finished' if the body completes.
-/// The serializer is rolled back if `break`/`return` escapes the body.
-///
-/// Implemented as a macro to preserve all control flow.
-#[macro_export]
-macro_rules! value_as_list {
-    (|$val:ident| $body:expr) => {{
-        let mut list = $crate::ListSer::new($val);
-
-        let $val = &mut list;
-        let res = $body;
-
-        list.finish();
-        res
-    }};
-}
--- a/libs/proxy/json/src/str.rs
+++ b/libs/proxy/json/src/str.rs
@@ -1,166 +0,0 @@
-//! Helpers for serializing escaped strings.
-//!
-//! ## License
-//!
-//! <https://github.com/serde-rs/json/blob/c1826ebcccb1a520389c6b78ad3da15db279220d/src/ser.rs#L1514-L1552>
-//! <https://github.com/serde-rs/json/blob/c1826ebcccb1a520389c6b78ad3da15db279220d/src/ser.rs#L2081-L2157>
-//! Licensed by David Tolnay under MIT or Apache-2.0.
-//!
-//! With modifications by Conrad Ludgate on behalf of Databricks.
-
-use std::fmt::{self, Write};
-
-/// Represents a character escape code in a type-safe manner.
-pub enum CharEscape {
-    /// An escaped quote `"`
-    Quote,
-    /// An escaped reverse solidus `\`
-    ReverseSolidus,
-    // /// An escaped solidus `/`
-    // Solidus,
-    /// An escaped backspace character (usually escaped as `\b`)
-    Backspace,
-    /// An escaped form feed character (usually escaped as `\f`)
-    FormFeed,
-    /// An escaped line feed character (usually escaped as `\n`)
-    LineFeed,
-    /// An escaped carriage return character (usually escaped as `\r`)
-    CarriageReturn,
-    /// An escaped tab character (usually escaped as `\t`)
-    Tab,
-    /// An escaped ASCII plane control character (usually escaped as
-    /// `\u00XX` where `XX` are two hex characters)
-    AsciiControl(u8),
-}
-
-impl CharEscape {
-    #[inline]
-    fn from_escape_table(escape: u8, byte: u8) -> CharEscape {
-        match escape {
-            self::BB => CharEscape::Backspace,
-            self::TT => CharEscape::Tab,
-            self::NN => CharEscape::LineFeed,
-            self::FF => CharEscape::FormFeed,
-            self::RR => CharEscape::CarriageReturn,
-            self::QU => CharEscape::Quote,
-            self::BS => CharEscape::ReverseSolidus,
-            self::UU => CharEscape::AsciiControl(byte),
-            _ => unreachable!(),
-        }
-    }
-}
-
-pub(crate) fn format_escaped_str(writer: &mut Vec<u8>, value: &str) {
-    writer.reserve(2 + value.len());
-
-    writer.push(b'"');
-
-    let rest = format_escaped_str_contents(writer, value);
-    writer.extend_from_slice(rest);
-
-    writer.push(b'"');
-}
-
-pub(crate) fn format_escaped_fmt(writer: &mut Vec<u8>, args: fmt::Arguments) {
-    writer.push(b'"');
-
-    Collect { buf: writer }
-        .write_fmt(args)
-        .expect("formatting should not error");
-
-    writer.push(b'"');
-}
-
-struct Collect<'buf> {
-    buf: &'buf mut Vec<u8>,
-}
-
-impl fmt::Write for Collect<'_> {
-    fn write_str(&mut self, s: &str) -> fmt::Result {
-        let last = format_escaped_str_contents(self.buf, s);
-        self.buf.extend(last);
-        Ok(())
-    }
-}
-
-// writes any escape sequences, and returns the suffix still needed to be written.
-fn format_escaped_str_contents<'a>(writer: &mut Vec<u8>, value: &'a str) -> &'a [u8] {
-    let bytes = value.as_bytes();
-
-    let mut start = 0;
-
-    for (i, &byte) in bytes.iter().enumerate() {
-        let escape = ESCAPE[byte as usize];
-        if escape == 0 {
-            continue;
-        }
-
-        writer.extend_from_slice(&bytes[start..i]);
-
-        let char_escape = CharEscape::from_escape_table(escape, byte);
-        write_char_escape(writer, char_escape);
-
-        start = i + 1;
-    }
-
-    &bytes[start..]
-}
-
-const BB: u8 = b'b'; // \x08
-const TT: u8 = b't'; // \x09
-const NN: u8 = b'n'; // \x0A
-const FF: u8 = b'f'; // \x0C
-const RR: u8 = b'r'; // \x0D
-const QU: u8 = b'"'; // \x22
-const BS: u8 = b'\\'; // \x5C
-const UU: u8 = b'u'; // \x00...\x1F except the ones above
-const __: u8 = 0;
-
-// Lookup table of escape sequences. A value of b'x' at index i means that byte
-// i is escaped as "\x" in JSON. A value of 0 means that byte i is not escaped.
-static ESCAPE: [u8; 256] = [
-    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
-    UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0
-    UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1
-    __, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2
-    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3
-    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4
-    __, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5
-    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6
-    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7
-    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8
-    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9
-    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A
-    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B
-    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C
-    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D
-    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E
-    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F
-];
-
-fn write_char_escape(writer: &mut Vec<u8>, char_escape: CharEscape) {
-    let s = match char_escape {
-        CharEscape::Quote => b"\\\"",
-        CharEscape::ReverseSolidus => b"\\\\",
-        // CharEscape::Solidus => b"\\/",
-        CharEscape::Backspace => b"\\b",
-        CharEscape::FormFeed => b"\\f",
-        CharEscape::LineFeed => b"\\n",
-        CharEscape::CarriageReturn => b"\\r",
-        CharEscape::Tab => b"\\t",
-        CharEscape::AsciiControl(byte) => {
-            static HEX_DIGITS: [u8; 16] = *b"0123456789abcdef";
-            let bytes = &[
-                b'\\',
-                b'u',
-                b'0',
-                b'0',
-                HEX_DIGITS[(byte >> 4) as usize],
-                HEX_DIGITS[(byte & 0xF) as usize],
-            ];
-            return writer.extend_from_slice(bytes);
-        }
-    };
-
-    writer.extend_from_slice(s);
-}
--- a/libs/proxy/json/src/value.rs
+++ b/libs/proxy/json/src/value.rs
@@ -1,168 +0,0 @@
-use core::fmt;
-use std::collections::{BTreeMap, HashMap};
-
-use crate::str::{format_escaped_fmt, format_escaped_str};
-use crate::{KeyEncoder, ObjectSer, ValueSer, value_as_list, value_as_object};
-
-/// Write a value to the underlying json representation.
-pub trait ValueEncoder {
-    fn encode(self, v: ValueSer<'_>);
-}
-
-pub(crate) fn write_int(x: impl itoa::Integer, b: &mut Vec<u8>) {
-    b.extend_from_slice(itoa::Buffer::new().format(x).as_bytes());
-}
-
-pub(crate) fn write_float(x: impl ryu::Float, b: &mut Vec<u8>) {
-    b.extend_from_slice(ryu::Buffer::new().format(x).as_bytes());
-}
-
-impl<T: Copy + ValueEncoder> ValueEncoder for &T {
-    #[inline]
-    fn encode(self, v: ValueSer<'_>) {
-        T::encode(*self, v);
-    }
-}
-
-impl ValueEncoder for &str {
-    #[inline]
-    fn encode(self, v: ValueSer<'_>) {
-        format_escaped_str(v.buf, self);
-        v.finish();
-    }
-}
-
-impl ValueEncoder for fmt::Arguments<'_> {
-    #[inline]
-    fn encode(self, v: ValueSer<'_>) {
-        if let Some(s) = self.as_str() {
-            format_escaped_str(v.buf, s);
-        } else {
-            format_escaped_fmt(v.buf, self);
-        }
-        v.finish();
-    }
-}
-
-macro_rules! int {
-    [$($t:ty),*] => {
-        $(
-            impl ValueEncoder for $t {
-                #[inline]
-                fn encode(self, v: ValueSer<'_>) {
-                    write_int(self, v.buf);
-                    v.finish();
-                }
-            }
-        )*
-    };
-}
-
-int![u8, u16, u32, u64, usize, u128];
-int![i8, i16, i32, i64, isize, i128];
-
-macro_rules! float {
-    [$($t:ty),*] => {
-        $(
-            impl ValueEncoder for $t {
-                #[inline]
-                fn encode(self, v: ValueSer<'_>) {
-                    write_float(self, v.buf);
-                    v.finish();
-                }
-            }
-        )*
-    };
-}
-
-float![f32, f64];
-
-impl ValueEncoder for bool {
-    #[inline]
-    fn encode(self, v: ValueSer<'_>) {
-        v.write_raw_json(if self { b"true" } else { b"false" });
-    }
-}
-
-impl<T: ValueEncoder> ValueEncoder for Option<T> {
-    #[inline]
-    fn encode(self, v: ValueSer<'_>) {
-        match self {
-            Some(value) => value.encode(v),
-            None => Null.encode(v),
-        }
-    }
-}
-
-impl KeyEncoder for &str {
-    #[inline]
-    fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a> {
-        let obj = &mut *obj;
-        obj.entry_inner(|b| format_escaped_str(b, self))
-    }
-}
-
-impl KeyEncoder for fmt::Arguments<'_> {
-    #[inline]
-    fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a> {
-        if let Some(key) = self.as_str() {
-            obj.entry_inner(|b| format_escaped_str(b, key))
-        } else {
-            obj.entry_inner(|b| format_escaped_fmt(b, self))
-        }
-    }
-}
-
-/// Represents the JSON null value.
-pub struct Null;
-
-impl ValueEncoder for Null {
-    #[inline]
-    fn encode(self, v: ValueSer<'_>) {
-        v.write_raw_json(b"null");
-    }
-}
-
-impl<T: ValueEncoder> ValueEncoder for Vec<T> {
-    #[inline]
-    fn encode(self, v: ValueSer<'_>) {
-        value_as_list!(|v| {
-            for t in self {
-                v.entry().value(t);
-            }
-        });
-    }
-}
-
-impl<T: Copy + ValueEncoder> ValueEncoder for &[T] {
-    #[inline]
-    fn encode(self, v: ValueSer<'_>) {
-        value_as_list!(|v| {
-            for t in self {
-                v.entry().value(t);
-            }
-        });
-    }
-}
-
-impl<K: KeyEncoder, V: ValueEncoder, S> ValueEncoder for HashMap<K, V, S> {
-    #[inline]
-    fn encode(self, o: ValueSer<'_>) {
-        value_as_object!(|o| {
-            for (k, v) in self {
-                o.entry(k, v);
-            }
-        });
-    }
-}
-
-impl<K: KeyEncoder, V: ValueEncoder> ValueEncoder for BTreeMap<K, V> {
-    #[inline]
-    fn encode(self, o: ValueSer<'_>) {
-        value_as_object!(|o| {
-            for (k, v) in self {
-                o.entry(k, v);
-            }
-        });
-    }
-}
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -13,7 +13,6 @@ aws-smithy-async.workspace = true
 aws-smithy-types.workspace = true
 aws-config.workspace = true
 aws-sdk-s3.workspace = true
-base64.workspace = true
 bytes.workspace = true
 camino = { workspace = true, features = ["serde1"] }
 humantime-serde.workspace = true
@@ -42,9 +41,6 @@ http-body-util.workspace = true
 itertools.workspace = true
 sync_wrapper = { workspace = true, features = ["futures"] }

-byteorder = "1.4"
-rand = "0.8.5"
-
 [dev-dependencies]
 camino-tempfile.workspace = true
 test-context.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -14,25 +14,17 @@ use anyhow::{Context, Result, anyhow};
 use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
 use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions};
 use azure_storage::StorageCredentials;
-use azure_storage_blobs::blob::BlobBlockType;
-use azure_storage_blobs::blob::BlockList;
+use azure_storage_blobs::blob::operations::GetBlobBuilder;
 use azure_storage_blobs::blob::{Blob, CopyStatus};
 use azure_storage_blobs::container::operations::ListBlobsBuilder;
-use azure_storage_blobs::prelude::ClientBuilder;
-use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
-use base64::{Engine as _, engine::general_purpose::URL_SAFE};
-use byteorder::{BigEndian, ByteOrder};
+use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient};
 use bytes::Bytes;
-use camino::Utf8Path;
 use futures::FutureExt;
 use futures::future::Either;
 use futures::stream::Stream;
 use futures_util::{StreamExt, TryStreamExt};
 use http_types::{StatusCode, Url};
 use scopeguard::ScopeGuard;
-use tokio::fs::File;
-use tokio::io::AsyncReadExt;
-use tokio::io::AsyncSeekExt;
 use tokio_util::sync::CancellationToken;
 use tracing::debug;
 use utils::backoff;
@@ -59,9 +51,6 @@ pub struct AzureBlobStorage {

    // Alternative timeout used for metadata objects which are expected to be small
    pub small_timeout: Duration,
-    /* BEGIN_HADRON */
-    pub put_block_size_mb: Option<usize>,
-    /* END_HADRON */
 }

 impl AzureBlobStorage {
@@ -118,9 +107,6 @@ impl AzureBlobStorage {
            concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
            timeout,
            small_timeout,
-            /* BEGIN_HADRON */
-            put_block_size_mb: azure_config.put_block_size_mb,
-            /* END_HADRON */
        })
    }

@@ -597,137 +583,31 @@ impl RemoteStorage for AzureBlobStorage {

        let started_at = start_measuring_requests(kind);

-        let mut metadata_map = metadata.unwrap_or([].into());
-        let timeline_file_path = metadata_map.0.remove("databricks_azure_put_block");
-
-        /* BEGIN_HADRON */
-        let op = async move {
+        let op = async {
            let blob_client = self.client.blob_client(self.relative_path_to_name(to));
-            let put_block_size = self.put_block_size_mb.unwrap_or(0) * 1024 * 1024;
-            if timeline_file_path.is_none() || put_block_size == 0 {
-                // Use put_block_blob directly.
-                let from: Pin<
-                    Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>,
-                > = Box::pin(from);
-                let from = NonSeekableStream::new(from, data_size_bytes);
-                let body = azure_core::Body::SeekableStream(Box::new(from));

-                let mut builder = blob_client.put_block_blob(body);
-                if !metadata_map.0.is_empty() {
-                    builder = builder.metadata(to_azure_metadata(metadata_map));
-                }
-                let fut = builder.into_future();
-                let fut = tokio::time::timeout(self.timeout, fut);
-                let result = fut.await;
-                match result {
-                    Ok(Ok(_response)) => return Ok(()),
-                    Ok(Err(azure)) => return Err(azure.into()),
-                    Err(_timeout) => return Err(TimeoutOrCancel::Timeout.into()),
-                };
-            }
-            // Upload chunks concurrently using Put Block.
-            // Each PutBlock uploads put_block_size bytes of the file.
-            let mut upload_futures: Vec<tokio::task::JoinHandle<Result<(), azure_core::Error>>> =
-                vec![];
-            let mut block_list = BlockList::default();
-            let mut start_bytes = 0u64;
-            let mut remaining_bytes = data_size_bytes;
-            let mut block_list_count = 0;
+            let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
+                Box::pin(from);

-            while remaining_bytes > 0 {
-                let block_size = std::cmp::min(remaining_bytes, put_block_size);
-                let end_bytes = start_bytes + block_size as u64;
-                let block_id = block_list_count;
-                let timeout = self.timeout;
-                let blob_client = blob_client.clone();
-                let timeline_file = timeline_file_path.clone().unwrap().clone();
+            let from = NonSeekableStream::new(from, data_size_bytes);

-                let mut encoded_block_id = [0u8; 8];
-                BigEndian::write_u64(&mut encoded_block_id, block_id);
-                URL_SAFE.encode(encoded_block_id);
+            let body = azure_core::Body::SeekableStream(Box::new(from));

-                // Put one block.
-                let part_fut = async move {
-                    let mut file = File::open(Utf8Path::new(&timeline_file.clone())).await?;
-                    file.seek(io::SeekFrom::Start(start_bytes)).await?;
-                    let limited_reader = file.take(block_size as u64);
-                    let file_chunk_stream =
-                        tokio_util::io::ReaderStream::with_capacity(limited_reader, 1024 * 1024);
-                    let file_chunk_stream_pin: Pin<
-                        Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>,
-                    > = Box::pin(file_chunk_stream);
-                    let stream_wrapper = NonSeekableStream::new(file_chunk_stream_pin, block_size);
-                    let body = azure_core::Body::SeekableStream(Box::new(stream_wrapper));
-                    // Azure put block takes URL-encoded block ids and all blocks must have the same byte length.
-                    // https://learn.microsoft.com/en-us/rest/api/storageservices/put-block?tabs=microsoft-entra-id#uri-parameters
-                    let builder = blob_client.put_block(encoded_block_id.to_vec(), body);
-                    let fut = builder.into_future();
-                    let fut = tokio::time::timeout(timeout, fut);
-                    let result = fut.await;
-                    tracing::debug!(
-                        "azure put block id-{} size {} start {} end {} file {} response {:#?}",
-                        block_id,
-                        block_size,
-                        start_bytes,
-                        end_bytes,
-                        timeline_file,
-                        result
-                    );
-                    match result {
-                        Ok(Ok(_response)) => Ok(()),
-                        Ok(Err(azure)) => Err(azure),
-                        Err(_timeout) => Err(azure_core::Error::new(
-                            azure_core::error::ErrorKind::Io,
-                            std::io::Error::new(
-                                std::io::ErrorKind::TimedOut,
-                                "Operation timed out",
-                            ),
-                        )),
-                    }
-                };
-                upload_futures.push(tokio::spawn(part_fut));
+            let mut builder = blob_client.put_block_blob(body);

-                block_list_count += 1;
-                remaining_bytes -= block_size;
-                start_bytes += block_size as u64;
-
-                block_list
-                    .blocks
-                    .push(BlobBlockType::Uncommitted(encoded_block_id.to_vec().into()));
+            if let Some(metadata) = metadata {
+                builder = builder.metadata(to_azure_metadata(metadata));
            }

-            tracing::debug!(
-                "azure put blocks {} total MB: {} chunk size MB: {}",
-                block_list_count,
-                data_size_bytes / 1024 / 1024,
-                put_block_size / 1024 / 1024
-            );
-            // Wait for all blocks to be uploaded.
-            let upload_results = futures::future::try_join_all(upload_futures).await;
-            if upload_results.is_err() {
-                return Err(anyhow::anyhow!(format!(
-                    "Failed to upload all blocks {:#?}",
-                    upload_results.unwrap_err()
-                )));
-            }
-
-            // Commit the blocks.
-            let mut builder = blob_client.put_block_list(block_list);
-            if !metadata_map.0.is_empty() {
-                builder = builder.metadata(to_azure_metadata(metadata_map));
-            }
            let fut = builder.into_future();
            let fut = tokio::time::timeout(self.timeout, fut);
-            let result = fut.await;
-            tracing::debug!("azure put block list response {:#?}", result);

-            match result {
+            match fut.await {
                Ok(Ok(_response)) => Ok(()),
                Ok(Err(azure)) => Err(azure.into()),
                Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
            }
        };
-        /* END_HADRON */

        let res = tokio::select! {
            res = op => res,
@@ -742,6 +622,7 @@ impl RemoteStorage for AzureBlobStorage {
        crate::metrics::BUCKET_METRICS
            .req_seconds
            .observe_elapsed(kind, outcome, started_at);
+
        res
    }

--- a/libs/remote_storage/src/config.rs
+++ b/libs/remote_storage/src/config.rs
@@ -195,19 +195,8 @@ pub struct AzureConfig {
    pub max_keys_per_list_response: Option<i32>,
    #[serde(default = "default_azure_conn_pool_size")]
    pub conn_pool_size: usize,
-    /* BEGIN_HADRON */
-    #[serde(default = "default_azure_put_block_size_mb")]
-    pub put_block_size_mb: Option<usize>,
-    /* END_HADRON */
 }

-/* BEGIN_HADRON */
-fn default_azure_put_block_size_mb() -> Option<usize> {
-    // Disable parallel upload by default.
-    Some(0)
-}
-/* END_HADRON */
-
 fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize {
    NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap()
 }
@@ -224,9 +213,6 @@ impl Debug for AzureConfig {
                "max_keys_per_list_response",
                &self.max_keys_per_list_response,
            )
-            /* BEGIN_HADRON */
-            .field("put_block_size_mb", &self.put_block_size_mb)
-            /* END_HADRON */
            .finish()
    }
 }
@@ -366,7 +352,6 @@ timeout = '5s'";
    upload_storage_class = 'INTELLIGENT_TIERING'
    timeout = '7s'
    conn_pool_size = 8
-    put_block_size_mb = 1024
    ";

        let config = parse(toml).unwrap();
@@ -382,9 +367,6 @@ timeout = '5s'";
                    concurrency_limit: default_remote_storage_azure_concurrency_limit(),
                    max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
                    conn_pool_size: 8,
-                    /* BEGIN_HADRON */
-                    put_block_size_mb: Some(1024),
-                    /* END_HADRON */
                }),
                timeout: Duration::from_secs(7),
                small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -732,15 +732,9 @@ impl GenericRemoteStorage {
        })
    }

-    /* BEGIN_HADRON */
-    pub fn unreliable_wrapper(s: Self, fail_first: u64, fail_probability: u64) -> Self {
-        Self::Unreliable(Arc::new(UnreliableWrapper::new(
-            s,
-            fail_first,
-            fail_probability,
-        )))
+    pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self {
+        Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
    }
-    /* END_HADRON */

    /// See [`RemoteStorage::upload`], which this method calls with `None` as metadata.
    pub async fn upload_storage_object(
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -1,8 +1,6 @@
 //! This module provides a wrapper around a real RemoteStorage implementation that
 //! causes the first N attempts at each upload or download operatio to fail. For
 //! testing purposes.
-use rand::Rng;
-use std::cmp;
 use std::collections::HashMap;
 use std::collections::hash_map::Entry;
 use std::num::NonZeroU32;
@@ -27,12 +25,6 @@ pub struct UnreliableWrapper {

    // Tracks how many failed attempts of each operation has been made.
    attempts: Mutex<HashMap<RemoteOp, u64>>,
-
-    /* BEGIN_HADRON */
-    // This the probability of failure for each operation, ranged from [0, 100].
-    // The probability is default to 100, which means that all operations will fail.
-    attempt_failure_probability: u64,
-    /* END_HADRON */
 }

 /// Used to identify retries of different unique operation.
@@ -48,11 +40,7 @@ enum RemoteOp {
 }

 impl UnreliableWrapper {
-    pub fn new(
-        inner: crate::GenericRemoteStorage,
-        attempts_to_fail: u64,
-        attempt_failure_probability: u64,
-    ) -> Self {
+    pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
        assert!(attempts_to_fail > 0);
        let inner = match inner {
            GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
@@ -63,11 +51,9 @@ impl UnreliableWrapper {
                panic!("Can't wrap unreliable wrapper unreliably")
            }
        };
-        let actual_attempt_failure_probability = cmp::min(attempt_failure_probability, 100);
        UnreliableWrapper {
            inner,
            attempts_to_fail,
-            attempt_failure_probability: actual_attempt_failure_probability,
            attempts: Mutex::new(HashMap::new()),
        }
    }
@@ -80,7 +66,6 @@ impl UnreliableWrapper {
    ///
    fn attempt(&self, op: RemoteOp) -> anyhow::Result<u64> {
        let mut attempts = self.attempts.lock().unwrap();
-        let mut rng = rand::thread_rng();

        match attempts.entry(op) {
            Entry::Occupied(mut e) => {
@@ -90,19 +75,15 @@ impl UnreliableWrapper {
                    *p
                };

-                /* BEGIN_HADRON */
-                // If there are more attempts to fail, fail the request by probability.
-                if (attempts_before_this < self.attempts_to_fail)
-                    && (rng.gen_range(0..=100) < self.attempt_failure_probability)
-                {
+                if attempts_before_this >= self.attempts_to_fail {
+                    // let it succeed
+                    e.remove();
+                    Ok(attempts_before_this)
+                } else {
                    let error =
                        anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
                    Err(error)
-                } else {
-                    e.remove();
-                    Ok(attempts_before_this)
                }
-                /* END_HADRON */
            }
            Entry::Vacant(e) => {
                let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
--- a/libs/remote_storage/tests/common/mod.rs
+++ b/libs/remote_storage/tests/common/mod.rs
@@ -165,42 +165,10 @@ pub(crate) async fn upload_remote_data(

            let (data, data_len) =
                upload_stream(format!("remote blob data {i}").into_bytes().into());
-
-            /* BEGIN_HADRON */
-            let mut metadata = None;
-            if matches!(&*task_client, GenericRemoteStorage::AzureBlob(_)) {
-                let file_path = "/tmp/dbx_upload_tmp_file.txt";
-                {
-                    // Open the file in append mode
-                    let mut file = std::fs::OpenOptions::new()
-                        .append(true)
-                        .create(true) // Create the file if it doesn't exist
-                        .open(file_path)?;
-                    // Append some bytes to the file
-                    std::io::Write::write_all(
-                        &mut file,
-                        &format!("remote blob data {i}").into_bytes(),
-                    )?;
-                    file.sync_all()?;
-                }
-                metadata = Some(remote_storage::StorageMetadata::from([(
-                    "databricks_azure_put_block",
-                    file_path,
-                )]));
-            }
-            /* END_HADRON */
-
            task_client
-                .upload(data, data_len, &blob_path, metadata, &cancel)
+                .upload(data, data_len, &blob_path, None, &cancel)
                .await?;

-            // TODO: Check upload is using the put_block upload.
-            // We cannot consume data here since data is moved inside the upload.
-            // let total_bytes = data.fold(0, |acc, chunk| async move {
-            //     acc + chunk.map(|bytes| bytes.len()).unwrap_or(0)
-            // }).await;
-            // assert_eq!(total_bytes, data_len);
-
            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
        });
    }
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -219,9 +219,6 @@ async fn create_azure_client(
            concurrency_limit: NonZeroUsize::new(100).unwrap(),
            max_keys_per_list_response,
            conn_pool_size: 8,
-            /* BEGIN_HADRON */
-            put_block_size_mb: Some(1),
-            /* END_HADRON */
        }),
        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
        small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -221,7 +221,7 @@ pub struct TimelineMembershipSwitchRequest {
 pub struct TimelineMembershipSwitchResponse {
    pub previous_conf: Configuration,
    pub current_conf: Configuration,
-    pub last_log_term: Term,
+    pub term: Term,
    pub flush_lsn: Lsn,
 }

--- a/libs/utils/src/env.rs
+++ b/libs/utils/src/env.rs
@@ -44,62 +44,3 @@ where
        }
    }
 }
-
-/* BEGIN_HADRON */
-pub enum DeploymentMode {
-    Dev,
-    Staging,
-    Prod,
-}
-
-pub fn get_deployment_mode() -> Option<DeploymentMode> {
-    match std::env::var("DEPLOYMENT_MODE") {
-        Ok(env) => match env.as_str() {
-            "development" => Some(DeploymentMode::Dev),
-            "staging" => Some(DeploymentMode::Staging),
-            "production" => Some(DeploymentMode::Prod),
-            _ => {
-                tracing::error!("Unexpected DEPLOYMENT_MODE: {}", env);
-                None
-            }
-        },
-        Err(_) => {
-            tracing::error!("DEPLOYMENT_MODE not set");
-            None
-        }
-    }
-}
-
-pub fn is_dev_or_staging() -> bool {
-    matches!(
-        get_deployment_mode(),
-        Some(DeploymentMode::Dev) | Some(DeploymentMode::Staging)
-    )
-}
-
-pub enum TestingMode {
-    Chaos,
-    Stress,
-}
-
-pub fn get_test_mode() -> Option<TestingMode> {
-    match std::env::var("HADRON_TEST_MODE") {
-        Ok(env) => match env.as_str() {
-            "chaos" => Some(TestingMode::Chaos),
-            "stress" => Some(TestingMode::Stress),
-            _ => {
-                tracing::error!("Unexpected HADRON_TEST_MODE: {}", env);
-                None
-            }
-        },
-        Err(_) => {
-            tracing::error!("HADRON_TEST_MODE not set");
-            None
-        }
-    }
-}
-
-pub fn is_chaos_testing() -> bool {
-    matches!(get_test_mode(), Some(TestingMode::Chaos))
-}
-/* END_HADRON */
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -99,8 +99,6 @@ pub mod elapsed_accum;
 #[cfg(target_os = "linux")]
 pub mod linux_socket_ioctl;

-pub mod metrics_collector;
-
 // Re-export used in macro. Avoids adding git-version as dep in target crates.
 #[doc(hidden)]
 pub use git_version;
--- a/libs/utils/src/metrics_collector.rs
+++ b/libs/utils/src/metrics_collector.rs
@@ -1,75 +0,0 @@
-use std::{
-    sync::{Arc, RwLock},
-    time::{Duration, Instant},
-};
-
-use metrics::{IntGauge, proto::MetricFamily, register_int_gauge};
-use once_cell::sync::Lazy;
-
-pub static METRICS_STALE_MILLIS: Lazy<IntGauge> = Lazy::new(|| {
-    register_int_gauge!(
-        "metrics_metrics_stale_milliseconds",
-        "The current metrics stale time in milliseconds"
-    )
-    .expect("failed to define a metric")
-});
-
-#[derive(Debug)]
-pub struct CollectedMetrics {
-    pub metrics: Vec<MetricFamily>,
-    pub collected_at: Instant,
-}
-
-impl CollectedMetrics {
-    fn new(metrics: Vec<MetricFamily>) -> Self {
-        Self {
-            metrics,
-            collected_at: Instant::now(),
-        }
-    }
-}
-
-#[derive(Debug)]
-pub struct MetricsCollector {
-    last_collected: RwLock<Arc<CollectedMetrics>>,
-}
-
-impl MetricsCollector {
-    pub fn new() -> Self {
-        Self {
-            last_collected: RwLock::new(Arc::new(CollectedMetrics::new(vec![]))),
-        }
-    }
-
-    #[tracing::instrument(name = "metrics_collector", skip_all)]
-    pub fn run_once(&self, cache_metrics: bool) -> Arc<CollectedMetrics> {
-        let started = Instant::now();
-        let metrics = metrics::gather();
-        let collected = Arc::new(CollectedMetrics::new(metrics));
-        if cache_metrics {
-            let mut guard = self.last_collected.write().unwrap();
-            *guard = collected.clone();
-        }
-        tracing::info!(
-            "Collected {} metric families in {} ms",
-            collected.metrics.len(),
-            started.elapsed().as_millis()
-        );
-        collected
-    }
-
-    pub fn last_collected(&self) -> Arc<CollectedMetrics> {
-        self.last_collected.read().unwrap().clone()
-    }
-}
-
-impl Default for MetricsCollector {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-// Interval for metrics collection. Currently hard-coded to be the same as the metrics scape interval from the obs agent
-pub static METRICS_COLLECTION_INTERVAL: Duration = Duration::from_secs(30);
-
-pub static METRICS_COLLECTOR: Lazy<MetricsCollector> = Lazy::new(MetricsCollector::default);
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -49,6 +49,12 @@ pub struct TenantShardId {
    pub shard_count: ShardCount,
 }

+impl std::fmt::Display for ShardCount {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
 impl ShardCount {
    pub const MAX: Self = Self(u8::MAX);
    pub const MIN: Self = Self(0);
@@ -171,12 +177,6 @@ impl std::fmt::Display for ShardNumber {
    }
 }

-impl std::fmt::Display for ShardCount {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        self.0.fmt(f)
-    }
-}
-
 impl std::fmt::Display for ShardSlug<'_> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -428,12 +428,6 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
        shard_number: 0,
    };

-    let empty_wal_rate_limiter = crate::bindings::WalRateLimiter {
-        should_limit: crate::bindings::pg_atomic_uint32 { value: 0 },
-        sent_bytes: 0,
-        last_recorded_time_us: 0,
-    };
-
    crate::bindings::WalproposerShmemState {
        propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 },
        donor_name: [0; 64],
@@ -447,7 +441,6 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
        num_shards: 0,
        replica_promote: false,
        min_ps_feedback: empty_feedback,
-        wal_rate_limiter: empty_wal_rate_limiter,
    }
 }

--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -54,6 +54,7 @@ pageserver_api.workspace = true
 pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
 pageserver_compaction.workspace = true
 pageserver_page_api.workspace = true
+peekable.workspace = true
 pem.workspace = true
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
@@ -66,6 +67,7 @@ postgres-types.workspace = true
 posthog_client_lite.workspace = true
 pprof.workspace = true
 pq_proto.workspace = true
+prost.workspace = true
 rand.workspace = true
 range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
@@ -112,7 +114,6 @@ twox-hash.workspace = true
 procfs.workspace = true

 [dev-dependencies]
-base64.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,4 +1,4 @@
-use std::collections::{BTreeMap, HashMap};
+use std::collections::HashMap;
 use std::error::Error as _;
 use std::time::Duration;

@@ -251,70 +251,6 @@ impl Client {
        Ok(())
    }

-    pub async fn tenant_timeline_compact(
-        &self,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        force_image_layer_creation: bool,
-        must_force_image_layer_creation: bool,
-        scheduled: bool,
-        wait_until_done: bool,
-    ) -> Result<()> {
-        let mut path = reqwest::Url::parse(&format!(
-            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/compact",
-            self.mgmt_api_endpoint
-        ))
-        .expect("Cannot build URL");
-
-        if force_image_layer_creation {
-            path.query_pairs_mut()
-                .append_pair("force_image_layer_creation", "true");
-        }
-
-        if must_force_image_layer_creation {
-            path.query_pairs_mut()
-                .append_pair("must_force_image_layer_creation", "true");
-        }
-
-        if scheduled {
-            path.query_pairs_mut().append_pair("scheduled", "true");
-        }
-        if wait_until_done {
-            path.query_pairs_mut()
-                .append_pair("wait_until_scheduled_compaction_done", "true");
-            path.query_pairs_mut()
-                .append_pair("wait_until_uploaded", "true");
-        }
-        self.request(Method::PUT, path, ()).await?;
-        Ok(())
-    }
-
-    /* BEGIN_HADRON */
-    pub async fn tenant_timeline_describe(
-        &self,
-        tenant_shard_id: &TenantShardId,
-        timeline_id: &TimelineId,
-    ) -> Result<TimelineInfo> {
-        let mut path = reqwest::Url::parse(&format!(
-            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
-            self.mgmt_api_endpoint
-        ))
-        .expect("Cannot build URL");
-        path.query_pairs_mut()
-            .append_pair("include-image-consistent-lsn", "true");
-
-        let response: reqwest::Response = self.request(Method::GET, path, ()).await?;
-        let body = response.json().await.map_err(Error::ReceiveBody)?;
-        Ok(body)
-    }
-
-    pub async fn list_tenant_visible_size(&self) -> Result<BTreeMap<TenantShardId, u64>> {
-        let uri = format!("{}/v1/list_tenant_visible_size", self.mgmt_api_endpoint);
-        let resp = self.get(&uri).await?;
-        resp.json().await.map_err(Error::ReceiveBody)
-    }
-    /* END_HADRON */
-
    pub async fn tenant_scan_remote_storage(
        &self,
        tenant_id: TenantId,
--- a/pageserver/client_grpc/Cargo.toml
+++ b/pageserver/client_grpc/Cargo.toml
@@ -1,8 +1,7 @@
 [package]
 name = "pageserver_client_grpc"
 version = "0.1.0"
-edition.workspace = true
-license.workspace = true
+edition = "2024"

 [features]
 testing = ["pageserver_api/testing"]
@@ -11,14 +10,35 @@ testing = ["pageserver_api/testing"]
 anyhow.workspace = true
 arc-swap.workspace = true
 bytes.workspace = true
-compute_api.workspace = true
 futures.workspace = true
-pageserver_api.workspace = true
-pageserver_page_api.workspace = true
-tokio.workspace = true
-tokio-stream.workspace = true
-tokio-util.workspace = true
+http.workspace = true
+thiserror.workspace = true
 tonic.workspace = true
 tracing.workspace = true
+tokio = { version = "1.43.1", features = [
+    "full",
+    "macros",
+    "net",
+    "io-util",
+    "rt",
+    "rt-multi-thread",
+] }
+uuid = { version = "1", features = ["v4"] }
+tower = { version = "0.4", features = ["timeout", "util"] }
+rand = "0.8"
+tokio-util = { version = "0.7", features = ["compat"] }
+hyper-util = "0.1.9"
+hyper = "1.6.0"
+metrics.workspace = true
+priority-queue = "2.3.1"
+scopeguard.workspace = true
+async-trait = { version = "0.1" }
+tokio-stream = "0.1"
+dashmap = "5"
+chrono = { version = "0.4", features = ["serde"] }
+compute_api.workspace = true
+
+
+pageserver_page_api.workspace = true
+pageserver_api.workspace = true
 utils.workspace = true
-workspace_hack.workspace = true
--- a/pageserver/client_grpc/src/client.rs
+++ b/pageserver/client_grpc/src/client.rs
@@ -6,7 +6,6 @@ use anyhow::anyhow;
 use arc_swap::ArcSwap;
 use futures::stream::FuturesUnordered;
 use futures::{FutureExt as _, StreamExt as _};
-use tonic::codec::CompressionEncoding;
 use tracing::instrument;

 use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
@@ -63,8 +62,6 @@ pub struct PageserverClient {
    timeline_id: TimelineId,
    /// The JWT auth token for this tenant, if any.
    auth_token: Option<String>,
-    /// The compression to use, if any.
-    compression: Option<CompressionEncoding>,
    /// The shards for this tenant.
    shards: ArcSwap<Shards>,
    /// The retry configuration.
@@ -79,20 +76,12 @@ impl PageserverClient {
        timeline_id: TimelineId,
        shard_spec: ShardSpec,
        auth_token: Option<String>,
-        compression: Option<CompressionEncoding>,
    ) -> anyhow::Result<Self> {
-        let shards = Shards::new(
-            tenant_id,
-            timeline_id,
-            shard_spec,
-            auth_token.clone(),
-            compression,
-        )?;
+        let shards = Shards::new(tenant_id, timeline_id, shard_spec, auth_token.clone())?;
        Ok(Self {
            tenant_id,
            timeline_id,
            auth_token,
-            compression,
            shards: ArcSwap::new(Arc::new(shards)),
            retry: Retry,
        })
@@ -104,33 +93,11 @@ impl PageserverClient {
    /// TODO: verify that in-flight requests are allowed to complete, and that the old pools are
    /// properly spun down and dropped afterwards.
    pub fn update_shards(&self, shard_spec: ShardSpec) -> anyhow::Result<()> {
-        // Validate the shard spec. We should really use `ArcSwap::rcu` for this, to avoid races
-        // with concurrent updates, but that involves creating a new `Shards` on every attempt,
-        // which spins up a bunch of Tokio tasks and such. These should already be checked elsewhere
-        // in the stack, and if they're violated then we already have problems elsewhere, so a
-        // best-effort but possibly-racy check is okay here.
-        let old = self.shards.load_full();
-        if shard_spec.count < old.count {
-            return Err(anyhow!(
-                "can't reduce shard count from {} to {}",
-                old.count,
-                shard_spec.count
-            ));
-        }
-        if !old.count.is_unsharded() && shard_spec.stripe_size != old.stripe_size {
-            return Err(anyhow!(
-                "can't change stripe size from {} to {}",
-                old.stripe_size,
-                shard_spec.stripe_size
-            ));
-        }
-
        let shards = Shards::new(
            self.tenant_id,
            self.timeline_id,
            shard_spec,
            self.auth_token.clone(),
-            self.compression,
        )?;
        self.shards.store(Arc::new(shards));
        Ok(())
@@ -143,7 +110,7 @@ impl PageserverClient {
        req: page_api::CheckRelExistsRequest,
    ) -> tonic::Result<page_api::CheckRelExistsResponse> {
        self.retry
-            .with(async |_| {
+            .with(async || {
                // Relation metadata is only available on shard 0.
                let mut client = self.shards.load_full().get_zero().client().await?;
                client.check_rel_exists(req).await
@@ -158,7 +125,7 @@ impl PageserverClient {
        req: page_api::GetDbSizeRequest,
    ) -> tonic::Result<page_api::GetDbSizeResponse> {
        self.retry
-            .with(async |_| {
+            .with(async || {
                // Relation metadata is only available on shard 0.
                let mut client = self.shards.load_full().get_zero().client().await?;
                client.get_db_size(req).await
@@ -166,9 +133,8 @@ impl PageserverClient {
            .await
    }

-    /// Fetches pages. The `request_id` must be unique across all in-flight requests, and the
-    /// `attempt` must be 0 (incremented on retry). Automatically splits requests that straddle
-    /// shard boundaries, and assembles the responses.
+    /// Fetches pages. The `request_id` must be unique across all in-flight requests. Automatically
+    /// splits requests that straddle shard boundaries, and assembles the responses.
    ///
    /// Unlike `page_api::Client`, this automatically converts `status_code` into `tonic::Status`
    /// errors. All responses will have `GetPageStatusCode::Ok`.
@@ -188,10 +154,6 @@ impl PageserverClient {
        if req.block_numbers.is_empty() {
            return Err(tonic::Status::invalid_argument("no block number"));
        }
-        // The request attempt must be 0. The client will increment it internally.
-        if req.request_id.attempt != 0 {
-            return Err(tonic::Status::invalid_argument("request attempt must be 0"));
-        }

        // The shards may change while we're fetching pages. We execute the request using a stable
        // view of the shards (especially important for requests that span shards), but retry the
@@ -202,11 +164,7 @@ impl PageserverClient {
        // TODO: the gRPC server and client doesn't yet properly support shard splits. Revisit this
        // once we figure out how to handle these.
        self.retry
-            .with(async |attempt| {
-                let mut req = req.clone();
-                req.request_id.attempt = attempt as u32;
-                Self::get_page_with_shards(req, &self.shards.load_full()).await
-            })
+            .with(async || Self::get_page_with_shards(req.clone(), &self.shards.load_full()).await)
            .await
    }

@@ -218,7 +176,7 @@ impl PageserverClient {
    ) -> tonic::Result<page_api::GetPageResponse> {
        // Fast path: request is for a single shard.
        if let Some(shard_id) =
-            GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size)
+            GetPageSplitter::is_single_shard(&req, shards.count, shards.stripe_size)
        {
            return Self::get_page_with_shard(req, shards.get(shard_id)?).await;
        }
@@ -235,10 +193,10 @@ impl PageserverClient {
        }

        while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? {
-            splitter.add_response(shard_id, shard_response)?;
+            splitter.add_response(shard_id, shard_response);
        }

-        splitter.get_response()
+        splitter.assemble_response()
    }

    /// Fetches pages on the given shard. Does not retry internally.
@@ -246,8 +204,9 @@ impl PageserverClient {
        req: page_api::GetPageRequest,
        shard: &Shard,
    ) -> tonic::Result<page_api::GetPageResponse> {
+        let expected = req.block_numbers.len();
        let stream = shard.stream(req.request_class.is_bulk()).await;
-        let resp = stream.send(req.clone()).await?;
+        let resp = stream.send(req).await?;

        // Convert per-request errors into a tonic::Status.
        if resp.status_code != page_api::GetPageStatusCode::Ok {
@@ -257,27 +216,11 @@ impl PageserverClient {
            ));
        }

-        // Check that we received the expected pages.
-        if req.rel != resp.rel {
-            return Err(tonic::Status::internal(format!(
-                "shard {} returned wrong relation, expected {} got {}",
-                shard.id, req.rel, resp.rel
-            )));
-        }
-        if !req
-            .block_numbers
-            .iter()
-            .copied()
-            .eq(resp.pages.iter().map(|p| p.block_number))
-        {
-            return Err(tonic::Status::internal(format!(
-                "shard {} returned wrong pages, expected {:?} got {:?}",
-                shard.id,
-                req.block_numbers,
-                resp.pages
-                    .iter()
-                    .map(|page| page.block_number)
-                    .collect::<Vec<_>>()
+        // Check that we received the expected number of pages.
+        let actual = resp.page_images.len();
+        if expected != actual {
+            return Err(tonic::Status::data_loss(format!(
+                "expected {expected} pages, got {actual}",
            )));
        }

@@ -291,7 +234,7 @@ impl PageserverClient {
        req: page_api::GetRelSizeRequest,
    ) -> tonic::Result<page_api::GetRelSizeResponse> {
        self.retry
-            .with(async |_| {
+            .with(async || {
                // Relation metadata is only available on shard 0.
                let mut client = self.shards.load_full().get_zero().client().await?;
                client.get_rel_size(req).await
@@ -306,7 +249,7 @@ impl PageserverClient {
        req: page_api::GetSlruSegmentRequest,
    ) -> tonic::Result<page_api::GetSlruSegmentResponse> {
        self.retry
-            .with(async |_| {
+            .with(async || {
                // SLRU segments are only available on shard 0.
                let mut client = self.shards.load_full().get_zero().client().await?;
                client.get_slru_segment(req).await
@@ -400,21 +343,13 @@ impl Shards {
        timeline_id: TimelineId,
        shard_spec: ShardSpec,
        auth_token: Option<String>,
-        compression: Option<CompressionEncoding>,
    ) -> anyhow::Result<Self> {
        // NB: the shard spec has already been validated when constructed.
        let mut shards = HashMap::with_capacity(shard_spec.urls.len());
        for (shard_id, url) in shard_spec.urls {
            shards.insert(
                shard_id,
-                Shard::new(
-                    url,
-                    tenant_id,
-                    timeline_id,
-                    shard_id,
-                    auth_token.clone(),
-                    compression,
-                )?,
+                Shard::new(url, tenant_id, timeline_id, shard_id, auth_token.clone())?,
            );
        }

@@ -450,8 +385,6 @@ impl Shards {
 ///   * Bulk client pool: unbounded.
 ///     * Bulk stream pool: MAX_BULK_STREAMS and MAX_BULK_STREAM_QUEUE_DEPTH.
 struct Shard {
-    /// The shard ID.
-    id: ShardIndex,
    /// Unary gRPC client pool.
    client_pool: Arc<ClientPool>,
    /// GetPage stream pool.
@@ -468,7 +401,6 @@ impl Shard {
        timeline_id: TimelineId,
        shard_id: ShardIndex,
        auth_token: Option<String>,
-        compression: Option<CompressionEncoding>,
    ) -> anyhow::Result<Self> {
        // Common channel pool for unary and stream requests. Bounded by client/stream pools.
        let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?;
@@ -480,7 +412,6 @@ impl Shard {
            timeline_id,
            shard_id,
            auth_token.clone(),
-            compression,
            Some(MAX_UNARY_CLIENTS),
        );

@@ -493,7 +424,6 @@ impl Shard {
                timeline_id,
                shard_id,
                auth_token.clone(),
-                compression,
                None, // unbounded, limited by stream pool
            ),
            Some(MAX_STREAMS),
@@ -509,7 +439,6 @@ impl Shard {
                timeline_id,
                shard_id,
                auth_token,
-                compression,
                None, // unbounded, limited by stream pool
            ),
            Some(MAX_BULK_STREAMS),
@@ -517,7 +446,6 @@ impl Shard {
        );

        Ok(Self {
-            id: shard_id,
            client_pool,
            stream_pool,
            bulk_stream_pool,
--- a/Show More
+++ b/Show More
				`@@ -1 +0,0 @@`
				`GRANT pg_signal_backend TO neon_superuser WITH ADMIN OPTION;`