Documentation and tweaks

Add stream pool
Add initial client pool
2026-05-20 22:50:38 +00:00 · 2025-07-01 17:54:41 +02:00 · 2025-07-01 17:54:41 +02:00 · 2025-07-01 17:54:41 +02:00 · 2025-07-01 17:54:41 +02:00 · 2025-07-01 17:54:41 +02:00
178 changed files with 18181 additions and 2425 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -4,6 +4,7 @@
 !Cargo.lock
 !Cargo.toml
 !Makefile
+!postgres.mk
 !rust-toolchain.toml
 !scripts/ninstall.sh
 !docker-compose/run-tests.sh
--- a/.github/workflows/build-macos.yml
+++ b/.github/workflows/build-macos.yml
@@ -94,11 +94,6 @@ jobs:
        run: |
          make "neon-pg-ext-${{ matrix.postgres-version }}" -j$(sysctl -n hw.ncpu)

-      - name: Get postgres headers ${{ matrix.postgres-version }}
-        if: steps.cache_pg.outputs.cache-hit != 'true'
-        run: |
-          make postgres-headers-${{ matrix.postgres-version }} -j$(sysctl -n hw.ncpu)
-
      - name: Upload "pg_install/${{ matrix.postgres-version }}" artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
@@ -140,6 +135,12 @@ jobs:
          name: pg_install--v17
          path: pg_install/v17

+      # `actions/download-artifact` doesn't preserve permissions:
+      # https://github.com/actions/download-artifact?tab=readme-ov-file#permission-loss
+      - name: Make pg_install/v*/bin/* executable
+        run: |
+          chmod +x pg_install/v*/bin/*
+
      - name: Cache walproposer-lib
        id: cache_walproposer_lib
        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
@@ -167,7 +168,7 @@ jobs:
      - name: Build walproposer-lib (only for v17)
        if: steps.cache_walproposer_lib.outputs.cache-hit != 'true'
        run:
-          make walproposer-lib -j$(sysctl -n hw.ncpu)
+          make walproposer-lib -j$(sysctl -n hw.ncpu) PG_INSTALL_CACHED=1

      - name: Upload "build/walproposer-lib" artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -69,7 +69,7 @@ jobs:
          submodules: true

      - name: Check for file changes
-        uses: step-security/paths-filter@v3
+        uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36  # v3.0.2
        id: files-changed
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/large_oltp_benchmark.yml
+++ b/.github/workflows/large_oltp_benchmark.yml
@@ -153,7 +153,7 @@ jobs:
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

    - name: Benchmark database maintenance
-      if: ${{ matrix.test_maintenance == 'true' }}
+      if: ${{ matrix.test_maintenance }}
      uses: ./.github/actions/run-python-test-set
      with:
        build_type: ${{ env.BUILD_TYPE }}
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -53,7 +53,7 @@ jobs:
          submodules: true

      - name: Check for Postgres changes
-        uses: step-security/paths-filter@v3
+        uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242  #v3
        id: files_changed
        with:
          token: ${{ github.token }}
--- a/.github/workflows/pre-merge-checks.yml
+++ b/.github/workflows/pre-merge-checks.yml
@@ -34,7 +34,7 @@ jobs:

      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

-      - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1
+      - uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        id: python-src
        with:
          files: |
@@ -45,7 +45,7 @@ jobs:
            poetry.lock
            pyproject.toml

-      - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1
+      - uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
        id: rust-src
        with:
          files: |
--- a/.github/workflows/proxy-benchmark.yml
+++ b/.github/workflows/proxy-benchmark.yml
@@ -60,22 +60,23 @@ jobs:
        } >> "$GITHUB_ENV"

    - name: Run proxy-bench
-      run: ./${PROXY_BENCH_PATH}/run.sh
+      run: ${PROXY_BENCH_PATH}/run.sh

    - name: Ingest Bench Results # neon repo script
-      if: success()
+      if: always()
      run: |
        mkdir -p $TEST_OUTPUT
        python $NEON_DIR/scripts/proxy_bench_results_ingest.py --out $TEST_OUTPUT

    - name: Push Metrics to Proxy perf database
-      if: success()
+      if: always()
      env:
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PROXY_TEST_RESULT_CONNSTR }}"
        REPORT_FROM: $TEST_OUTPUT
      run: $NEON_DIR/scripts/generate_and_push_perf_report.sh

    - name: Docker cleanup
+      if: always()
      run: docker compose down

    - name: Notify Failure
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,7 @@ neon.iml
 /.neon
 /integration_tests/.neon
 compaction-suite-results.*
+pgxn/neon/communicator/communicator_bindings.h

 # Coverage
 *.profraw
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -253,6 +253,17 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"

+[[package]]
+name = "atomic_enum"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99e1aca718ea7b89985790c94aad72d77533063fe00bc497bb79a7c2dae6a661"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.100",
+]
+
 [[package]]
 name = "autocfg"
 version = "1.1.0"
@@ -687,13 +698,40 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "axum"
+version = "0.7.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
+dependencies = [
+ "async-trait",
+ "axum-core 0.4.5",
+ "bytes",
+ "futures-util",
+ "http 1.1.0",
+ "http-body 1.0.0",
+ "http-body-util",
+ "itoa",
+ "matchit 0.7.3",
+ "memchr",
+ "mime",
+ "percent-encoding",
+ "pin-project-lite",
+ "rustversion",
+ "serde",
+ "sync_wrapper 1.0.1",
+ "tower 0.5.2",
+ "tower-layer",
+ "tower-service",
+]
+
 [[package]]
 name = "axum"
 version = "0.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6d6fd624c75e18b3b4c6b9caf42b1afe24437daaee904069137d8bab077be8b8"
 dependencies = [
- "axum-core",
+ "axum-core 0.5.0",
 "base64 0.22.1",
 "bytes",
 "form_urlencoded",
@@ -701,10 +739,10 @@ dependencies = [
 "http 1.1.0",
 "http-body 1.0.0",
 "http-body-util",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "itoa",
- "matchit",
+ "matchit 0.8.4",
 "memchr",
 "mime",
 "percent-encoding",
@@ -724,6 +762,26 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "axum-core"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199"
+dependencies = [
+ "async-trait",
+ "bytes",
+ "futures-util",
+ "http 1.1.0",
+ "http-body 1.0.0",
+ "http-body-util",
+ "mime",
+ "pin-project-lite",
+ "rustversion",
+ "sync_wrapper 1.0.1",
+ "tower-layer",
+ "tower-service",
+]
+
 [[package]]
 name = "axum-core"
 version = "0.5.0"
@@ -750,8 +808,8 @@ version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b"
 dependencies = [
- "axum",
- "axum-core",
+ "axum 0.8.1",
+ "axum-core 0.5.0",
 "bytes",
 "form_urlencoded",
 "futures-util",
@@ -1083,6 +1141,25 @@ version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"

+[[package]]
+name = "cbindgen"
+version = "0.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eadd868a2ce9ca38de7eeafdcec9c7065ef89b42b32f0839278d55f35c54d1ff"
+dependencies = [
+ "clap",
+ "heck 0.4.1",
+ "indexmap 2.9.0",
+ "log",
+ "proc-macro2",
+ "quote",
+ "serde",
+ "serde_json",
+ "syn 2.0.100",
+ "tempfile",
+ "toml",
+]
+
 [[package]]
 name = "cc"
 version = "1.2.16"
@@ -1209,7 +1286,7 @@ version = "4.5.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
 dependencies = [
- "heck",
+ "heck 0.5.0",
 "proc-macro2",
 "quote",
 "syn 2.0.100",
@@ -1267,6 +1344,34 @@ dependencies = [
 "unicode-width",
 ]

+[[package]]
+name = "communicator"
+version = "0.1.0"
+dependencies = [
+ "atomic_enum",
+ "axum 0.8.1",
+ "bytes",
+ "cbindgen",
+ "clashmap",
+ "http 1.1.0",
+ "libc",
+ "metrics",
+ "neon-shmem",
+ "nix 0.30.1",
+ "pageserver_client_grpc",
+ "pageserver_page_api",
+ "prometheus",
+ "prost 0.13.5",
+ "thiserror 1.0.69",
+ "tokio",
+ "tokio-pipe",
+ "tonic 0.12.3",
+ "tracing",
+ "tracing-subscriber",
+ "uring-common",
+ "utils",
+]
+
 [[package]]
 name = "compute_api"
 version = "0.1.0"
@@ -1279,6 +1384,7 @@ dependencies = [
 "remote_storage",
 "serde",
 "serde_json",
+ "url",
 "utils",
 ]

@@ -1292,7 +1398,7 @@ dependencies = [
 "aws-sdk-kms",
 "aws-sdk-s3",
 "aws-smithy-types",
- "axum",
+ "axum 0.8.1",
 "axum-extra",
 "base64 0.22.1",
 "bytes",
@@ -1316,6 +1422,7 @@ dependencies = [
 "opentelemetry",
 "opentelemetry_sdk",
 "p256 0.13.2",
+ "pageserver_page_api",
 "postgres",
 "postgres_initdb",
 "postgres_versioninfo",
@@ -1335,6 +1442,7 @@ dependencies = [
 "tokio-postgres",
 "tokio-stream",
 "tokio-util",
+ "tonic 0.13.1",
 "tower 0.5.2",
 "tower-http",
 "tower-otel",
@@ -1593,9 +1701,9 @@ dependencies = [

 [[package]]
 name = "crossbeam-utils"
-version = "0.8.19"
+version = "0.8.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"

 [[package]]
 name = "crossterm"
@@ -1935,7 +2043,7 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc"
 dependencies = [
 "darling",
 "either",
- "heck",
+ "heck 0.5.0",
 "proc-macro2",
 "quote",
 "syn 2.0.100",
@@ -2049,7 +2157,7 @@ name = "endpoint_storage"
 version = "0.0.1"
 dependencies = [
 "anyhow",
- "axum",
+ "axum 0.8.1",
 "axum-extra",
 "camino",
 "camino-tempfile",
@@ -2330,7 +2438,7 @@ dependencies = [
 "futures-core",
 "futures-sink",
 "http-body-util",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "pin-project",
 "rand 0.8.5",
@@ -2500,6 +2608,18 @@ dependencies = [
 "wasm-bindgen",
 ]

+[[package]]
+name = "getrandom"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi",
+ "wasi 0.14.2+wasi-0.2.4",
+]
+
 [[package]]
 name = "gettid"
 version = "0.1.3"
@@ -2712,6 +2832,12 @@ dependencies = [
 "http 1.1.0",
 ]

+[[package]]
+name = "heck"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
+
 [[package]]
 name = "heck"
 version = "0.5.0"
@@ -2883,9 +3009,9 @@ dependencies = [

 [[package]]
 name = "httparse"
-version = "1.8.0"
+version = "1.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904"
+checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"

 [[package]]
 name = "httpdate"
@@ -2935,9 +3061,9 @@ dependencies = [

 [[package]]
 name = "hyper"
-version = "1.4.1"
+version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05"
+checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80"
 dependencies = [
 "bytes",
 "futures-channel",
@@ -2977,7 +3103,7 @@ checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c"
 dependencies = [
 "futures-util",
 "http 1.1.0",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "rustls 0.22.4",
 "rustls-pki-types",
@@ -2992,7 +3118,7 @@ version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3203a961e5c83b6f5498933e78b6b263e208c197b63e9c6c53cc82ffd3f63793"
 dependencies = [
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "pin-project-lite",
 "tokio",
@@ -3001,20 +3127,20 @@ dependencies = [

 [[package]]
 name = "hyper-util"
-version = "0.1.7"
+version = "0.1.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9"
+checksum = "cf9f1e950e0d9d1d3c47184416723cf29c0d1f93bd8cccf37e4beb6b44f31710"
 dependencies = [
 "bytes",
 "futures-channel",
 "futures-util",
 "http 1.1.0",
 "http-body 1.0.0",
- "hyper 1.4.1",
+ "hyper 1.6.0",
+ "libc",
 "pin-project-lite",
 "socket2",
 "tokio",
- "tower 0.4.13",
 "tower-service",
 "tracing",
 ]
@@ -3603,6 +3729,12 @@ dependencies = [
 "regex-automata 0.1.10",
 ]

+[[package]]
+name = "matchit"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
+
 [[package]]
 name = "matchit"
 version = "0.8.4"
@@ -3648,7 +3780,7 @@ version = "0.0.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
 dependencies = [
- "heck",
+ "heck 0.5.0",
 "proc-macro2",
 "quote",
 "syn 2.0.100",
@@ -3710,7 +3842,7 @@ dependencies = [
 "procfs",
 "prometheus",
 "rand 0.8.5",
- "rand_distr",
+ "rand_distr 0.4.3",
 "twox-hash",
 ]

@@ -3799,11 +3931,25 @@ name = "neon-shmem"
 version = "0.1.0"
 dependencies = [
 "nix 0.30.1",
+ "rand 0.9.1",
+ "rand_distr 0.5.1",
+ "spin",
 "tempfile",
 "thiserror 1.0.69",
 "workspace_hack",
 ]

+[[package]]
+name = "neonart"
+version = "0.1.0"
+dependencies = [
+ "crossbeam-utils",
+ "rand 0.9.1",
+ "rand_distr 0.5.1",
+ "spin",
+ "tracing",
+]
+
 [[package]]
 name = "never-say-never"
 version = "6.6.666"
@@ -4237,15 +4383,19 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-trait",
+ "axum 0.8.1",
 "bytes",
 "camino",
 "clap",
 "futures",
 "hdrhistogram",
+ "http 1.1.0",
 "humantime",
 "humantime-serde",
+ "metrics",
 "pageserver_api",
 "pageserver_client",
+ "pageserver_client_grpc",
 "pageserver_page_api",
 "rand 0.8.5",
 "reqwest",
@@ -4329,6 +4479,7 @@ dependencies = [
 "pageserver_client",
 "pageserver_compaction",
 "pageserver_page_api",
+ "peekable",
 "pem",
 "pin-project-lite",
 "postgres-protocol",
@@ -4342,6 +4493,7 @@ dependencies = [
 "pprof",
 "pq_proto",
 "procfs",
+ "prost 0.13.5",
 "rand 0.8.5",
 "range-set-blaze",
 "regex",
@@ -4408,6 +4560,7 @@ dependencies = [
 "postgres_backend",
 "postgres_ffi_types",
 "postgres_versioninfo",
+ "posthog_client_lite",
 "rand 0.8.5",
 "remote_storage",
 "reqwest",
@@ -4418,6 +4571,7 @@ dependencies = [
 "strum",
 "strum_macros",
 "thiserror 1.0.69",
+ "tracing",
 "tracing-utils",
 "utils",
 ]
@@ -4443,6 +4597,36 @@ dependencies = [
 "workspace_hack",
 ]

+[[package]]
+name = "pageserver_client_grpc"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "bytes",
+ "chrono",
+ "dashmap 5.5.0",
+ "futures",
+ "http 1.1.0",
+ "hyper 1.6.0",
+ "hyper-util",
+ "metrics",
+ "pageserver_api",
+ "pageserver_page_api",
+ "priority-queue",
+ "rand 0.8.5",
+ "scopeguard",
+ "thiserror 1.0.69",
+ "tokio",
+ "tokio-stream",
+ "tokio-util",
+ "tonic 0.13.1",
+ "tower 0.4.13",
+ "tracing",
+ "utils",
+ "uuid",
+]
+
 [[package]]
 name = "pageserver_compaction"
 version = "0.1.0"
@@ -4474,12 +4658,14 @@ dependencies = [
 "bytes",
 "futures",
 "pageserver_api",
- "postgres_ffi",
+ "postgres_ffi_types",
 "prost 0.13.5",
+ "prost-types 0.13.5",
 "strum",
 "strum_macros",
 "thiserror 1.0.69",
 "tokio",
+ "tokio-util",
 "tonic 0.13.1",
 "tonic-build",
 "utils",
@@ -4612,6 +4798,15 @@ dependencies = [
 "sha2",
 ]

+[[package]]
+name = "peekable"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "225f9651e475709164f871dc2f5724956be59cb9edb055372ffeeab01ec2d20b"
+dependencies = [
+ "smallvec",
+]
+
 [[package]]
 name = "pem"
 version = "3.0.3"
@@ -5045,6 +5240,17 @@ dependencies = [
 "elliptic-curve 0.13.8",
 ]

+[[package]]
+name = "priority-queue"
+version = "2.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef08705fa1589a1a59aa924ad77d14722cb0cd97b67dd5004ed5f4a4873fce8d"
+dependencies = [
+ "autocfg",
+ "equivalent",
+ "indexmap 2.9.0",
+]
+
 [[package]]
 name = "proc-macro2"
 version = "1.0.94"
@@ -5123,7 +5329,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
 dependencies = [
 "bytes",
- "heck",
+ "heck 0.5.0",
 "itertools 0.12.1",
 "log",
 "multimap",
@@ -5144,7 +5350,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
 dependencies = [
 "bytes",
- "heck",
+ "heck 0.5.0",
 "itertools 0.12.1",
 "log",
 "multimap",
@@ -5152,7 +5358,7 @@ dependencies = [
 "petgraph",
 "prettyplease",
 "prost 0.13.5",
- "prost-types 0.13.3",
+ "prost-types 0.13.5",
 "regex",
 "syn 2.0.100",
 "tempfile",
@@ -5195,9 +5401,9 @@ dependencies = [

 [[package]]
 name = "prost-types"
-version = "0.13.3"
+version = "0.13.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670"
+checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16"
 dependencies = [
 "prost 0.13.5",
 ]
@@ -5245,7 +5451,7 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "hyper 0.14.30",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "indexmap 2.9.0",
 "ipnet",
@@ -5269,7 +5475,7 @@ dependencies = [
 "postgres_backend",
 "pq_proto",
 "rand 0.8.5",
- "rand_distr",
+ "rand_distr 0.4.3",
 "rcgen",
 "redis",
 "regex",
@@ -5373,6 +5579,12 @@ dependencies = [
 "proc-macro2",
 ]

+[[package]]
+name = "r-efi"
+version = "5.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
+
 [[package]]
 name = "rand"
 version = "0.7.3"
@@ -5397,6 +5609,16 @@ dependencies = [
 "rand_core 0.6.4",
 ]

+[[package]]
+name = "rand"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
+dependencies = [
+ "rand_chacha 0.9.0",
+ "rand_core 0.9.3",
+]
+
 [[package]]
 name = "rand_chacha"
 version = "0.2.2"
@@ -5417,6 +5639,16 @@ dependencies = [
 "rand_core 0.6.4",
 ]

+[[package]]
+name = "rand_chacha"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.9.3",
+]
+
 [[package]]
 name = "rand_core"
 version = "0.5.1"
@@ -5435,6 +5667,15 @@ dependencies = [
 "getrandom 0.2.11",
 ]

+[[package]]
+name = "rand_core"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
+dependencies = [
+ "getrandom 0.3.2",
+]
+
 [[package]]
 name = "rand_distr"
 version = "0.4.3"
@@ -5445,6 +5686,16 @@ dependencies = [
 "rand 0.8.5",
 ]

+[[package]]
+name = "rand_distr"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
+dependencies = [
+ "num-traits",
+ "rand 0.9.1",
+]
+
 [[package]]
 name = "rand_hc"
 version = "0.2.0"
@@ -5641,7 +5892,7 @@ dependencies = [
 "http-body-util",
 "http-types",
 "humantime-serde",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "itertools 0.10.5",
 "metrics",
 "once_cell",
@@ -5681,7 +5932,7 @@ dependencies = [
 "http 1.1.0",
 "http-body 1.0.0",
 "http-body-util",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-rustls 0.26.0",
 "hyper-util",
 "ipnet",
@@ -5738,7 +5989,7 @@ dependencies = [
 "futures",
 "getrandom 0.2.11",
 "http 1.1.0",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "parking_lot 0.11.2",
 "reqwest",
 "reqwest-middleware",
@@ -5759,7 +6010,7 @@ dependencies = [
 "async-trait",
 "getrandom 0.2.11",
 "http 1.1.0",
- "matchit",
+ "matchit 0.8.4",
 "opentelemetry",
 "reqwest",
 "reqwest-middleware",
@@ -6707,12 +6958,12 @@ dependencies = [

 [[package]]
 name = "socket2"
-version = "0.5.5"
+version = "0.5.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9"
+checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef"
 dependencies = [
 "libc",
- "windows-sys 0.48.0",
+ "windows-sys 0.52.0",
 ]

 [[package]]
@@ -6720,6 +6971,9 @@ name = "spin"
 version = "0.9.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
+dependencies = [
+ "lock_api",
+]

 [[package]]
 name = "spinning_top"
@@ -6778,7 +7032,7 @@ dependencies = [
 "http-body-util",
 "http-utils",
 "humantime",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "metrics",
 "once_cell",
@@ -6804,6 +7058,7 @@ dependencies = [
 "chrono",
 "clap",
 "clashmap",
+ "compute_api",
 "control_plane",
 "cron",
 "diesel",
@@ -6815,6 +7070,7 @@ dependencies = [
 "hex",
 "http-utils",
 "humantime",
+ "humantime-serde",
 "hyper 0.14.30",
 "itertools 0.10.5",
 "json-structural-diff",
@@ -6825,6 +7081,7 @@ dependencies = [
 "pageserver_api",
 "pageserver_client",
 "postgres_connection",
+ "posthog_client_lite",
 "rand 0.8.5",
 "regex",
 "reqwest",
@@ -6959,7 +7216,7 @@ version = "0.26.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
 dependencies = [
- "heck",
+ "heck 0.5.0",
 "proc-macro2",
 "quote",
 "rustversion",
@@ -7384,6 +7641,16 @@ dependencies = [
 "syn 2.0.100",
 ]

+[[package]]
+name = "tokio-pipe"
+version = "0.2.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f213a84bffbd61b8fa0ba8a044b4bbe35d471d0b518867181e82bd5c15542784"
+dependencies = [
+ "libc",
+ "tokio",
+]
+
 [[package]]
 name = "tokio-postgres"
 version = "0.7.10"
@@ -7578,16 +7845,25 @@ version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52"
 dependencies = [
+ "async-stream",
 "async-trait",
+ "axum 0.7.9",
 "base64 0.22.1",
 "bytes",
+ "h2 0.4.4",
 "http 1.1.0",
 "http-body 1.0.0",
 "http-body-util",
+ "hyper 1.6.0",
+ "hyper-timeout",
+ "hyper-util",
 "percent-encoding",
 "pin-project",
 "prost 0.13.5",
+ "socket2",
+ "tokio",
 "tokio-stream",
+ "tower 0.4.13",
 "tower-layer",
 "tower-service",
 "tracing",
@@ -7600,7 +7876,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9"
 dependencies = [
 "async-trait",
- "axum",
+ "axum 0.8.1",
 "base64 0.22.1",
 "bytes",
 "flate2",
@@ -7608,7 +7884,7 @@ dependencies = [
 "http 1.1.0",
 "http-body 1.0.0",
 "http-body-util",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-timeout",
 "hyper-util",
 "percent-encoding",
@@ -7635,7 +7911,7 @@ dependencies = [
 "prettyplease",
 "proc-macro2",
 "prost-build 0.13.3",
- "prost-types 0.13.3",
+ "prost-types 0.13.5",
 "quote",
 "syn 2.0.100",
 ]
@@ -7647,7 +7923,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f9687bd5bfeafebdded2356950f278bba8226f0b32109537c4253406e09aafe1"
 dependencies = [
 "prost 0.13.5",
- "prost-types 0.13.3",
+ "prost-types 0.13.5",
 "tokio",
 "tokio-stream",
 "tonic 0.13.1",
@@ -7661,11 +7937,16 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
 dependencies = [
 "futures-core",
 "futures-util",
+ "indexmap 1.9.3",
 "pin-project",
 "pin-project-lite",
+ "rand 0.8.5",
+ "slab",
 "tokio",
+ "tokio-util",
 "tower-layer",
 "tower-service",
+ "tracing",
 ]

 [[package]]
@@ -8149,7 +8430,7 @@ name = "vm_monitor"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "axum",
+ "axum 0.8.1",
 "cgroups-rs",
 "clap",
 "futures",
@@ -8261,6 +8542,15 @@ version = "0.11.0+wasi-snapshot-preview1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

+[[package]]
+name = "wasi"
+version = "0.14.2+wasi-0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
+dependencies = [
+ "wit-bindgen-rt",
+]
+
 [[package]]
 name = "wasite"
 version = "0.1.0"
@@ -8618,6 +8908,15 @@ dependencies = [
 "windows-sys 0.48.0",
 ]

+[[package]]
+name = "wit-bindgen-rt"
+version = "0.39.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
+dependencies = [
+ "bitflags 2.8.0",
+]
+
 [[package]]
 name = "workspace_hack"
 version = "0.1.0"
@@ -8625,8 +8924,8 @@ dependencies = [
 "ahash",
 "anstream",
 "anyhow",
- "axum",
- "axum-core",
+ "axum 0.8.1",
+ "axum-core 0.5.0",
 "base64 0.21.7",
 "base64ct",
 "bytes",
@@ -8658,7 +8957,7 @@ dependencies = [
 "hex",
 "hmac",
 "hyper 0.14.30",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "indexmap 2.9.0",
 "itertools 0.12.1",
@@ -8676,7 +8975,6 @@ dependencies = [
 "num-iter",
 "num-rational",
 "num-traits",
- "once_cell",
 "p256 0.13.2",
 "parquet",
 "prettyplease",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,6 +8,7 @@ members = [
    "pageserver/compaction",
    "pageserver/ctl",
    "pageserver/client",
+    "pageserver/client_grpc",
    "pageserver/pagebench",
    "pageserver/page_api",
    "proxy",
@@ -34,6 +35,7 @@ members = [
    "libs/pq_proto",
    "libs/tenant_size_model",
    "libs/metrics",
+    "libs/neonart",
    "libs/postgres_connection",
    "libs/remote_storage",
    "libs/tracing-utils",
@@ -46,6 +48,7 @@ members = [
    "libs/proxy/postgres-types2",
    "libs/proxy/tokio-postgres2",
    "endpoint_storage",
+    "pgxn/neon/communicator",
 ]

 [workspace.package]
@@ -89,6 +92,7 @@ clap = { version = "4.0", features = ["derive", "env"] }
 clashmap = { version = "1.0", features = ["raw-api"] }
 comfy-table = "7.1"
 const_format = "0.2"
+crossbeam-utils = "0.8.21"
 crc32c = "0.6"
 diatomic-waker = { version = "0.2.3" }
 either = "1.8"
@@ -147,11 +151,13 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
 parquet_derive = "53"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pem = "3.0.3"
+peekable = "0.3.0"
 pin-project-lite = "0.2"
 pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
 procfs = "0.16"
 prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.13.5"
+prost-types = "0.13.5"
 rand = "0.8"
 redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
@@ -182,6 +188,7 @@ smallvec = "1.11"
 smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
 spki = "0.7.3"
+spin = "0.9.8"
 strum = "0.26"
 strum_macros = "0.26"
 "subtle"  = "2.5.0"
@@ -193,13 +200,12 @@ thiserror = "1.0"
 tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
 tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
 tokio = { version = "1.43.1", features = ["macros"] }
-tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.12.0"
 tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
 tokio-stream = "0.1"
 tokio-tar = "0.3"
-tokio-util = { version = "0.7.10", features = ["io", "rt"] }
+tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] }
 toml = "0.8"
 toml_edit = "0.22"
 tonic = { version = "0.13.1", default-features = false, features = ["channel", "codegen", "gzip", "prost", "router", "server", "tls-ring", "tls-native-roots", "zstd"] }
@@ -235,6 +241,9 @@ x509-cert = { version = "0.2.5" }
 env_logger = "0.11"
 log = "0.4"

+tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
+uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
+
 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
 postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
 postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
@@ -254,9 +263,12 @@ desim = { version = "0.1", path = "./libs/desim" }
 endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" }
 http-utils = { version = "0.1", path = "./libs/http-utils/" }
 metrics = { version = "0.1", path = "./libs/metrics/" }
+neonart = { version = "0.1", path = "./libs/neonart/" }
+neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
 pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
+pageserver_client_grpc = { path = "./pageserver/client_grpc" }
 pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
 pageserver_page_api = { path = "./pageserver/page_api" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
@@ -283,6 +295,7 @@ walproposer = { version = "0.1", path = "./libs/walproposer/" }
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }

 ## Build dependencies
+cbindgen = "0.28.0"
 criterion = "0.5.1"
 rcgen = "0.13"
 rstest = "0.18"
--- a/1
+++ b/1
@@ -40,6 +40,7 @@ COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
 COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17
 COPY --chown=nonroot pgxn pgxn
 COPY --chown=nonroot Makefile Makefile
+COPY --chown=nonroot postgres.mk postgres.mk
 COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh

 ENV BUILD_TYPE=release
--- a/143
+++ b/143
@@ -4,11 +4,14 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 # managers.
 POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/

+# Supported PostgreSQL versions
+POSTGRES_VERSIONS = v17 v16 v15 v14
+
 # CARGO_BUILD_FLAGS: Extra flags to pass to `cargo build`. `--locked`
 # and `--features testing` are popular examples.
 #
-# CARGO_PROFILE: You can also set to override the cargo profile to
-# use. By default, it is derived from BUILD_TYPE.
+# CARGO_PROFILE: Set to override the cargo profile to use. By default,
+# it is derived from BUILD_TYPE.

 # All intermediate build artifacts are stored here.
 BUILD_DIR := build
@@ -27,11 +30,18 @@ ifeq ($(BUILD_TYPE),release)
 	PG_CFLAGS += -O2 -g3 $(CFLAGS)
 	PG_LDFLAGS = $(LDFLAGS)
 	CARGO_PROFILE ?= --profile=release
+	# NEON_CARGO_ARTIFACT_TARGET_DIR is the directory where `cargo build` places
+	# the final build artifacts. There is unfortunately no easy way of changing
+	# it to a fully predictable path, nor to extract the path with a simple
+	# command. See https://github.com/rust-lang/cargo/issues/9661 and
+	# https://github.com/rust-lang/cargo/issues/6790.
+	NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release
 else ifeq ($(BUILD_TYPE),debug)
 	PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
 	PG_CFLAGS += -O0 -g3 $(CFLAGS)
 	PG_LDFLAGS = $(LDFLAGS)
 	CARGO_PROFILE ?= --profile=dev
+	NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug
 else
 	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
 endif
@@ -95,96 +105,30 @@ CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55"
 # Top level Makefile to build Neon and PostgreSQL
 #
 .PHONY: all
-all: neon postgres neon-pg-ext
+all: neon postgres-install neon-pg-ext

 ### Neon Rust bits
 #
 # The 'postgres_ffi' depends on the Postgres headers.
 .PHONY: neon
-neon: postgres-headers walproposer-lib cargo-target-dir
+neon: postgres-headers-install walproposer-lib cargo-target-dir
 	+@echo "Compiling Neon"
 	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE)
+
 .PHONY: cargo-target-dir
 cargo-target-dir:
 	# https://github.com/rust-lang/cargo/issues/14281
 	mkdir -p target
 	test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG

-### PostgreSQL parts
-# Some rules are duplicated for Postgres v14 and 15. We may want to refactor
-# to avoid the duplication in the future, but it's tolerable for now.
-#
-$(BUILD_DIR)/%/config.status:
-	mkdir -p $(BUILD_DIR)
-	test -e $(BUILD_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(BUILD_DIR)/CACHEDIR.TAG
-
-	+@echo "Configuring Postgres $* build"
-	@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
-		echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
-		echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
-		exit 1; }
-	mkdir -p $(BUILD_DIR)/$*
-
-	VERSION=$*; \
-	EXTRA_VERSION=$$(cd $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION && git rev-parse HEAD); \
-	(cd $(BUILD_DIR)/$$VERSION && \
-	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$$VERSION/configure \
-		CFLAGS='$(PG_CFLAGS)' LDFLAGS='$(PG_LDFLAGS)' \
-		$(PG_CONFIGURE_OPTS) --with-extra-version=" ($$EXTRA_VERSION)" \
-		--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/$$VERSION > configure.log)
-
-# nicer alias to run 'configure'
-# Note: I've been unable to use templates for this part of our configuration.
-# I'm not sure why it wouldn't work, but this is the only place (apart from
-# the "build-all-versions" entry points) where direct mention of PostgreSQL
-# versions is used.
-.PHONY: postgres-configure-v17
-postgres-configure-v17: $(BUILD_DIR)/v17/config.status
-.PHONY: postgres-configure-v16
-postgres-configure-v16: $(BUILD_DIR)/v16/config.status
-.PHONY: postgres-configure-v15
-postgres-configure-v15: $(BUILD_DIR)/v15/config.status
-.PHONY: postgres-configure-v14
-postgres-configure-v14: $(BUILD_DIR)/v14/config.status
-
-# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/<version>/include
-.PHONY: postgres-headers-%
-postgres-headers-%: postgres-configure-%
-	+@echo "Installing PostgreSQL $* headers"
-	$(MAKE) -C $(BUILD_DIR)/$*/src/include MAKELEVEL=0 install
-
-# Compile and install PostgreSQL
-.PHONY: postgres-%
-postgres-%: postgres-configure-% \
-		  postgres-headers-% # to prevent `make install` conflicts with neon's `postgres-headers`
-	+@echo "Compiling PostgreSQL $*"
-	$(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 install
-	+@echo "Compiling libpq $*"
-	$(MAKE) -C $(BUILD_DIR)/$*/src/interfaces/libpq install
-	+@echo "Compiling pg_prewarm $*"
-	$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_prewarm install
-	+@echo "Compiling pg_buffercache $*"
-	$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_buffercache install
-	+@echo "Compiling pg_visibility $*"
-	$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_visibility install
-	+@echo "Compiling pageinspect $*"
-	$(MAKE) -C $(BUILD_DIR)/$*/contrib/pageinspect install
-	+@echo "Compiling pg_trgm $*"
-	$(MAKE) -C $(BUILD_DIR)/$*/contrib/pg_trgm install
-	+@echo "Compiling amcheck $*"
-	$(MAKE) -C $(BUILD_DIR)/$*/contrib/amcheck install
-	+@echo "Compiling test_decoding $*"
-	$(MAKE) -C $(BUILD_DIR)/$*/contrib/test_decoding install
-
-.PHONY: postgres-check-%
-postgres-check-%: postgres-%
-	$(MAKE) -C $(BUILD_DIR)/$* MAKELEVEL=0 check
-
 .PHONY: neon-pg-ext-%
-neon-pg-ext-%: postgres-%
+neon-pg-ext-%: postgres-install-%
 	+@echo "Compiling neon-specific Postgres extensions for $*"
 	mkdir -p $(BUILD_DIR)/pgxn-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
+	$(MAKE) PG_CONFIG="$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config" COPT='$(COPT)' \
+		NEON_CARGO_ARTIFACT_TARGET_DIR="$(NEON_CARGO_ARTIFACT_TARGET_DIR)" \
+		CARGO_BUILD_FLAGS="$(CARGO_BUILD_FLAGS)" \
+		CARGO_PROFILE="$(CARGO_PROFILE)" \
 		-C $(BUILD_DIR)/pgxn-$*\
 		-f $(ROOT_PROJECT_DIR)/pgxn/Makefile  install

@@ -220,39 +164,14 @@ ifeq ($(UNAME_S),Linux)
 		pg_crc32c.o
 endif

+# Shorthand to call neon-pg-ext-% target for all Postgres versions
 .PHONY: neon-pg-ext
-neon-pg-ext: \
-	neon-pg-ext-v14 \
-	neon-pg-ext-v15 \
-	neon-pg-ext-v16 \
-	neon-pg-ext-v17
-
-# shorthand to build all Postgres versions
-.PHONY: postgres
-postgres: \
-	postgres-v14 \
-	postgres-v15 \
-	postgres-v16 \
-	postgres-v17
-
-.PHONY: postgres-headers
-postgres-headers: \
-	postgres-headers-v14 \
-	postgres-headers-v15 \
-	postgres-headers-v16 \
-	postgres-headers-v17
-
-.PHONY: postgres-check
-postgres-check: \
-	postgres-check-v14 \
-	postgres-check-v15 \
-	postgres-check-v16 \
-	postgres-check-v17
+neon-pg-ext: $(foreach pg_version,$(POSTGRES_VERSIONS),neon-pg-ext-$(pg_version))

 # This removes everything
 .PHONY: distclean
 distclean:
-	$(RM) -r $(POSTGRES_INSTALL_DIR)
+	$(RM) -r $(POSTGRES_INSTALL_DIR) $(BUILD_DIR)
 	$(CARGO_CMD_PREFIX) cargo clean

 .PHONY: fmt
@@ -300,3 +219,19 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
 .PHONY: setup-pre-commit-hook
 setup-pre-commit-hook:
 	ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
+
+# Targets for building PostgreSQL are defined in postgres.mk.
+#
+# But if the caller has indicated that PostgreSQL is already
+# installed, by setting the PG_INSTALL_CACHED variable, skip it.
+ifdef PG_INSTALL_CACHED
+postgres-install: skip-install
+$(foreach pg_version,$(POSTGRES_VERSIONS),postgres-install-$(pg_version)): skip-install
+postgres-headers-install:
+	+@echo "Skipping installation of PostgreSQL headers because PG_INSTALL_CACHED is set"
+skip-install:
+	+@echo "Skipping PostgreSQL installation because PG_INSTALL_CACHED is set"
+
+else
+include postgres.mk
+endif
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -165,6 +165,7 @@ RUN curl -fsSL \
    && rm sql_exporter.tar.gz

 # protobuf-compiler (protoc)
+# Keep the version the same as in compute/compute-node.Dockerfile
 ENV PROTOC_VERSION=25.1
 RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
    && unzip -q protoc.zip -d protoc \
@@ -179,7 +180,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
    && mv s5cmd /usr/local/bin/s5cmd

 # LLVM
-ENV LLVM_VERSION=19
+ENV LLVM_VERSION=20
 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
    && echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
    && apt update \
@@ -292,7 +293,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.87.0
+ENV RUSTC_VERSION=1.88.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -115,6 +115,9 @@ ARG EXTENSIONS=all
 FROM $BASE_IMAGE_SHA AS build-deps
 ARG DEBIAN_VERSION

+# Keep in sync with build-tools.Dockerfile
+ENV PROTOC_VERSION=25.1
+
 # Use strict mode for bash to catch errors early
 SHELL ["/bin/bash", "-euo", "pipefail", "-c"]

@@ -149,8 +152,14 @@ RUN case $DEBIAN_VERSION in \
    libclang-dev \
    jsonnet \
    $VERSION_INSTALLS \
-    && apt clean && rm -rf /var/lib/apt/lists/* && \
-    useradd -ms /bin/bash nonroot -b /home
+    && apt clean && rm -rf /var/lib/apt/lists/* \
+    && useradd -ms /bin/bash nonroot -b /home \
+    # Install protoc from binary release, since Debian's versions are too old.
+    && curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
+    && unzip -q protoc.zip -d protoc \
+    && mv protoc/bin/protoc /usr/local/bin/protoc \
+    && mv protoc/include/google /usr/local/include/google \
+    && rm -rf protoc.zip protoc

 #########################################################################################
 #
@@ -171,9 +180,6 @@ RUN cd postgres && \
    eval $CONFIGURE_CMD && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
-    # Install headers
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
-    make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
    # Enable some of contrib extensions
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \
@@ -1173,7 +1179,7 @@ COPY --from=pgrag-src /ext-src/ /ext-src/
 # Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise
 WORKDIR /ext-src/onnxruntime-src
 RUN apt update && apt install --no-install-recommends --no-install-suggests -y \
-    python3 python3-pip python3-venv protobuf-compiler && \
+    python3 python3-pip python3-venv && \
    apt clean && rm -rf /var/lib/apt/lists/* && \
    python3 -m venv venv && \
    . venv/bin/activate && \
@@ -1568,20 +1574,20 @@ ARG PG_VERSION
 WORKDIR /ext-src
 RUN case "${PG_VERSION}" in \
    "v14") \
-    export PGAUDIT_VERSION=1.6.2 \
-    export PGAUDIT_CHECKSUM=1f350d70a0cbf488c0f2b485e3a5c9b11f78ad9e3cbb95ef6904afa1eb3187eb \
+    export PGAUDIT_VERSION=1.6.3 \
+    export PGAUDIT_CHECKSUM=37a8f5a7cc8d9188e536d15cf0fdc457fcdab2547caedb54442c37f124110919 \
    ;; \
    "v15") \
-    export PGAUDIT_VERSION=1.7.0 \
-    export PGAUDIT_CHECKSUM=8f4a73e451c88c567e516e6cba7dc1e23bc91686bb6f1f77f8f3126d428a8bd8 \
+    export PGAUDIT_VERSION=1.7.1 \
+    export PGAUDIT_CHECKSUM=e9c8e6e092d82b2f901d72555ce0fe7780552f35f8985573796cd7e64b09d4ec \
    ;; \
    "v16") \
-    export PGAUDIT_VERSION=16.0 \
-    export PGAUDIT_CHECKSUM=d53ef985f2d0b15ba25c512c4ce967dce07b94fd4422c95bd04c4c1a055fe738 \
+    export PGAUDIT_VERSION=16.1 \
+    export PGAUDIT_CHECKSUM=3bae908ab70ba0c6f51224009dbcfff1a97bd6104c6273297a64292e1b921fee \
    ;; \
    "v17") \
-    export PGAUDIT_VERSION=17.0 \
-    export PGAUDIT_CHECKSUM=7d0d08d030275d525f36cd48b38c6455f1023da863385badff0cec44965bfd8c \
+    export PGAUDIT_VERSION=17.1 \
+    export PGAUDIT_CHECKSUM=9c5f37504d393486cc75d2ced83f75f5899be64fa85f689d6babb833b4361e6c \
    ;; \
    *) \
    echo "pgaudit is not supported on this PostgreSQL version" && exit 1;; \
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -38,6 +38,7 @@ once_cell.workspace = true
 opentelemetry.workspace = true
 opentelemetry_sdk.workspace = true
 p256 = { version = "0.13", features = ["pem"] }
+pageserver_page_api.workspace = true
 postgres.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["json"] }
@@ -53,6 +54,7 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tokio-postgres.workspace = true
 tokio-util.workspace = true
 tokio-stream.workspace = true
+tonic.workspace = true
 tower-otel.workspace = true
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,4 +1,4 @@
-use anyhow::{Context, Result};
+use anyhow::{Context, Result, anyhow};
 use chrono::{DateTime, Utc};
 use compute_api::privilege::Privilege;
 use compute_api::responses::{
@@ -6,7 +6,8 @@ use compute_api::responses::{
    LfcPrewarmState, TlsConfig,
 };
 use compute_api::spec::{
-    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
+    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverConnectionInfo,
+    PageserverShardConnectionInfo, PgIdent,
 };
 use futures::StreamExt;
 use futures::future::join_all;
@@ -15,12 +16,12 @@ use itertools::Itertools;
 use nix::sys::signal::{Signal, kill};
 use nix::unistd::Pid;
 use once_cell::sync::Lazy;
+use pageserver_page_api::{self as page_api, BaseBackupCompression};
 use postgres;
 use postgres::NoTls;
 use postgres::error::SqlState;
 use remote_storage::{DownloadError, RemotePath};
 use std::collections::{HashMap, HashSet};
-use std::net::SocketAddr;
 use std::os::unix::fs::{PermissionsExt, symlink};
 use std::path::Path;
 use std::process::{Command, Stdio};
@@ -36,6 +37,7 @@ use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
 use utils::measured_stream::MeasuredReader;
 use utils::pid_file;
+use utils::shard::{ShardCount, ShardIndex, ShardNumber};

 use crate::configurator::launch_configurator;
 use crate::disk_quota::set_disk_quota;
@@ -215,10 +217,11 @@ pub struct ParsedSpec {
    pub spec: ComputeSpec,
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
-    pub pageserver_connstr: String,
+    pub pageserver_conninfo: PageserverConnectionInfo,
    pub safekeeper_connstrings: Vec<String>,
    pub storage_auth_token: Option<String>,
-    pub endpoint_storage_addr: Option<SocketAddr>,
+    /// k8s dns name and port
+    pub endpoint_storage_addr: Option<String>,
    pub endpoint_storage_token: Option<String>,
 }

@@ -261,6 +264,27 @@ impl ParsedSpec {
    }
 }

+fn extract_pageserver_conninfo_from_guc(
+    pageserver_connstring_guc: &str,
+) -> PageserverConnectionInfo {
+    PageserverConnectionInfo {
+        shards: pageserver_connstring_guc
+            .split(',')
+            .enumerate()
+            .map(|(i, connstr)| {
+                (
+                    i as u32,
+                    PageserverShardConnectionInfo {
+                        libpq_url: Some(connstr.to_string()),
+                        grpc_url: None,
+                    },
+                )
+            })
+            .collect(),
+        prefer_grpc: false,
+    }
+}
+
 impl TryFrom<ComputeSpec> for ParsedSpec {
    type Error = String;
    fn try_from(spec: ComputeSpec) -> Result<Self, String> {
@@ -270,11 +294,17 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
        // For backwards-compatibility, the top-level fields in the spec file
        // may be empty. In that case, we need to dig them from the GUCs in the
        // cluster.settings field.
-        let pageserver_connstr = spec
-            .pageserver_connstring
-            .clone()
-            .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
-            .ok_or("pageserver connstr should be provided")?;
+        let pageserver_conninfo = match &spec.pageserver_connection_info {
+            Some(x) => x.clone(),
+            None => {
+                if let Some(guc) = spec.cluster.settings.find("neon.pageserver_connstring") {
+                    extract_pageserver_conninfo_from_guc(&guc)
+                } else {
+                    return Err("pageserver connstr should be provided".to_string());
+                }
+            }
+        };
+
        let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
            if matches!(spec.mode, ComputeMode::Primary) {
                spec.cluster
@@ -313,13 +343,10 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
                .or(Err("invalid timeline id"))?
        };

-        let endpoint_storage_addr: Option<SocketAddr> = spec
+        let endpoint_storage_addr: Option<String> = spec
            .endpoint_storage_addr
            .clone()
-            .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_addr"))
-            .unwrap_or_default()
-            .parse()
-            .ok();
+            .or_else(|| spec.cluster.settings.find("neon.endpoint_storage_addr"));
        let endpoint_storage_token = spec
            .endpoint_storage_token
            .clone()
@@ -327,7 +354,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {

        let res = ParsedSpec {
            spec,
-            pageserver_connstr,
+            pageserver_conninfo,
            safekeeper_connstrings,
            storage_auth_token,
            tenant_id,
@@ -417,7 +444,7 @@ impl ComputeNode {

        let mut new_state = ComputeState::new();
        if let Some(spec) = config.spec {
-            let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
+            let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow!(msg))?;
            new_state.pspec = Some(pspec);
        }

@@ -998,15 +1025,87 @@ impl ComputeNode {
        Ok(())
    }

-    // Get basebackup from the libpq connection to pageserver using `connstr` and
-    // unarchive it to `pgdata` directory overriding all its previous content.
+    /// Fetches a basebackup from the Pageserver using the compute state's Pageserver connstring and
+    /// unarchives it to `pgdata` directory, replacing any existing contents.
    #[instrument(skip_all, fields(%lsn))]
    fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let spec = compute_state.pspec.as_ref().expect("spec must be set");
-        let start_time = Instant::now();

-        let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
-        let mut config = postgres::Config::from_str(shard0_connstr)?;
+        let started = Instant::now();
+        let (connected, size) = if spec.pageserver_conninfo.prefer_grpc {
+            self.try_get_basebackup_grpc(spec, lsn)?
+        } else {
+            self.try_get_basebackup_libpq(spec, lsn)?
+        };
+
+        let mut state = self.state.lock().unwrap();
+        state.metrics.pageserver_connect_micros =
+            connected.duration_since(started).as_micros() as u64;
+        state.metrics.basebackup_bytes = size as u64;
+        state.metrics.basebackup_ms = started.elapsed().as_millis() as u64;
+
+        Ok(())
+    }
+
+    /// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
+    /// the connection was established, and the (compressed) size of the basebackup.
+    fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
+        let shard0 = spec
+            .pageserver_conninfo
+            .shards
+            .get(&0)
+            .expect("shard 0 connection info missing");
+        let shard0_url = shard0.grpc_url.clone().expect("no grpc_url for shard 0");
+
+        let shard_index = match spec.pageserver_conninfo.shards.len() as u8 {
+            0 | 1 => ShardIndex::unsharded(),
+            count => ShardIndex::new(ShardNumber(0), ShardCount(count)),
+        };
+
+        let (reader, connected) = tokio::runtime::Handle::current().block_on(async move {
+            let mut client = page_api::Client::connect(
+                shard0_url,
+                spec.tenant_id,
+                spec.timeline_id,
+                shard_index,
+                spec.storage_auth_token.clone(),
+                None, // NB: base backups use payload compression
+            )
+            .await?;
+            let connected = Instant::now();
+            let reader = client
+                .get_base_backup(page_api::GetBaseBackupRequest {
+                    lsn: (lsn != Lsn(0)).then_some(lsn),
+                    compression: BaseBackupCompression::Gzip,
+                    replica: spec.spec.mode != ComputeMode::Primary,
+                    full: false,
+                })
+                .await?;
+            anyhow::Ok((reader, connected))
+        })?;
+
+        let mut reader = MeasuredReader::new(tokio_util::io::SyncIoBridge::new(reader));
+
+        // Set `ignore_zeros` so that unpack() reads the entire stream and doesn't just stop at the
+        // end-of-archive marker. If the server errors, the tar::Builder drop handler will write an
+        // end-of-archive marker before the error is emitted, and we would not see the error.
+        let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut reader));
+        ar.set_ignore_zeros(true);
+        ar.unpack(&self.params.pgdata)?;
+
+        Ok((connected, reader.get_byte_count()))
+    }
+
+    /// Fetches a basebackup via libpq. The connstring must use postgresql://. Returns the timestamp
+    /// when the connection was established, and the (compressed) size of the basebackup.
+    fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
+        let shard0 = spec
+            .pageserver_conninfo
+            .shards
+            .get(&0)
+            .expect("shard 0 connection info missing");
+        let shard0_connstr = shard0.libpq_url.clone().expect("no libpq_url for shard 0");
+        let mut config = postgres::Config::from_str(&shard0_connstr)?;

        // Use the storage auth token from the config file, if given.
        // Note: this overrides any password set in the connection string.
@@ -1018,16 +1117,14 @@ impl ComputeNode {
        }

        config.application_name("compute_ctl");
-        if let Some(spec) = &compute_state.pspec {
-            config.options(&format!(
-                "-c neon.compute_mode={}",
-                spec.spec.mode.to_type_str()
-            ));
-        }
+        config.options(&format!(
+            "-c neon.compute_mode={}",
+            spec.spec.mode.to_type_str()
+        ));

        // Connect to pageserver
        let mut client = config.connect(NoTls)?;
-        let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
+        let connected = Instant::now();

        let basebackup_cmd = match lsn {
            Lsn(0) => {
@@ -1064,16 +1161,13 @@ impl ComputeNode {
        // Set `ignore_zeros` so that unpack() reads all the Copy data and
        // doesn't stop at the end-of-archive marker. Otherwise, if the server
        // sends an Error after finishing the tarball, we will not notice it.
+        // The tar::Builder drop handler will write an end-of-archive marker
+        // before emitting the error, and we would not see it otherwise.
        let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader));
        ar.set_ignore_zeros(true);
        ar.unpack(&self.params.pgdata)?;

-        // Report metrics
-        let mut state = self.state.lock().unwrap();
-        state.metrics.pageserver_connect_micros = pageserver_connect_micros;
-        state.metrics.basebackup_bytes = measured_reader.get_byte_count() as u64;
-        state.metrics.basebackup_ms = start_time.elapsed().as_millis() as u64;
-        Ok(())
+        Ok((connected, measured_reader.get_byte_count()))
    }

    // Gets the basebackup in a retry loop
@@ -1097,10 +1191,7 @@ impl ComputeNode {
                    return result;
                }
                Err(ref e) if attempts < max_attempts => {
-                    warn!(
-                        "Failed to get basebackup: {} (attempt {}/{})",
-                        e, attempts, max_attempts
-                    );
+                    warn!("Failed to get basebackup: {e:?} (attempt {attempts}/{max_attempts})");
                    std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
                    retry_period_ms *= 1.5;
                }
@@ -1309,16 +1400,8 @@ impl ComputeNode {
            }
        };

-        info!(
-            "getting basebackup@{} from pageserver {}",
-            lsn, &pspec.pageserver_connstr
-        );
-        self.get_basebackup(compute_state, lsn).with_context(|| {
-            format!(
-                "failed to get basebackup@{} from pageserver {}",
-                lsn, &pspec.pageserver_connstr
-            )
-        })?;
+        self.get_basebackup(compute_state, lsn)
+            .with_context(|| format!("failed to get basebackup@{lsn}"))?;

        // Update pg_hba.conf received with basebackup.
        update_pg_hba(pgdata_path)?;
@@ -1978,7 +2061,7 @@ LIMIT 100",
            self.params
                .remote_ext_base_url
                .as_ref()
-                .ok_or(DownloadError::BadInput(anyhow::anyhow!(
+                .ok_or(DownloadError::BadInput(anyhow!(
                    "Remote extensions storage is not configured",
                )))?;

@@ -2174,7 +2257,7 @@ LIMIT 100",
        let remote_extensions = spec
            .remote_extensions
            .as_ref()
-            .ok_or(anyhow::anyhow!("Remote extensions are not configured"))?;
+            .ok_or(anyhow!("Remote extensions are not configured"))?;

        info!("parse shared_preload_libraries from spec.cluster.settings");
        let mut libs_vec = Vec::new();
@@ -2253,22 +2336,22 @@ LIMIT 100",
    /// The operation will time out after a specified duration.
    pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
        let state = self.state.lock().unwrap();
-        let old_pageserver_connstr = state
+        let old_pageserver_conninfo = state
            .pspec
            .as_ref()
            .expect("spec must be set")
-            .pageserver_connstr
+            .pageserver_conninfo
            .clone();
        let mut unchanged = true;
        let _ = self
            .state_changed
            .wait_timeout_while(state, duration, |s| {
-                let pageserver_connstr = &s
+                let pageserver_conninfo = &s
                    .pspec
                    .as_ref()
                    .expect("spec must be set")
-                    .pageserver_connstr;
-                unchanged = pageserver_connstr == &old_pageserver_connstr;
+                    .pageserver_conninfo;
+                unchanged = pageserver_conninfo == &old_pageserver_conninfo;
                unchanged
            })
            .unwrap();
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -56,9 +56,51 @@ pub fn write_postgres_conf(

    // Add options for connecting to storage
    writeln!(file, "# Neon storage settings")?;
-    if let Some(s) = &spec.pageserver_connstring {
-        writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
+
+    if let Some(conninfo) = &spec.pageserver_connection_info {
+        let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
+        let mut grpc_urls: Option<Vec<String>> = Some(Vec::new());
+
+        for shardno in 0..conninfo.shards.len() {
+            let info = conninfo.shards.get(&(shardno as u32)).ok_or_else(|| {
+                anyhow::anyhow!("shard {shardno} missing from pageserver_connection_info shard map")
+            })?;
+
+            if let Some(url) = &info.libpq_url {
+                if let Some(ref mut urls) = libpq_urls {
+                    urls.push(url.clone());
+                }
+            } else {
+                libpq_urls = None
+            }
+            if let Some(url) = &info.grpc_url {
+                if let Some(ref mut urls) = grpc_urls {
+                    urls.push(url.clone());
+                }
+            } else {
+                grpc_urls = None
+            }
+        }
+        if let Some(libpq_urls) = libpq_urls {
+            writeln!(
+                file,
+                "neon.pageserver_connstring={}",
+                escape_conf_value(&libpq_urls.join(","))
+            )?;
+        } else {
+            writeln!(file, "# no neon.pageserver_connstring")?;
+        }
+        if let Some(grpc_urls) = grpc_urls {
+            writeln!(
+                file,
+                "neon.pageserver_grpc_urls={}",
+                escape_conf_value(&grpc_urls.join(","))
+            )?;
+        } else {
+            writeln!(file, "# no neon.pageserver_grpc_urls")?;
+        }
    }
+
    if let Some(stripe_size) = spec.shard_stripe_size {
        writeln!(file, "neon.stripe_size={stripe_size}")?;
    }
--- a/compute_tools/src/lsn_lease.rs
+++ b/compute_tools/src/lsn_lease.rs
@@ -4,7 +4,8 @@ use std::thread;
 use std::time::{Duration, SystemTime};

 use anyhow::{Result, bail};
-use compute_api::spec::ComputeMode;
+use compute_api::spec::{ComputeMode, PageserverConnectionInfo};
+use pageserver_page_api as page_api;
 use postgres::{NoTls, SimpleQueryMessage};
 use tracing::{info, warn};
 use utils::id::{TenantId, TimelineId};
@@ -76,25 +77,16 @@ fn acquire_lsn_lease_with_retry(

    loop {
        // Note: List of pageservers is dynamic, need to re-read configs before each attempt.
-        let configs = {
+        let (conninfo, auth) = {
            let state = compute.state.lock().unwrap();
-
            let spec = state.pspec.as_ref().expect("spec must be set");
-
-            let conn_strings = spec.pageserver_connstr.split(',');
-
-            conn_strings
-                .map(|connstr| {
-                    let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
-                    if let Some(storage_auth_token) = &spec.storage_auth_token {
-                        config.password(storage_auth_token.clone());
-                    }
-                    config
-                })
-                .collect::<Vec<_>>()
+            (
+                spec.pageserver_conninfo.clone(),
+                spec.storage_auth_token.clone(),
+            )
        };

-        let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs);
+        let result = try_acquire_lsn_lease(conninfo, auth.as_deref(), tenant_id, timeline_id, lsn);
        match result {
            Ok(Some(res)) => {
                return Ok(res);
@@ -116,68 +108,112 @@ fn acquire_lsn_lease_with_retry(
    }
 }

-/// Tries to acquire an LSN lease through PS page_service API.
+/// Tries to acquire LSN leases on all Pageserver shards.
 fn try_acquire_lsn_lease(
+    conninfo: PageserverConnectionInfo,
+    auth: Option<&str>,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    lsn: Lsn,
-    configs: &[postgres::Config],
 ) -> Result<Option<SystemTime>> {
-    fn get_valid_until(
-        config: &postgres::Config,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
-        lsn: Lsn,
-    ) -> Result<Option<SystemTime>> {
-        let mut client = config.connect(NoTls)?;
-        let cmd = format!("lease lsn {tenant_shard_id} {timeline_id} {lsn} ");
-        let res = client.simple_query(&cmd)?;
-        let msg = match res.first() {
-            Some(msg) => msg,
-            None => bail!("empty response"),
-        };
-        let row = match msg {
-            SimpleQueryMessage::Row(row) => row,
-            _ => bail!("error parsing lsn lease response"),
+    let shard_count = conninfo.shards.len();
+    let mut leases = Vec::new();
+
+    for (shard_number, shard) in conninfo.shards.into_iter() {
+        let tenant_shard_id = match shard_count {
+            0 | 1 => TenantShardId::unsharded(tenant_id),
+            shard_count => TenantShardId {
+                tenant_id,
+                shard_number: ShardNumber(shard_number as u8),
+                shard_count: ShardCount::new(shard_count as u8),
+            },
        };

-        // Note: this will be None if a lease is explicitly not granted.
-        let valid_until_str = row.get("valid_until");
-
-        let valid_until = valid_until_str.map(|s| {
-            SystemTime::UNIX_EPOCH
-                .checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
-                .expect("Time larger than max SystemTime could handle")
-        });
-        Ok(valid_until)
+        let lease = if conninfo.prefer_grpc {
+            acquire_lsn_lease_grpc(
+                &shard.grpc_url.unwrap(),
+                auth,
+                tenant_shard_id,
+                timeline_id,
+                lsn,
+            )?
+        } else {
+            acquire_lsn_lease_libpq(
+                &shard.libpq_url.unwrap(),
+                auth,
+                tenant_shard_id,
+                timeline_id,
+                lsn,
+            )?
+        };
+        leases.push(lease);
    }

-    let shard_count = configs.len();
+    Ok(leases.into_iter().min().flatten())
+}

-    let valid_until = if shard_count > 1 {
-        configs
-            .iter()
-            .enumerate()
-            .map(|(shard_number, config)| {
-                let tenant_shard_id = TenantShardId {
-                    tenant_id,
-                    shard_count: ShardCount::new(shard_count as u8),
-                    shard_number: ShardNumber(shard_number as u8),
-                };
-                get_valid_until(config, tenant_shard_id, timeline_id, lsn)
-            })
-            .collect::<Result<Vec<Option<SystemTime>>>>()?
-            .into_iter()
-            .min()
-            .unwrap()
-    } else {
-        get_valid_until(
-            &configs[0],
-            TenantShardId::unsharded(tenant_id),
-            timeline_id,
-            lsn,
-        )?
+/// Acquires an LSN lease on a single shard, using the libpq API. The connstring must use a
+/// postgresql:// scheme.
+fn acquire_lsn_lease_libpq(
+    connstring: &str,
+    auth: Option<&str>,
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
+    lsn: Lsn,
+) -> Result<Option<SystemTime>> {
+    let mut config = postgres::Config::from_str(connstring)?;
+    if let Some(auth) = auth {
+        config.password(auth);
+    }
+    let mut client = config.connect(NoTls)?;
+    let cmd = format!("lease lsn {tenant_shard_id} {timeline_id} {lsn} ");
+    let res = client.simple_query(&cmd)?;
+    let msg = match res.first() {
+        Some(msg) => msg,
+        None => bail!("empty response"),
+    };
+    let row = match msg {
+        SimpleQueryMessage::Row(row) => row,
+        _ => bail!("error parsing lsn lease response"),
    };

+    // Note: this will be None if a lease is explicitly not granted.
+    let valid_until_str = row.get("valid_until");
+
+    let valid_until = valid_until_str.map(|s| {
+        SystemTime::UNIX_EPOCH
+            .checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
+            .expect("Time larger than max SystemTime could handle")
+    });
    Ok(valid_until)
 }
+
+/// Acquires an LSN lease on a single shard, using the gRPC API. The connstring must use a
+/// grpc:// scheme.
+fn acquire_lsn_lease_grpc(
+    connstring: &str,
+    auth: Option<&str>,
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
+    lsn: Lsn,
+) -> Result<Option<SystemTime>> {
+    tokio::runtime::Handle::current().block_on(async move {
+        let mut client = page_api::Client::connect(
+            connstring.to_string(),
+            tenant_shard_id.tenant_id,
+            timeline_id,
+            tenant_shard_id.to_index(),
+            auth.map(String::from),
+            None,
+        )
+        .await?;
+
+        let req = page_api::LeaseLsnRequest { lsn };
+        match client.lease_lsn(req).await {
+            Ok(expires) => Ok(Some(expires)),
+            // Lease couldn't be acquired because the LSN has been garbage collected.
+            Err(err) if err.code() == tonic::Code::FailedPrecondition => Ok(None),
+            Err(err) => Err(err.into()),
+        }
+    })
+}
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -16,9 +16,9 @@ use std::time::Duration;
 use anyhow::{Context, Result, anyhow, bail};
 use clap::Parser;
 use compute_api::requests::ComputeClaimsScope;
-use compute_api::spec::ComputeMode;
+use compute_api::spec::{ComputeMode, PageserverConnectionInfo, PageserverShardConnectionInfo};
 use control_plane::broker::StorageBroker;
-use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode, PageserverProtocol};
+use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode};
 use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
 use control_plane::local_env;
 use control_plane::local_env::{
@@ -1504,29 +1504,35 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                )?;
            }

-            let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
-                let conf = env.get_pageserver_conf(pageserver_id).unwrap();
-                // Use gRPC if requested.
-                let pageserver = if endpoint.grpc {
-                    let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
-                    let (host, port) = parse_host_port(grpc_addr)?;
-                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
-                    (PageserverProtocol::Grpc, host, port)
-                } else {
+            let (shards, stripe_size) = if let Some(ps_id) = pageserver_id {
+                let conf = env.get_pageserver_conf(ps_id).unwrap();
+                let libpq_url = Some({
                    let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
                    let port = port.unwrap_or(5432);
-                    (PageserverProtocol::Libpq, host, port)
+                    format!("postgres://no_user@{host}:{port}")
+                });
+                let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
+                    let (host, port) = parse_host_port(grpc_addr)?;
+                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
+                    Some(format!("grpc://no_user@{host}:{port}"))
+                } else {
+                    None
                };
+                let pageserver = PageserverShardConnectionInfo {
+                    libpq_url,
+                    grpc_url,
+                };
+
                // If caller is telling us what pageserver to use, this is not a tenant which is
                // fully managed by storage controller, therefore not sharded.
-                (vec![pageserver], DEFAULT_STRIPE_SIZE)
+                (vec![(0, pageserver)], DEFAULT_STRIPE_SIZE)
            } else {
                // Look up the currently attached location of the tenant, and its striping metadata,
                // to pass these on to postgres.
                let storage_controller = StorageController::from_env(env);
                let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
-                let pageservers = futures::future::try_join_all(
-                    locate_result.shards.into_iter().map(|shard| async move {
+                let shards = futures::future::try_join_all(locate_result.shards.into_iter().map(
+                    |shard| async move {
                        if let ComputeMode::Static(lsn) = endpoint.mode {
                            // Initialize LSN leases for static computes.
                            let conf = env.get_pageserver_conf(shard.node_id).unwrap();
@@ -1538,28 +1544,34 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                                .await?;
                        }

-                        let pageserver = if endpoint.grpc {
-                            (
-                                PageserverProtocol::Grpc,
-                                Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))?,
-                                shard.listen_grpc_port.expect("no gRPC port"),
-                            )
+                        let libpq_host = Host::parse(&shard.listen_pg_addr)?;
+                        let libpq_port = shard.listen_pg_port;
+                        let libpq_url =
+                            Some(format!("postgres://no_user@{libpq_host}:{libpq_port}"));
+
+                        let grpc_url = if let Some(grpc_host) = shard.listen_grpc_addr {
+                            let grpc_port = shard.listen_grpc_port.expect("no gRPC port");
+                            Some(format!("grpc://no_user@{grpc_host}:{grpc_port}"))
                        } else {
-                            (
-                                PageserverProtocol::Libpq,
-                                Host::parse(&shard.listen_pg_addr)?,
-                                shard.listen_pg_port,
-                            )
+                            None
                        };
-                        anyhow::Ok(pageserver)
-                    }),
-                )
+                        let pageserver = PageserverShardConnectionInfo {
+                            libpq_url,
+                            grpc_url,
+                        };
+                        anyhow::Ok((shard.shard_id.shard_number.0 as u32, pageserver))
+                    },
+                ))
                .await?;
                let stripe_size = locate_result.shard_params.stripe_size;

-                (pageservers, stripe_size)
+                (shards, stripe_size)
+            };
+            assert!(!shards.is_empty());
+            let pageserver_conninfo = PageserverConnectionInfo {
+                shards: shards.into_iter().collect(),
+                prefer_grpc: endpoint.grpc,
            };
-            assert!(!pageservers.is_empty());

            let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?;
            let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
@@ -1591,7 +1603,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                    endpoint_storage_addr,
                    safekeepers_generation,
                    safekeepers,
-                    pageservers,
+                    pageserver_conninfo,
                    remote_ext_base_url.as_ref(),
                    stripe_size.0 as usize,
                    args.create_test_user,
@@ -1606,20 +1618,27 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                .endpoints
                .get(endpoint_id.as_str())
                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
-            let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id {
+            let shards = if let Some(ps_id) = args.endpoint_pageserver_id {
                let conf = env.get_pageserver_conf(ps_id)?;
-                // Use gRPC if requested.
-                let pageserver = if endpoint.grpc {
-                    let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
-                    let (host, port) = parse_host_port(grpc_addr)?;
-                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
-                    (PageserverProtocol::Grpc, host, port)
-                } else {
+                let libpq_url = Some({
                    let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
                    let port = port.unwrap_or(5432);
-                    (PageserverProtocol::Libpq, host, port)
+                    format!("postgres://no_user@{host}:{port}")
+                });
+                let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
+                    let (host, port) = parse_host_port(grpc_addr)?;
+                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
+                    Some(format!("grpc://no_user@{host}:{port}"))
+                } else {
+                    None
                };
-                vec![pageserver]
+                let pageserver = PageserverShardConnectionInfo {
+                    libpq_url,
+                    grpc_url,
+                };
+                // If caller is telling us what pageserver to use, this is not a tenant which is
+                // fully managed by storage controller, therefore not sharded.
+                vec![(0, pageserver)]
            } else {
                let storage_controller = StorageController::from_env(env);
                storage_controller
@@ -1629,27 +1648,37 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                    .into_iter()
                    .map(|shard| {
                        // Use gRPC if requested.
-                        if endpoint.grpc {
-                            (
-                                PageserverProtocol::Grpc,
-                                Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))
-                                    .expect("bad hostname"),
-                                shard.listen_grpc_port.expect("no gRPC port"),
-                            )
+                        let libpq_host = Host::parse(&shard.listen_pg_addr).expect("bad hostname");
+                        let libpq_port = shard.listen_pg_port;
+                        let libpq_url =
+                            Some(format!("postgres://no_user@{libpq_host}:{libpq_port}"));
+
+                        let grpc_url = if let Some(grpc_host) = shard.listen_grpc_addr {
+                            let grpc_port = shard.listen_grpc_port.expect("no gRPC port");
+                            Some(format!("grpc://no_user@{grpc_host}:{grpc_port}"))
                        } else {
-                            (
-                                PageserverProtocol::Libpq,
-                                Host::parse(&shard.listen_pg_addr).expect("bad hostname"),
-                                shard.listen_pg_port,
-                            )
-                        }
+                            None
+                        };
+                        (
+                            shard.shard_id.shard_number.0 as u32,
+                            PageserverShardConnectionInfo {
+                                libpq_url,
+                                grpc_url,
+                            },
+                        )
                    })
                    .collect::<Vec<_>>()
            };
+            let pageserver_conninfo = PageserverConnectionInfo {
+                shards: shards.into_iter().collect(),
+                prefer_grpc: endpoint.grpc,
+            };
            // If --safekeepers argument is given, use only the listed
            // safekeeper nodes; otherwise all from the env.
            let safekeepers = parse_safekeepers(&args.safekeepers)?;
-            endpoint.reconfigure(pageservers, None, safekeepers).await?;
+            endpoint
+                .reconfigure(Some(pageserver_conninfo), None, safekeepers, None)
+                .await?;
        }
        EndpointCmd::Stop(args) => {
            let endpoint_id = &args.endpoint_id;
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -59,6 +59,10 @@ use compute_api::spec::{
    Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
    RemoteExtSpec, Role,
 };
+
+// re-export these, because they're used in the reconfigure() function
+pub use compute_api::spec::{PageserverConnectionInfo, PageserverShardConnectionInfo};
+
 use jsonwebtoken::jwk::{
    AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
    OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse,
@@ -74,7 +78,6 @@ use sha2::{Digest, Sha256};
 use spki::der::Decode;
 use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef};
 use tracing::debug;
-use url::Host;
 use utils::id::{NodeId, TenantId, TimelineId};

 use crate::local_env::LocalEnv;
@@ -373,29 +376,6 @@ impl std::fmt::Display for EndpointTerminateMode {
    }
 }

-/// Protocol used to connect to a Pageserver.
-#[derive(Clone, Copy, Debug)]
-pub enum PageserverProtocol {
-    Libpq,
-    Grpc,
-}
-
-impl PageserverProtocol {
-    /// Returns the URL scheme for the protocol, used in connstrings.
-    pub fn scheme(&self) -> &'static str {
-        match self {
-            Self::Libpq => "postgresql",
-            Self::Grpc => "grpc",
-        }
-    }
-}
-
-impl Display for PageserverProtocol {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.write_str(self.scheme())
-    }
-}
-
 impl Endpoint {
    fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result<Endpoint> {
        if !entry.file_type()?.is_dir() {
@@ -660,14 +640,6 @@ impl Endpoint {
        }
    }

-    fn build_pageserver_connstr(pageservers: &[(PageserverProtocol, Host, u16)]) -> String {
-        pageservers
-            .iter()
-            .map(|(scheme, host, port)| format!("{scheme}://no_user@{host}:{port}"))
-            .collect::<Vec<_>>()
-            .join(",")
-    }
-
    /// Map safekeepers ids to the actual connection strings.
    fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
        let mut safekeeper_connstrings = Vec::new();
@@ -708,7 +680,7 @@ impl Endpoint {
        endpoint_storage_addr: String,
        safekeepers_generation: Option<SafekeeperGeneration>,
        safekeepers: Vec<NodeId>,
-        pageservers: Vec<(PageserverProtocol, Host, u16)>,
+        pageserver_conninfo: PageserverConnectionInfo,
        remote_ext_base_url: Option<&String>,
        shard_stripe_size: usize,
        create_test_user: bool,
@@ -727,9 +699,6 @@ impl Endpoint {
            std::fs::remove_dir_all(self.pgdata())?;
        }

-        let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
-        assert!(!pageserver_connstring.is_empty());
-
        let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;

        // check for file remote_extensions_spec.json
@@ -788,7 +757,7 @@ impl Endpoint {
                branch_id: None,
                endpoint_id: Some(self.endpoint_id.clone()),
                mode: self.mode,
-                pageserver_connstring: Some(pageserver_connstring),
+                pageserver_connection_info: Some(pageserver_conninfo),
                safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
                safekeeper_connstrings,
                storage_auth_token: auth_token.clone(),
@@ -997,12 +966,11 @@ impl Endpoint {

    pub async fn reconfigure(
        &self,
-        pageservers: Vec<(PageserverProtocol, Host, u16)>,
+        pageserver_conninfo: Option<PageserverConnectionInfo>,
        stripe_size: Option<ShardStripeSize>,
        safekeepers: Option<Vec<NodeId>>,
+        safekeeper_generation: Option<SafekeeperGeneration>,
    ) -> Result<()> {
-        anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
-
        let (mut spec, compute_ctl_config) = {
            let config_path = self.endpoint_path().join("config.json");
            let file = std::fs::File::open(config_path)?;
@@ -1014,8 +982,15 @@ impl Endpoint {
        let postgresql_conf = self.read_postgresql_conf()?;
        spec.cluster.postgresql_conf = Some(postgresql_conf);

-        let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
-        spec.pageserver_connstring = Some(pageserver_connstr);
+        if let Some(pageserver_conninfo) = pageserver_conninfo {
+            // If pageservers are provided, we need to ensure that they are not empty.
+            // This is a requirement for the compute_ctl configuration.
+            anyhow::ensure!(
+                !pageserver_conninfo.shards.is_empty(),
+                "no pageservers provided"
+            );
+            spec.pageserver_connection_info = Some(pageserver_conninfo);
+        }
        if stripe_size.is_some() {
            spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
        }
@@ -1024,6 +999,9 @@ impl Endpoint {
        if let Some(safekeepers) = safekeepers {
            let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;
            spec.safekeeper_connstrings = safekeeper_connstrings;
+            if let Some(g) = safekeeper_generation {
+                spec.safekeepers_generation = Some(g.into_inner());
+            }
        }

        let client = reqwest::Client::builder()
@@ -1061,6 +1039,24 @@ impl Endpoint {
        }
    }

+    pub async fn reconfigure_pageservers(
+        &self,
+        pageservers: PageserverConnectionInfo,
+        stripe_size: Option<ShardStripeSize>,
+    ) -> Result<()> {
+        self.reconfigure(Some(pageservers), stripe_size, None, None)
+            .await
+    }
+
+    pub async fn reconfigure_safekeepers(
+        &self,
+        safekeepers: Vec<NodeId>,
+        generation: SafekeeperGeneration,
+    ) -> Result<()> {
+        self.reconfigure(None, None, Some(safekeepers), Some(generation))
+            .await
+    }
+
    pub async fn stop(
        &self,
        mode: EndpointTerminateMode,
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -12,6 +12,7 @@ use std::{env, fs};

 use anyhow::{Context, bail};
 use clap::ValueEnum;
+use pageserver_api::config::PostHogConfig;
 use pem::Pem;
 use postgres_backend::AuthType;
 use reqwest::{Certificate, Url};
@@ -211,7 +212,9 @@ pub struct NeonStorageControllerConf {

    pub use_local_compute_notifications: bool,

-    pub timeline_safekeeper_count: Option<i64>,
+    pub timeline_safekeeper_count: Option<usize>,
+
+    pub posthog_config: Option<PostHogConfig>,

    pub kick_secondary_downloads: Option<bool>,
 }
@@ -245,6 +248,7 @@ impl Default for NeonStorageControllerConf {
            use_https_safekeeper_api: false,
            use_local_compute_notifications: true,
            timeline_safekeeper_count: None,
+            posthog_config: None,
            kick_secondary_downloads: None,
        }
    }
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -638,10 +638,28 @@ impl StorageController {
            args.push("--timelines-onto-safekeepers".to_string());
        }

-        if let Some(sk_cnt) = self.config.timeline_safekeeper_count {
+        // neon_local is used in test environments where we often have less than 3 safekeepers.
+        if self.config.timeline_safekeeper_count.is_some() || self.env.safekeepers.len() < 3 {
+            let sk_cnt = self
+                .config
+                .timeline_safekeeper_count
+                .unwrap_or(self.env.safekeepers.len());
+
            args.push(format!("--timeline-safekeeper-count={sk_cnt}"));
        }

+        let mut envs = vec![
+            ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+            ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
+        ];
+
+        if let Some(posthog_config) = &self.config.posthog_config {
+            envs.push((
+                "POSTHOG_CONFIG".to_string(),
+                serde_json::to_string(posthog_config)?,
+            ));
+        }
+
        println!("Starting storage controller");

        background_process::start_process(
@@ -649,10 +667,7 @@ impl StorageController {
            &instance_dir,
            &self.env.storage_controller_bin(),
            args,
-            vec![
-                ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-                ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
-            ],
+            envs,
            background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)),
            &start_args.start_timeout,
            || async {
--- a/docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md
+++ b/docs/rfcs/040-Endpoint-Persistent-Unlogged-Files-Storage.md
@@ -0,0 +1,396 @@
+# Memo: Endpoint Persistent Unlogged Files Storage
+Created on 2024-11-05
+Implemented on N/A
+
+## Summary
+A design for a storage system that allows storage of files required to make
+Neon's Endpoints have a better experience at or after a reboot.
+
+## Motivation
+Several systems inside PostgreSQL (and Neon) need some persistent storage for
+optimal workings across reboots and restarts, but still work without.
+Examples are the query-level statistics files of `pg_stat_statements` in
+`pg_stat/pg_stat_statements.stat`, and `pg_prewarm`'s `autoprewarm.blocks`.
+We need a storage system that can store and manage these files for each
+Endpoint, without necessarily granting users access to an unlimited storage
+device.
+
+## Goals
+- Store known files for Endpoints with reasonable persistence.  
+  _Data loss in this service, while annoying and bad for UX, won't lose any
+  customer's data._
+
+## Non Goals (if relevant)
+- This storage system does not need branching, file versioning, or other such
+  features. The files are as ephemeral to the timeline of the data as the
+  Endpoints that host the data.
+- This storage system does not need to store _all_ user files, only 'known'
+  user files.
+- This storage system does not need to be hosted fully inside Computes.  
+  _Instead, this will be a separate component similar to Pageserver,
+  SafeKeeper, the S3 proxy used for dynamically loaded extensions, etc._
+
+## Impacted components
+- Compute needs new code to load and store these files in its lifetime.
+- Control Plane needs to consider this new storage system when signalling
+  the deletion of an Endpoint, Timeline, or Tenant.
+- Control Plane needs to consider this new storage system when it resets
+  or re-assigns an endpoint's timeline/branch state.
+
+A new service is created: the Endpoint Persistent Unlogged Files Storage
+service.  This could be integrated in e.g. Pageserver or Control Plane, or a
+separately hosted service.
+
+## Proposed implementation
+Endpoint-related data files are managed by a newly designed service (which
+optionally is integrated in an existing service like Pageserver or Control
+Plane), which stores data directly into S3 or any blob storage of choice.
+
+Upon deletion of the Endpoint, or reassignment of the endpoint to a different
+branch, this ephemeral data is dropped: the data stored may not match the
+state of the branch's data after reassignment, and on endpoint deletion the
+data won't have any use to the user.
+
+Compute gets credentials (JWT token with Tenant, Timeline & Endpoint claims)
+which it can use to authenticate to this new service and retrieve and store
+data associated with this endpoint.  This limited scope reduces leaks of data
+across endpoints and timeline resets, and limits the ability of endpoints to
+mess with other endpoints' data.
+
+The path of this endpoint data in S3 is initially as follows:
+
+    s3://<regional-epufs-bucket>/
+      tenants/
+        <hex-tenant-id>/
+          tenants/
+            <hex-timeline-id>/
+              endpoints/
+                <endpoint-id>/
+                  pgdata/
+                    <file_path_in_pgdatadir>
+
+For other blob storages an equivalent or similar path can be constructed.
+
+### Reliability, failure modes and corner cases (if relevant)
+Reliability is important, but not critical to the workings of Neon.  The data
+stored in this service will, when lost, reduce performance, but won't be a
+cause of permanent data loss - only operational metadata is stored.
+
+Most, if not all, blob storage services have sufficiently high persistence
+guarantees to cater our need for persistence and uptime. The only concern with
+blob storages is that the access latency is generally higher than local disk,
+but for the object types stored (cache state, ...) I don't think this will be
+much of an issue.
+
+### Interaction/Sequence diagram (if relevant)
+
+In these diagrams you can replace S3 with any persistent storage device of
+choice, but S3 is chosen as representative name: The well-known and short name
+of AWS' blob storage. Azure Blob Storage should work too, but it has a much
+longer name making it less practical for the diagrams.
+
+Write data:
+
+```http
+POST /tenants/<tenant-id>/timelines/<tl-id>/endpoints/<endpoint-id>/pgdata/<the-pgdata-path>
+Host: epufs.svc.neon.local
+
+<<<
+
+200 OK
+{
+  "version": "<opaque>", # opaque file version token, changes when the file contents change
+  "size": <bytes>,
+}
+```
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant co as Compute
+    participant ep as EPUFS
+    participant s3 as Blob Storage
+
+    co-->ep: Connect with credentials
+    co->>+ep: Store Unlogged Persistent File
+    opt is authenticated
+        ep->>s3: Write UPF to S3
+    end
+    ep->>-co: OK / Failure / Auth Failure
+    co-->ep: Cancel connection
+```
+
+Read data: (optional with cache-relevant request parameters, e.g. If-Modified-Since)
+```http
+GET /tenants/<tenant-id>/timelines/<tl-id>/endpoints/<endpoint-id>/pgdata/<the-pgdata-path>
+Host: epufs.svc.neon.local
+
+<<<
+
+200 OK
+
+<file data>
+```
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant co as Compute
+    participant ep as EPUFS
+    participant s3 as Blob Storage
+
+    co->>+ep: Read Unlogged Persistent File
+    opt is authenticated
+        ep->>+s3: Request UPF from storage
+        s3->>-ep: Receive UPF from storage
+    end
+    ep->>-co: OK(response) / Failure(storage, auth, ...)
+```
+
+Compute Startup:
+```mermaid
+sequenceDiagram
+    autonumber
+    participant co as Compute
+    participant ps as Pageserver
+    participant ep as EPUFS
+    participant es as Extension server
+
+    note over co: Bind endpoint ep-xxx
+    par Get basebackup
+        co->>+ps: Request basebackup @ LSN
+        ps-)ps: Construct basebackup
+        ps->>-co: Receive basebackup TAR @ LSN
+    and Get startup-critical Unlogged Persistent Files
+        co->>+ep: Get all UPFs of endpoint ep-xxx
+        ep-)ep: Retrieve and gather all UPFs
+        ep->>-co: TAR of UPFs
+    and Get startup-critical extensions
+        loop For every startup-critical extension
+            co->>es: Get critical extension
+            es->>co: Receive critical extension
+        end
+    end
+    note over co: Start compute
+```
+
+CPlane ops:
+```http
+DELETE /tenants/<tenant-id>/timelines/<timeline-id>/endpoints/<endpoint-id>
+Host: epufs.svc.neon.local
+
+<<<
+
+200 OK
+{
+  "tenant": "<tenant-id>",
+  "timeline": "<timeline-id>",
+  "endpoint": "<endpoint-id>",
+  "deleted": {
+    "files": <count>,
+    "bytes": <count>,
+  },
+}
+```
+
+```http
+DELETE /tenants/<tenant-id>/timelines/<timeline-id>
+Host: epufs.svc.neon.local
+
+<<<
+
+200 OK
+{
+  "tenant": "<tenant-id>",
+  "timeline": "<timeline-id>",
+  "deleted": {
+    "files": <count>,
+    "bytes": <count>,
+  },
+}
+```
+
+```http
+DELETE /tenants/<tenant-id>
+Host: epufs.svc.neon.local
+
+<<<
+
+200 OK
+{
+  "tenant": "<tenant-id>",
+  "deleted": {
+    "files": <count>,
+    "bytes": <count>,
+  },
+}
+```
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant cp as Control Plane
+    participant ep as EPUFS
+    participant s3 as Blob Storage
+
+    alt Tenant deleted
+        cp-)ep: Tenant deleted
+        loop For every object associated with removed tenant
+            ep->>s3: Remove data of deleted tenant from Storage
+        end
+        opt
+            ep-)cp: Tenant cleanup complete
+        end
+    alt Timeline deleted
+        cp-)ep: Timeline deleted
+        loop For every object associated with removed timeline
+            ep->>s3: Remove data of deleted timeline from Storage
+        end
+        opt
+            ep-)cp: Timeline cleanup complete
+        end
+    else Endpoint reassigned or removed
+        cp->>+ep: Endpoint reassigned
+        loop For every object associated with reassigned/removed endpoint
+            ep->>s3: Remove data from Storage
+        end
+        ep->>-cp: Cleanup complete
+    end
+```
+
+### Scalability (if relevant)
+
+Provisionally:  As this service is going to be part of compute startup, this
+service should be able to quickly respond to all requests.  Therefore this
+service is deployed to every AZ we host Computes in, and Computes communicate
+(generally) only to the EPUFS endpoint of the AZ they're hosted in.
+
+Local caching of frequently restarted endpoints' data or metadata may be
+needed for best performance.  However, due to the regional nature of stored
+data but zonal nature of the service deployment, we should be careful when we
+implement any local caching, as it is possible that computes in AZ 1 will
+update data originally written and thus cached by AZ 2.  Cache version tests
+and invalidation is therefore required if we want to roll out caching to this
+service, which is too broad a scope for an MVC.  This is why caching is left
+out of scope for this RFC, and should be considered separately after this RFC
+is implemented.
+
+### Security implications (if relevant)
+This service must be able to authenticate users at least by Tenant ID,
+Timeline ID and Endpoint ID. This will use the existing JWT infrastructure of
+Compute, which will be upgraded to the extent needed to support Timeline- and
+Endpoint-based claims.
+
+The service requires unlimited access to (a prefix of) a blob storage bucket,
+and thus must be hosted outside the Compute VM sandbox.
+
+A service that generates pre-signed request URLs for Compute to download the
+data from that URL is likely problematic, too:  Compute would be able to write
+unlimited data to the bucket, or exfiltrate this signed URL to get read/write
+access to specific objects in this bucket, which would still effectively give
+users access to the S3 bucket (but with improved access logging).
+
+There may be a use case for transferring data associated with one endpoint to
+another endpoint (e.g. to make one endpoint warm its caches with the state of
+another endpoint), but that's not currently in scope, and specific needs may
+be solved through out-of-line communication of data or pre-signed URLs.
+
+### Unresolved questions (if relevant)
+Caching of files is not in the implementation scope of the document, but
+should at some future point be considered to maximize performance.
+
+## Alternative implementation (if relevant)
+Several ideas have come up to solve this issue:
+
+### Use AUXfile
+One prevalent idea was to WAL-log the files using our AUXfile mechanism.
+
+Benefits:
+
+ We already have this storage mechanism
+
+Demerits:
+
+- It isn't available on read replicas
+- Additional WAL will be consumed during shutdown and after the shutdown
+  checkpoint, which needs PG modifications to work without panics.
+- It increases the data we need to manage in our versioned storage, thus
+  causing higher storage costs with higher retention due to duplication at
+  the storage layer.
+
+### Sign URLs for read/write operations, instead of proxying them
+
+Benefits:
+
+ The service can be implemented with a much reduced IO budget
+
+Demerits:
+
+- Users could get access to these signed credentials
+- Not all blob storage services may implement URL signing
+
+### Give endpoints each their own directly accessed block volume
+
+Benefits:
+
+ Easier to integrate for PostgreSQL
+
+Demerits:
+
+- Little control on data size and contents
+- Potentially problematic as we'd need to store data all across the pgdata
+  directory.
+- EBS is not a good candidate
+   - Attaches in 10s of seconds, if not more; i.e. too cold to start
+   - Shared EBS volumes are a no-go, as you'd have to schedule the endpoint
+     with users of the same EBS volumes, which can't work with VM migration
+   - EBS storage costs are very high (>80$/kilotenant when using a
+     volume/tenant)
+   - EBS volumes can't be mounted across AZ boundaries
+- Bucket per endpoint is unfeasible
+   - S3 buckets are priced at $20/month per 1k, which we could better spend
+     on developers.
+   - Allocating service accounts takes time (100s of ms), and service accounts
+     are a limited resource, too; so they're not a good candidate to allocate
+     on a per-endpoint basis.
+   - Giving credentials limited to prefix has similar issues as the pre-signed
+     URL approach.
+   - Bucket DNS lookup will fill DNS caches and put pressure on DNS lookup
+     much more than our current systems would.
+- Volumes bound by hypervisor are unlikely
+   - This requires significant investment and increased software on the
+     hypervisor.
+   - It is unclear if we can attach volumes after boot, i.e. for pooled
+     instances.
+
+### Put the files into a table
+
+Benefits:
+
+ + Mostly already available in PostgreSQL
+
+Demerits:
+
+ - Uses WAL
+   - Can't be used after shutdown checkpoint
+   - Needs a RW endpoint, and table & catalog access to write to this data
+ - Gets hit with DB size limitations
+ - Depending on user acces:
+   - Inaccessible:  
+     The user doesn't have control over database size caused by
+     these systems.
+   - Accessible:  
+     The user can corrupt these files and cause the system to crash while
+     user-corrupted files are present, thus increasing on-call overhead.
+
+## Definition of Done (if relevant)
+
+This project is done if we have:
+
+- One S3 bucket equivalent per region, which stores this per-endpoint data.
+- A new service endpoint in at least every AZ, which indirectly grants
+  endpoints access to the data stored for these endpoints in these buckets.
+- Compute writes & reads temp-data at shutdown and startup, respectively, for
+  at least the pg_prewarm or lfc_prewarm state files.
+- Cleanup of endpoint data is triggered when the endpoint is deleted or is
+  detached from its current timeline.
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -12,6 +12,7 @@ jsonwebtoken.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 regex.workspace = true
+url.workspace = true

 utils = { path = "../utils" }
 remote_storage = { version = "0.1", path = "../remote_storage/" }
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -4,11 +4,14 @@
 //! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or
 //! compute_ctl can fetch it by calling the control plane's API.
 use std::collections::HashMap;
+use std::fmt::Display;

+use anyhow::anyhow;
 use indexmap::IndexMap;
 use regex::Regex;
 use remote_storage::RemotePath;
 use serde::{Deserialize, Serialize};
+use url::Url;
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

@@ -102,7 +105,11 @@ pub struct ComputeSpec {
    // updated to fill these fields, we can make these non optional.
    pub tenant_id: Option<TenantId>,
    pub timeline_id: Option<TimelineId>,
-    pub pageserver_connstring: Option<String>,
+
+    // Pageserver information can be passed in two different ways:
+    // 1. Here
+    // 2. in cluster.settings. This is legacy, we are switching to method 1.
+    pub pageserver_connection_info: Option<PageserverConnectionInfo>,

    // More neon ids that we expose to the compute_ctl
    // and to postgres as neon extension GUCs.
@@ -202,6 +209,20 @@ pub enum ComputeFeature {
    UnknownFeature,
 }

+/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
+#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
+pub struct PageserverConnectionInfo {
+    pub shards: HashMap<u32, PageserverShardConnectionInfo>,
+
+    pub prefer_grpc: bool,
+}
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
+pub struct PageserverShardConnectionInfo {
+    pub libpq_url: Option<String>,
+    pub grpc_url: Option<String>,
+}
+
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct RemoteExtSpec {
    pub public_extensions: Option<Vec<String>>,
@@ -319,6 +340,12 @@ impl ComputeMode {
    }
 }

+impl Display for ComputeMode {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.to_type_str())
+    }
+}
+
 /// Log level for audit logging
 #[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeAudit {
@@ -429,6 +456,47 @@ pub struct JwksSettings {
    pub jwt_audience: Option<String>,
 }

+/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme.
+#[derive(Clone, Copy, Debug, Default)]
+pub enum PageserverProtocol {
+    /// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme.
+    #[default]
+    Libpq,
+    /// A newer, gRPC-based protocol. Uses grpc:// scheme.
+    Grpc,
+}
+
+impl PageserverProtocol {
+    /// Parses the protocol from a connstring scheme. Defaults to Libpq if no scheme is given.
+    /// Errors if the connstring is an invalid URL.
+    pub fn from_connstring(connstring: &str) -> anyhow::Result<Self> {
+        let scheme = match Url::parse(connstring) {
+            Ok(url) => url.scheme().to_lowercase(),
+            Err(url::ParseError::RelativeUrlWithoutBase) => return Ok(Self::default()),
+            Err(err) => return Err(anyhow!("invalid connstring URL: {err}")),
+        };
+        match scheme.as_str() {
+            "postgresql" | "postgres" => Ok(Self::Libpq),
+            "grpc" => Ok(Self::Grpc),
+            scheme => Err(anyhow!("invalid protocol scheme: {scheme}")),
+        }
+    }
+
+    /// Returns the URL scheme for the protocol, for use in connstrings.
+    pub fn scheme(&self) -> &'static str {
+        match self {
+            Self::Libpq => "postgresql",
+            Self::Grpc => "grpc",
+        }
+    }
+}
+
+impl Display for PageserverProtocol {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.scheme())
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use std::fs::File;
--- a/libs/neon-shmem/Cargo.toml
+++ b/libs/neon-shmem/Cargo.toml
@@ -6,8 +6,13 @@ license.workspace = true

 [dependencies]
 thiserror.workspace = true
-nix.workspace=true
+nix.workspace = true
+spin.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }

+[dev-dependencies]
+rand = "0.9.1"
+rand_distr = "0.5.1"
+
 [target.'cfg(target_os = "macos")'.dependencies]
 tempfile = "3.14.0"
--- a/libs/neon-shmem/src/hash.rs
+++ b/libs/neon-shmem/src/hash.rs
@@ -0,0 +1,367 @@
+//! Hash table implementation on top of 'shmem'
+//!
+//! Features required in the long run by the communicator project:
+//!
+//! [X] Accessible from both Postgres processes and rust threads in the communicator process
+//! [X] Low latency
+//! [ ] Scalable to lots of concurrent accesses (currently uses a single spinlock)
+//! [ ] Resizable
+
+use std::fmt::Debug;
+use std::hash::Hash;
+use std::mem::MaybeUninit;
+use std::ops::Deref;
+
+use crate::shmem::ShmemHandle;
+
+use spin;
+
+mod core;
+
+#[cfg(test)]
+mod tests;
+
+use core::CoreHashMap;
+
+pub enum UpdateAction<V> {
+    Nothing,
+    Insert(V),
+    Remove,
+}
+
+#[derive(Debug)]
+pub struct OutOfMemoryError();
+
+pub struct HashMapInit<'a, K, V> {
+    // Hash table can be allocated in a fixed memory area, or in a resizeable ShmemHandle.
+    shmem_handle: Option<ShmemHandle>,
+    shared_ptr: *mut HashMapShared<'a, K, V>,
+}
+
+pub struct HashMapAccess<'a, K, V> {
+    shmem_handle: Option<ShmemHandle>,
+    shared_ptr: *mut HashMapShared<'a, K, V>,
+}
+
+unsafe impl<'a, K: Sync, V: Sync> Sync for HashMapAccess<'a, K, V> {}
+unsafe impl<'a, K: Send, V: Send> Send for HashMapAccess<'a, K, V> {}
+
+impl<'a, K, V> HashMapInit<'a, K, V> {
+    pub fn attach_writer(self) -> HashMapAccess<'a, K, V> {
+        HashMapAccess {
+            shmem_handle: self.shmem_handle,
+            shared_ptr: self.shared_ptr,
+        }
+    }
+
+    pub fn attach_reader(self) -> HashMapAccess<'a, K, V> {
+        // no difference to attach_writer currently
+        self.attach_writer()
+    }
+}
+
+// This is stored in the shared memory area
+struct HashMapShared<'a, K, V> {
+    inner: spin::RwLock<CoreHashMap<'a, K, V>>,
+}
+
+impl<'a, K, V> HashMapInit<'a, K, V>
+where
+    K: Clone + Hash + Eq,
+{
+    pub fn estimate_size(num_buckets: u32) -> usize {
+        // add some margin to cover alignment etc.
+        CoreHashMap::<K, V>::estimate_size(num_buckets) + size_of::<HashMapShared<K, V>>() + 1000
+    }
+
+    pub fn init_in_fixed_area(
+        num_buckets: u32,
+        area: &'a mut [MaybeUninit<u8>],
+    ) -> HashMapInit<'a, K, V> {
+        Self::init_common(num_buckets, None, area.as_mut_ptr().cast(), area.len())
+    }
+
+    /// Initialize a new hash map in the given shared memory area
+    pub fn init_in_shmem(num_buckets: u32, mut shmem: ShmemHandle) -> HashMapInit<'a, K, V> {
+        let size = Self::estimate_size(num_buckets);
+        shmem
+            .set_size(size)
+            .expect("could not resize shared memory area");
+
+        let ptr = unsafe { shmem.data_ptr.as_mut() };
+        Self::init_common(num_buckets, Some(shmem), ptr, size)
+    }
+
+    fn init_common(
+        num_buckets: u32,
+        shmem_handle: Option<ShmemHandle>,
+        area_ptr: *mut u8,
+        area_len: usize,
+    ) -> HashMapInit<'a, K, V> {
+        // carve out HashMapShared from the area. This does not include the hashmap's dictionary
+        // and buckets.
+        let mut ptr: *mut u8 = area_ptr;
+        ptr = unsafe { ptr.add(ptr.align_offset(align_of::<HashMapShared<K, V>>())) };
+        let shared_ptr: *mut HashMapShared<K, V> = ptr.cast();
+        ptr = unsafe { ptr.add(size_of::<HashMapShared<K, V>>()) };
+
+        // the rest of the space is given to the hash map's dictionary and buckets
+        let remaining_area = unsafe {
+            std::slice::from_raw_parts_mut(ptr, area_len - ptr.offset_from(area_ptr) as usize)
+        };
+
+        let hashmap = CoreHashMap::new(num_buckets, remaining_area);
+        unsafe {
+            std::ptr::write(
+                shared_ptr,
+                HashMapShared {
+                    inner: spin::RwLock::new(hashmap),
+                },
+            );
+        }
+
+        HashMapInit {
+            shmem_handle,
+            shared_ptr,
+        }
+    }
+}
+
+impl<'a, K, V> HashMapAccess<'a, K, V>
+where
+    K: Clone + Hash + Eq,
+{
+    pub fn get<'e>(&'e self, key: &K) -> Option<ValueReadGuard<'e, K, V>> {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let lock_guard = map.inner.read();
+
+        match lock_guard.get(key) {
+            None => None,
+            Some(val_ref) => {
+                let val_ptr = std::ptr::from_ref(val_ref);
+                Some(ValueReadGuard {
+                    _lock_guard: lock_guard,
+                    value: val_ptr,
+                })
+            }
+        }
+    }
+
+    /// Insert a value
+    pub fn insert(&self, key: &K, value: V) -> Result<bool, OutOfMemoryError> {
+        let mut success = None;
+
+        self.update_with_fn(key, |existing| {
+            if existing.is_some() {
+                success = Some(false);
+                UpdateAction::Nothing
+            } else {
+                success = Some(true);
+                UpdateAction::Insert(value)
+            }
+        })?;
+        Ok(success.expect("value_fn not called"))
+    }
+
+    /// Remove value. Returns true if it existed
+    pub fn remove(&self, key: &K) -> bool {
+        let mut result = false;
+        self.update_with_fn(key, |existing| match existing {
+            Some(_) => {
+                result = true;
+                UpdateAction::Remove
+            }
+            None => UpdateAction::Nothing,
+        })
+        .expect("out of memory while removing");
+        result
+    }
+
+    /// Update key using the given function. All the other modifying operations are based on this.
+    pub fn update_with_fn<F>(&self, key: &K, value_fn: F) -> Result<(), OutOfMemoryError>
+    where
+        F: FnOnce(Option<&V>) -> UpdateAction<V>,
+    {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let mut lock_guard = map.inner.write();
+
+        let old_val = lock_guard.get(key);
+        let action = value_fn(old_val);
+        match (old_val, action) {
+            (_, UpdateAction::Nothing) => {}
+            (_, UpdateAction::Insert(new_val)) => {
+                let _ = lock_guard.insert(key, new_val);
+            }
+            (None, UpdateAction::Remove) => panic!("Remove action with no old value"),
+            (Some(_), UpdateAction::Remove) => {
+                let _ = lock_guard.remove(key);
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Update key using the given function. All the other modifying operations are based on this.
+    pub fn update_with_fn_at_bucket<F>(
+        &self,
+        pos: usize,
+        value_fn: F,
+    ) -> Result<(), OutOfMemoryError>
+    where
+        F: FnOnce(Option<&V>) -> UpdateAction<V>,
+    {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let mut lock_guard = map.inner.write();
+
+        let old_val = lock_guard.get_bucket(pos);
+        let action = value_fn(old_val.map(|(_k, v)| v));
+        match (old_val, action) {
+            (_, UpdateAction::Nothing) => {}
+            (_, UpdateAction::Insert(_new_val)) => panic!("cannot insert without key"),
+            (None, UpdateAction::Remove) => panic!("Remove action with no old value"),
+            (Some((key, _value)), UpdateAction::Remove) => {
+                let key = key.clone();
+                let _ = lock_guard.remove(&key);
+            }
+        }
+
+        Ok(())
+    }
+
+    pub fn get_num_buckets(&self) -> usize {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        map.inner.read().get_num_buckets()
+    }
+
+    /// Return the key and value stored in bucket with given index. This can be used to
+    /// iterate through the hash map. (An Iterator might be nicer. The communicator's
+    /// clock algorithm needs to _slowly_ iterate through all buckets with its clock hand,
+    /// without holding a lock. If we switch to an Iterator, it must not hold the lock.)
+    pub fn get_bucket<'e>(&'e self, pos: usize) -> Option<ValueReadGuard<'e, K, V>> {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let lock_guard = map.inner.read();
+
+        match lock_guard.get_bucket(pos) {
+            None => None,
+            Some((_key, val_ref)) => {
+                let val_ptr = std::ptr::from_ref(val_ref);
+                Some(ValueReadGuard {
+                    _lock_guard: lock_guard,
+                    value: val_ptr,
+                })
+            }
+        }
+    }
+
+    // for metrics
+    pub fn get_num_buckets_in_use(&self) -> usize {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        map.inner.read().buckets_in_use as usize
+    }
+
+    /// Grow
+    ///
+    /// 1. grow the underlying shared memory area
+    /// 2. Initialize new buckets. This overwrites the current dictionary
+    /// 3. Recalculate the dictionary
+    pub fn grow(&self, num_buckets: u32) -> Result<(), crate::shmem::Error> {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let mut lock_guard = map.inner.write();
+        let inner = &mut *lock_guard;
+        let old_num_buckets = inner.buckets.len() as u32;
+
+        if num_buckets < old_num_buckets {
+            panic!("grow called with a smaller number of buckets");
+        }
+        if num_buckets == old_num_buckets {
+            return Ok(());
+        }
+        let shmem_handle = self
+            .shmem_handle
+            .as_ref()
+            .expect("grow called on a fixed-size hash table");
+
+        let size_bytes = HashMapInit::<K, V>::estimate_size(num_buckets);
+        shmem_handle.set_size(size_bytes)?;
+        let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
+
+        // Initialize new buckets. The new buckets are linked to the free list. NB: This overwrites
+        // the dictionary!
+        let buckets_ptr = inner.buckets.as_mut_ptr();
+        unsafe {
+            for i in old_num_buckets..num_buckets {
+                let bucket_ptr = buckets_ptr.add(i as usize);
+                bucket_ptr.write(core::Bucket {
+                    hash: 0,
+                    next: if i < num_buckets {
+                        i + 1
+                    } else {
+                        inner.free_head
+                    },
+                    inner: None,
+                });
+            }
+        }
+
+        // Recalculate the dictionary
+        let buckets;
+        let dictionary;
+        unsafe {
+            let buckets_end_ptr = buckets_ptr.add(num_buckets as usize);
+            let dictionary_ptr: *mut u32 = buckets_end_ptr
+                .byte_add(buckets_end_ptr.align_offset(align_of::<u32>()))
+                .cast();
+            let dictionary_size: usize =
+                end_ptr.byte_offset_from(buckets_end_ptr) as usize / size_of::<u32>();
+
+            buckets = std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize);
+            dictionary = std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size);
+        }
+        for item in dictionary.iter_mut() {
+            *item = core::INVALID_POS;
+        }
+
+        #[allow(clippy::needless_range_loop)]
+        for i in 0..old_num_buckets as usize {
+            if buckets[i].inner.is_none() {
+                continue;
+            }
+            let pos: usize = (buckets[i].hash % dictionary.len() as u64) as usize;
+            buckets[i].next = dictionary[pos];
+            dictionary[pos] = i as u32;
+        }
+
+        // Finally, update the CoreHashMap struct
+        inner.dictionary = dictionary;
+        inner.buckets = buckets;
+        inner.free_head = old_num_buckets;
+
+        Ok(())
+    }
+
+    // TODO: Shrinking is a multi-step process that requires co-operation from the caller
+    //
+    // 1. The caller must first call begin_shrink(). That forbids allocation of higher-numbered
+    // buckets.
+    //
+    // 2. Next, the caller must evict all entries in higher-numbered buckets.
+    //
+    // 3. Finally, call finish_shrink(). This recomputes the dictionary and shrinks the underlying
+    //    shmem area
+}
+
+pub struct ValueReadGuard<'a, K, V> {
+    _lock_guard: spin::RwLockReadGuard<'a, CoreHashMap<'a, K, V>>,
+    value: *const V,
+}
+
+impl<'a, K, V> Deref for ValueReadGuard<'a, K, V> {
+    type Target = V;
+
+    fn deref(&self) -> &Self::Target {
+        // SAFETY: The `lock_guard` ensures that the underlying map (and thus the value pointed to
+        // by `value`) remains valid for the lifetime `'a`. The `value` has been obtained from a
+        // valid reference within the map.
+        unsafe { &*self.value }
+    }
+}
--- a/libs/neon-shmem/src/hash/core.rs
+++ b/libs/neon-shmem/src/hash/core.rs
@@ -0,0 +1,233 @@
+//! Simple hash table with chaining
+//!
+//! # Resizing
+//!
+
+use std::hash::{DefaultHasher, Hash, Hasher};
+use std::mem::MaybeUninit;
+
+pub(crate) const INVALID_POS: u32 = u32::MAX;
+
+// Bucket
+pub(crate) struct Bucket<K, V> {
+    pub(crate) hash: u64,
+    pub(crate) next: u32,
+    pub(crate) inner: Option<(K, V)>,
+}
+
+pub(crate) struct CoreHashMap<'a, K, V> {
+    pub(crate) dictionary: &'a mut [u32],
+    pub(crate) buckets: &'a mut [Bucket<K, V>],
+    pub(crate) free_head: u32,
+
+    // metrics
+    pub(crate) buckets_in_use: u32,
+}
+
+pub struct FullError();
+
+impl<'a, K, V> CoreHashMap<'a, K, V>
+where
+    K: Clone + Hash + Eq,
+{
+    const FILL_FACTOR: f32 = 0.60;
+
+    pub fn estimate_size(num_buckets: u32) -> usize {
+        let mut size = 0;
+
+        // buckets
+        size += size_of::<Bucket<K, V>>() * num_buckets as usize;
+
+        // dictionary
+        size += (f32::ceil((size_of::<u32>() * num_buckets as usize) as f32 / Self::FILL_FACTOR))
+            as usize;
+
+        size
+    }
+
+    pub fn new(num_buckets: u32, area: &'a mut [u8]) -> CoreHashMap<'a, K, V> {
+        let len = area.len();
+
+        let mut ptr: *mut u8 = area.as_mut_ptr();
+        let end_ptr: *mut u8 = unsafe { area.as_mut_ptr().add(len) };
+
+        // carve out the buckets
+        ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<Bucket<K, V>>())) };
+        let buckets_ptr = ptr;
+        ptr = unsafe { ptr.add(size_of::<Bucket<K, V>>() * num_buckets as usize) };
+
+        // use remaining space for the dictionary
+        ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<u32>())) };
+        let dictionary_ptr = ptr;
+
+        assert!(ptr.addr() < end_ptr.addr());
+        let dictionary_size = unsafe { end_ptr.byte_offset_from(ptr) / size_of::<u32>() as isize };
+        assert!(dictionary_size > 0);
+
+        // Initialize the buckets
+        let buckets = {
+            let buckets_ptr: *mut MaybeUninit<Bucket<K, V>> = buckets_ptr.cast();
+            let buckets =
+                unsafe { std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize) };
+            for i in 0..buckets.len() {
+                buckets[i].write(Bucket {
+                    hash: 0,
+                    next: if i < buckets.len() - 1 {
+                        i as u32 + 1
+                    } else {
+                        INVALID_POS
+                    },
+                    inner: None,
+                });
+            }
+            // TODO: use std::slice::assume_init_mut() once it stabilizes
+            unsafe { std::slice::from_raw_parts_mut(buckets_ptr.cast(), num_buckets as usize) }
+        };
+
+        // Initialize the dictionary
+        let dictionary = {
+            let dictionary_ptr: *mut MaybeUninit<u32> = dictionary_ptr.cast();
+            let dictionary =
+                unsafe { std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size as usize) };
+
+            for item in dictionary.iter_mut() {
+                item.write(INVALID_POS);
+            }
+            // TODO: use std::slice::assume_init_mut() once it stabilizes
+            unsafe {
+                std::slice::from_raw_parts_mut(dictionary_ptr.cast(), dictionary_size as usize)
+            }
+        };
+
+        CoreHashMap {
+            dictionary,
+            buckets,
+            free_head: 0,
+            buckets_in_use: 0,
+        }
+    }
+
+    pub fn get(&self, key: &K) -> Option<&V> {
+        let mut hasher = DefaultHasher::new();
+        key.hash(&mut hasher);
+        let hash = hasher.finish();
+
+        let mut next = self.dictionary[hash as usize % self.dictionary.len()];
+        loop {
+            if next == INVALID_POS {
+                return None;
+            }
+
+            let bucket = &self.buckets[next as usize];
+            let (bucket_key, bucket_value) = bucket.inner.as_ref().expect("entry is in use");
+            if bucket_key == key {
+                return Some(bucket_value);
+            }
+            next = bucket.next;
+        }
+    }
+
+    pub fn insert(&mut self, key: &K, value: V) -> Result<(), FullError> {
+        let mut hasher = DefaultHasher::new();
+        key.hash(&mut hasher);
+        let hash = hasher.finish();
+
+        let first = self.dictionary[hash as usize % self.dictionary.len()];
+        if first == INVALID_POS {
+            // no existing entry
+            let pos = self.alloc_bucket(key.clone(), value, hash)?;
+            if pos == INVALID_POS {
+                return Err(FullError());
+            }
+            self.dictionary[hash as usize % self.dictionary.len()] = pos;
+            return Ok(());
+        }
+
+        let mut next = first;
+        loop {
+            let bucket = &mut self.buckets[next as usize];
+            let (bucket_key, bucket_value) = bucket.inner.as_mut().expect("entry is in use");
+            if bucket_key == key {
+                // found existing entry, update its value
+                *bucket_value = value;
+                return Ok(());
+            }
+
+            if bucket.next == INVALID_POS {
+                // No existing entry found. Append to the chain
+                let pos = self.alloc_bucket(key.clone(), value, hash)?;
+                if pos == INVALID_POS {
+                    return Err(FullError());
+                }
+                self.buckets[next as usize].next = pos;
+                return Ok(());
+            }
+            next = bucket.next;
+        }
+    }
+
+    pub fn remove(&mut self, key: &K) -> Result<(), FullError> {
+        let mut hasher = DefaultHasher::new();
+        key.hash(&mut hasher);
+        let hash = hasher.finish();
+
+        let mut next = self.dictionary[hash as usize % self.dictionary.len()];
+        let mut prev_pos: u32 = INVALID_POS;
+        loop {
+            if next == INVALID_POS {
+                // no existing entry
+                return Ok(());
+            }
+            let bucket = &mut self.buckets[next as usize];
+            let (bucket_key, _) = bucket.inner.as_mut().expect("entry is in use");
+            if bucket_key == key {
+                // found existing entry, unlink it from the chain
+                if prev_pos == INVALID_POS {
+                    self.dictionary[hash as usize % self.dictionary.len()] = bucket.next;
+                } else {
+                    self.buckets[prev_pos as usize].next = bucket.next;
+                }
+
+                // and add it to the freelist
+                let bucket = &mut self.buckets[next as usize];
+                bucket.hash = 0;
+                bucket.inner = None;
+                bucket.next = self.free_head;
+                self.free_head = next;
+                self.buckets_in_use -= 1;
+                return Ok(());
+            }
+            prev_pos = next;
+            next = bucket.next;
+        }
+    }
+
+    pub fn get_num_buckets(&self) -> usize {
+        self.buckets.len()
+    }
+
+    pub fn get_bucket(&self, pos: usize) -> Option<&(K, V)> {
+        if pos >= self.buckets.len() {
+            return None;
+        }
+
+        self.buckets[pos].inner.as_ref()
+    }
+
+    fn alloc_bucket(&mut self, key: K, value: V, hash: u64) -> Result<u32, FullError> {
+        let pos = self.free_head;
+        if pos == INVALID_POS {
+            return Err(FullError());
+        }
+
+        let bucket = &mut self.buckets[pos as usize];
+        self.free_head = bucket.next;
+        self.buckets_in_use += 1;
+
+        bucket.hash = hash;
+        bucket.next = INVALID_POS;
+        bucket.inner = Some((key, value));
+
+        Ok(pos)
+    }
+}
--- a/libs/neon-shmem/src/hash/tests.rs
+++ b/libs/neon-shmem/src/hash/tests.rs
@@ -0,0 +1,220 @@
+use std::collections::BTreeMap;
+use std::collections::HashSet;
+use std::fmt::{Debug, Formatter};
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use crate::hash::HashMapAccess;
+use crate::hash::HashMapInit;
+use crate::hash::UpdateAction;
+use crate::shmem::ShmemHandle;
+
+use rand::seq::SliceRandom;
+use rand::{Rng, RngCore};
+use rand_distr::Zipf;
+
+const TEST_KEY_LEN: usize = 16;
+
+#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+struct TestKey([u8; TEST_KEY_LEN]);
+
+impl From<&TestKey> for u128 {
+    fn from(val: &TestKey) -> u128 {
+        u128::from_be_bytes(val.0)
+    }
+}
+
+impl From<u128> for TestKey {
+    fn from(val: u128) -> TestKey {
+        TestKey(val.to_be_bytes())
+    }
+}
+
+impl<'a> From<&'a [u8]> for TestKey {
+    fn from(bytes: &'a [u8]) -> TestKey {
+        TestKey(bytes.try_into().unwrap())
+    }
+}
+
+fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
+    const MAX_MEM_SIZE: usize = 10000000;
+    let shmem = ShmemHandle::new("test_inserts", 0, MAX_MEM_SIZE).unwrap();
+
+    let init_struct = HashMapInit::<TestKey, usize>::init_in_shmem(100000, shmem);
+    let w = init_struct.attach_writer();
+
+    for (idx, k) in keys.iter().enumerate() {
+        let res = w.insert(&(*k).into(), idx);
+        assert!(res.is_ok());
+    }
+
+    for (idx, k) in keys.iter().enumerate() {
+        let x = w.get(&(*k).into());
+        let value = x.as_deref().copied();
+        assert_eq!(value, Some(idx));
+    }
+
+    //eprintln!("stats: {:?}", tree_writer.get_statistics());
+}
+
+#[test]
+fn dense() {
+    // This exercises splitting a node with prefix
+    let keys: &[u128] = &[0, 1, 2, 3, 256];
+    test_inserts(keys);
+
+    // Dense keys
+    let mut keys: Vec<u128> = (0..10000).collect();
+    test_inserts(&keys);
+
+    // Do the same in random orders
+    for _ in 1..10 {
+        keys.shuffle(&mut rand::rng());
+        test_inserts(&keys);
+    }
+}
+
+#[test]
+fn sparse() {
+    // sparse keys
+    let mut keys: Vec<TestKey> = Vec::new();
+    let mut used_keys = HashSet::new();
+    for _ in 0..10000 {
+        loop {
+            let key = rand::random::<u128>();
+            if used_keys.contains(&key) {
+                continue;
+            }
+            used_keys.insert(key);
+            keys.push(key.into());
+            break;
+        }
+    }
+    test_inserts(&keys);
+}
+
+struct TestValue(AtomicUsize);
+
+impl TestValue {
+    fn new(val: usize) -> TestValue {
+        TestValue(AtomicUsize::new(val))
+    }
+
+    fn load(&self) -> usize {
+        self.0.load(Ordering::Relaxed)
+    }
+}
+
+impl Clone for TestValue {
+    fn clone(&self) -> TestValue {
+        TestValue::new(self.load())
+    }
+}
+
+impl Debug for TestValue {
+    fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
+        write!(fmt, "{:?}", self.load())
+    }
+}
+
+#[derive(Clone, Debug)]
+struct TestOp(TestKey, Option<usize>);
+
+fn apply_op(
+    op: &TestOp,
+    sut: &HashMapAccess<TestKey, TestValue>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    eprintln!("applying op: {op:?}");
+
+    // apply the change to the shadow tree first
+    let shadow_existing = if let Some(v) = op.1 {
+        shadow.insert(op.0, v)
+    } else {
+        shadow.remove(&op.0)
+    };
+
+    // apply to Art tree
+    sut.update_with_fn(&op.0, |existing| {
+        assert_eq!(existing.map(TestValue::load), shadow_existing);
+
+        match (existing, op.1) {
+            (None, None) => UpdateAction::Nothing,
+            (None, Some(new_val)) => UpdateAction::Insert(TestValue::new(new_val)),
+            (Some(_old_val), None) => UpdateAction::Remove,
+            (Some(old_val), Some(new_val)) => {
+                old_val.0.store(new_val, Ordering::Relaxed);
+                UpdateAction::Nothing
+            }
+        }
+    })
+    .expect("out of memory");
+}
+
+#[test]
+fn random_ops() {
+    const MAX_MEM_SIZE: usize = 10000000;
+    let shmem = ShmemHandle::new("test_inserts", 0, MAX_MEM_SIZE).unwrap();
+
+    let init_struct = HashMapInit::<TestKey, TestValue>::init_in_shmem(100000, shmem);
+    let writer = init_struct.attach_writer();
+
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+
+    let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
+    let mut rng = rand::rng();
+    for i in 0..100000 {
+        let key: TestKey = (rng.sample(distribution) as u128).into();
+
+        let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
+
+        apply_op(&op, &writer, &mut shadow);
+
+        if i % 1000 == 0 {
+            eprintln!("{i} ops processed");
+            //eprintln!("stats: {:?}", tree_writer.get_statistics());
+            //test_iter(&tree_writer, &shadow);
+        }
+    }
+}
+
+#[test]
+fn test_grow() {
+    const MEM_SIZE: usize = 10000000;
+    let shmem = ShmemHandle::new("test_grow", 0, MEM_SIZE).unwrap();
+
+    let init_struct = HashMapInit::<TestKey, TestValue>::init_in_shmem(1000, shmem);
+    let writer = init_struct.attach_writer();
+
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+
+    let mut rng = rand::rng();
+    for i in 0..10000 {
+        let key: TestKey = ((rng.next_u32() % 1000) as u128).into();
+
+        let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
+
+        apply_op(&op, &writer, &mut shadow);
+
+        if i % 1000 == 0 {
+            eprintln!("{i} ops processed");
+            //eprintln!("stats: {:?}", tree_writer.get_statistics());
+            //test_iter(&tree_writer, &shadow);
+        }
+    }
+
+    writer.grow(1500).unwrap();
+
+    for i in 0..10000 {
+        let key: TestKey = ((rng.next_u32() % 1500) as u128).into();
+
+        let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
+
+        apply_op(&op, &writer, &mut shadow);
+
+        if i % 1000 == 0 {
+            eprintln!("{i} ops processed");
+            //eprintln!("stats: {:?}", tree_writer.get_statistics());
+            //test_iter(&tree_writer, &shadow);
+        }
+    }
+}
--- a/libs/neon-shmem/src/lib.rs
+++ b/libs/neon-shmem/src/lib.rs
@@ -1,418 +1,4 @@
 //! Shared memory utilities for neon communicator

-use std::num::NonZeroUsize;
-use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
-use std::ptr::NonNull;
-use std::sync::atomic::{AtomicUsize, Ordering};
-
-use nix::errno::Errno;
-use nix::sys::mman::MapFlags;
-use nix::sys::mman::ProtFlags;
-use nix::sys::mman::mmap as nix_mmap;
-use nix::sys::mman::munmap as nix_munmap;
-use nix::unistd::ftruncate as nix_ftruncate;
-
-/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
-/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
-/// specified at creation.
-///
-/// The area is backed by an anonymous file created with memfd_create(). The full address space for
-/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
-/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
-/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
-/// future.
-pub struct ShmemHandle {
-    /// memfd file descriptor
-    fd: OwnedFd,
-
-    max_size: usize,
-
-    // Pointer to the beginning of the shared memory area. The header is stored there.
-    shared_ptr: NonNull<SharedStruct>,
-
-    // Pointer to the beginning of the user data
-    pub data_ptr: NonNull<u8>,
-}
-
-/// This is stored at the beginning in the shared memory area.
-struct SharedStruct {
-    max_size: usize,
-
-    /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
-    current_size: AtomicUsize,
-}
-
-const RESIZE_IN_PROGRESS: usize = 1 << 63;
-
-const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
-
-/// Error type returned by the ShmemHandle functions.
-#[derive(thiserror::Error, Debug)]
-#[error("{msg}: {errno}")]
-pub struct Error {
-    pub msg: String,
-    pub errno: Errno,
-}
-
-impl Error {
-    fn new(msg: &str, errno: Errno) -> Error {
-        Error {
-            msg: msg.to_string(),
-            errno,
-        }
-    }
-}
-
-impl ShmemHandle {
-    /// Create a new shared memory area. To communicate between processes, the processes need to be
-    /// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
-    ///
-    /// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
-    /// processes can continue using it, however.
-    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
-        // create the backing anonymous file.
-        let fd = create_backing_file(name)?;
-
-        Self::new_with_fd(fd, initial_size, max_size)
-    }
-
-    fn new_with_fd(
-        fd: OwnedFd,
-        initial_size: usize,
-        max_size: usize,
-    ) -> Result<ShmemHandle, Error> {
-        // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
-        // is a little larger than this because of the SharedStruct header. Make the upper limit
-        // somewhat smaller than that, because with anything close to that, you'll run out of
-        // memory anyway.
-        if max_size >= 1 << 48 {
-            panic!("max size {max_size} too large");
-        }
-        if initial_size > max_size {
-            panic!("initial size {initial_size} larger than max size {max_size}");
-        }
-
-        // The actual initial / max size is the one given by the caller, plus the size of
-        // 'SharedStruct'.
-        let initial_size = HEADER_SIZE + initial_size;
-        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
-
-        // Reserve address space for it with mmap
-        //
-        // TODO: Use MAP_HUGETLB if possible
-        let start_ptr = unsafe {
-            nix_mmap(
-                None,
-                max_size,
-                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
-                MapFlags::MAP_SHARED,
-                &fd,
-                0,
-            )
-        }
-        .map_err(|e| Error::new("mmap failed: {e}", e))?;
-
-        // Reserve space for the initial size
-        enlarge_file(fd.as_fd(), initial_size as u64)?;
-
-        // Initialize the header
-        let shared: NonNull<SharedStruct> = start_ptr.cast();
-        unsafe {
-            shared.write(SharedStruct {
-                max_size: max_size.into(),
-                current_size: AtomicUsize::new(initial_size),
-            })
-        };
-
-        // The user data begins after the header
-        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
-
-        Ok(ShmemHandle {
-            fd,
-            max_size: max_size.into(),
-            shared_ptr: shared,
-            data_ptr,
-        })
-    }
-
-    // return reference to the header
-    fn shared(&self) -> &SharedStruct {
-        unsafe { self.shared_ptr.as_ref() }
-    }
-
-    /// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
-    /// when creating the area.
-    ///
-    /// This may only be called from one process/thread concurrently. We detect that case
-    /// and return an Error.
-    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
-        let new_size = new_size + HEADER_SIZE;
-        let shared = self.shared();
-
-        if new_size > self.max_size {
-            panic!(
-                "new size ({} is greater than max size ({})",
-                new_size, self.max_size
-            );
-        }
-        assert_eq!(self.max_size, shared.max_size);
-
-        // Lock the area by setting the bit in 'current_size'
-        //
-        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
-        // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
-        // since this is not performance-critical, better safe than sorry .
-        let mut old_size = shared.current_size.load(Ordering::Acquire);
-        loop {
-            if (old_size & RESIZE_IN_PROGRESS) != 0 {
-                return Err(Error::new(
-                    "concurrent resize detected",
-                    Errno::UnknownErrno,
-                ));
-            }
-            match shared.current_size.compare_exchange(
-                old_size,
-                new_size,
-                Ordering::Acquire,
-                Ordering::Relaxed,
-            ) {
-                Ok(_) => break,
-                Err(x) => old_size = x,
-            }
-        }
-
-        // Ok, we got the lock.
-        //
-        // NB: If anything goes wrong, we *must* clear the bit!
-        let result = {
-            use std::cmp::Ordering::{Equal, Greater, Less};
-            match new_size.cmp(&old_size) {
-                Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
-                    Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
-                }),
-                Equal => Ok(()),
-                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
-            }
-        };
-
-        // Unlock
-        shared.current_size.store(
-            if result.is_ok() { new_size } else { old_size },
-            Ordering::Release,
-        );
-
-        result
-    }
-
-    /// Returns the current user-visible size of the shared memory segment.
-    ///
-    /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
-    /// responsibility not to access the area beyond the current size.
-    pub fn current_size(&self) -> usize {
-        let total_current_size =
-            self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
-        total_current_size - HEADER_SIZE
-    }
-}
-
-impl Drop for ShmemHandle {
-    fn drop(&mut self) {
-        // SAFETY: The pointer was obtained from mmap() with the given size.
-        // We unmap the entire region.
-        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
-        // The fd is dropped automatically by OwnedFd.
-    }
-}
-
-/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
-/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
-/// development and testing, but in production we want the file to stay in memory.
-///
-/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
-#[allow(unused_variables)]
-fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
-    #[cfg(not(target_os = "macos"))]
-    {
-        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
-            .map_err(|e| Error::new("memfd_create failed: {e}", e))
-    }
-    #[cfg(target_os = "macos")]
-    {
-        let file = tempfile::tempfile().map_err(|e| {
-            Error::new(
-                "could not create temporary file to back shmem area: {e}",
-                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
-            )
-        })?;
-        Ok(OwnedFd::from(file))
-    }
-}
-
-fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
-    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
-    // we don't get a segfault later when trying to actually use it.
-    #[cfg(not(target_os = "macos"))]
-    {
-        nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
-            Error::new(
-                "could not grow shmem segment, posix_fallocate failed: {e}",
-                e,
-            )
-        })
-    }
-    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
-    #[cfg(target_os = "macos")]
-    {
-        nix::unistd::ftruncate(fd, size as i64)
-            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    use nix::unistd::ForkResult;
-    use std::ops::Range;
-
-    /// check that all bytes in given range have the expected value.
-    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
-        for i in range {
-            let b = unsafe { *(ptr.add(i)) };
-            assert_eq!(expected, b, "unexpected byte at offset {i}");
-        }
-    }
-
-    /// Write 'b' to all bytes in the given range
-    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
-        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
-    }
-
-    // simple single-process test of growing and shrinking
-    #[test]
-    fn test_shmem_resize() -> Result<(), Error> {
-        let max_size = 1024 * 1024;
-        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
-
-        assert_eq!(init_struct.current_size(), 0);
-
-        // Initial grow
-        let size1 = 10000;
-        init_struct.set_size(size1).unwrap();
-        assert_eq!(init_struct.current_size(), size1);
-
-        // Write some data
-        let data_ptr = init_struct.data_ptr.as_ptr();
-        write_range(data_ptr, 0xAA, 0..size1);
-        assert_range(data_ptr, 0xAA, 0..size1);
-
-        // Shrink
-        let size2 = 5000;
-        init_struct.set_size(size2).unwrap();
-        assert_eq!(init_struct.current_size(), size2);
-
-        // Grow again
-        let size3 = 20000;
-        init_struct.set_size(size3).unwrap();
-        assert_eq!(init_struct.current_size(), size3);
-
-        // Try to read it. The area that was shrunk and grown again should read as all zeros now
-        assert_range(data_ptr, 0xAA, 0..5000);
-        assert_range(data_ptr, 0, 5000..size1);
-
-        // Try to grow beyond max_size
-        //let size4 = max_size + 1;
-        //assert!(init_struct.set_size(size4).is_err());
-
-        // Dropping init_struct should unmap the memory
-        drop(init_struct);
-
-        Ok(())
-    }
-
-    /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
-    /// but is stored in the shared memory area and works across processes. It's implemented by
-    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
-    struct SimpleBarrier {
-        num_procs: usize,
-        count: AtomicUsize,
-    }
-
-    impl SimpleBarrier {
-        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
-            unsafe {
-                *ptr = SimpleBarrier {
-                    num_procs,
-                    count: AtomicUsize::new(0),
-                }
-            }
-        }
-
-        pub fn wait(&self) {
-            let old = self.count.fetch_add(1, Ordering::Relaxed);
-
-            let generation = old / self.num_procs;
-
-            let mut current = old + 1;
-            while current < (generation + 1) * self.num_procs {
-                std::thread::sleep(std::time::Duration::from_millis(10));
-                current = self.count.load(Ordering::Relaxed);
-            }
-        }
-    }
-
-    #[test]
-    fn test_multi_process() {
-        // Initialize
-        let max_size = 1_000_000_000_000;
-        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
-        let ptr = init_struct.data_ptr.as_ptr();
-
-        // Store the SimpleBarrier in the first 1k of the area.
-        init_struct.set_size(10000).unwrap();
-        let barrier_ptr: *mut SimpleBarrier = unsafe {
-            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
-                .cast()
-        };
-        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
-        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
-
-        // Fork another test process. The code after this runs in both processes concurrently.
-        let fork_result = unsafe { nix::unistd::fork().unwrap() };
-
-        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
-        if fork_result.is_parent() {
-            write_range(ptr, 0xAA, 1000..2000);
-        } else {
-            write_range(ptr, 0xBB, 2000..3000);
-        }
-        barrier.wait();
-        // Verify the contents. (in both processes)
-        assert_range(ptr, 0xAA, 1000..2000);
-        assert_range(ptr, 0xBB, 2000..3000);
-
-        // Grow, from the child this time
-        let size = 10_000_000;
-        if !fork_result.is_parent() {
-            init_struct.set_size(size).unwrap();
-        }
-        barrier.wait();
-
-        // make some writes at the end
-        if fork_result.is_parent() {
-            write_range(ptr, 0xAA, (size - 10)..size);
-        } else {
-            write_range(ptr, 0xBB, (size - 20)..(size - 10));
-        }
-        barrier.wait();
-
-        // Verify the contents. (This runs in both processes)
-        assert_range(ptr, 0, (size - 1000)..(size - 20));
-        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
-        assert_range(ptr, 0xAA, (size - 10)..size);
-
-        if let ForkResult::Parent { child } = fork_result {
-            nix::sys::wait::waitpid(child, None).unwrap();
-        }
-    }
-}
+pub mod hash;
+pub mod shmem;
--- a/libs/neon-shmem/src/shmem.rs
+++ b/libs/neon-shmem/src/shmem.rs
@@ -0,0 +1,418 @@
+//! Dynamically resizable contiguous chunk of shared memory
+
+use std::num::NonZeroUsize;
+use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use nix::errno::Errno;
+use nix::sys::mman::MapFlags;
+use nix::sys::mman::ProtFlags;
+use nix::sys::mman::mmap as nix_mmap;
+use nix::sys::mman::munmap as nix_munmap;
+use nix::unistd::ftruncate as nix_ftruncate;
+
+/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
+/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
+/// specified at creation.
+///
+/// The area is backed by an anonymous file created with memfd_create(). The full address space for
+/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
+/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
+/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
+/// future.
+pub struct ShmemHandle {
+    /// memfd file descriptor
+    fd: OwnedFd,
+
+    max_size: usize,
+
+    // Pointer to the beginning of the shared memory area. The header is stored there.
+    shared_ptr: NonNull<SharedStruct>,
+
+    // Pointer to the beginning of the user data
+    pub data_ptr: NonNull<u8>,
+}
+
+/// This is stored at the beginning in the shared memory area.
+struct SharedStruct {
+    max_size: usize,
+
+    /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
+    current_size: AtomicUsize,
+}
+
+const RESIZE_IN_PROGRESS: usize = 1 << 63;
+
+const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
+
+/// Error type returned by the ShmemHandle functions.
+#[derive(thiserror::Error, Debug)]
+#[error("{msg}: {errno}")]
+pub struct Error {
+    pub msg: String,
+    pub errno: Errno,
+}
+
+impl Error {
+    fn new(msg: &str, errno: Errno) -> Error {
+        Error {
+            msg: msg.to_string(),
+            errno,
+        }
+    }
+}
+
+impl ShmemHandle {
+    /// Create a new shared memory area. To communicate between processes, the processes need to be
+    /// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
+    ///
+    /// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
+    /// processes can continue using it, however.
+    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
+        // create the backing anonymous file.
+        let fd = create_backing_file(name)?;
+
+        Self::new_with_fd(fd, initial_size, max_size)
+    }
+
+    fn new_with_fd(
+        fd: OwnedFd,
+        initial_size: usize,
+        max_size: usize,
+    ) -> Result<ShmemHandle, Error> {
+        // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
+        // is a little larger than this because of the SharedStruct header. Make the upper limit
+        // somewhat smaller than that, because with anything close to that, you'll run out of
+        // memory anyway.
+        if max_size >= 1 << 48 {
+            panic!("max size {max_size} too large");
+        }
+        if initial_size > max_size {
+            panic!("initial size {initial_size} larger than max size {max_size}");
+        }
+
+        // The actual initial / max size is the one given by the caller, plus the size of
+        // 'SharedStruct'.
+        let initial_size = HEADER_SIZE + initial_size;
+        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
+
+        // Reserve address space for it with mmap
+        //
+        // TODO: Use MAP_HUGETLB if possible
+        let start_ptr = unsafe {
+            nix_mmap(
+                None,
+                max_size,
+                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
+                MapFlags::MAP_SHARED,
+                &fd,
+                0,
+            )
+        }
+        .map_err(|e| Error::new("mmap failed: {e}", e))?;
+
+        // Reserve space for the initial size
+        enlarge_file(fd.as_fd(), initial_size as u64)?;
+
+        // Initialize the header
+        let shared: NonNull<SharedStruct> = start_ptr.cast();
+        unsafe {
+            shared.write(SharedStruct {
+                max_size: max_size.into(),
+                current_size: AtomicUsize::new(initial_size),
+            })
+        };
+
+        // The user data begins after the header
+        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
+
+        Ok(ShmemHandle {
+            fd,
+            max_size: max_size.into(),
+            shared_ptr: shared,
+            data_ptr,
+        })
+    }
+
+    // return reference to the header
+    fn shared(&self) -> &SharedStruct {
+        unsafe { self.shared_ptr.as_ref() }
+    }
+
+    /// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
+    /// when creating the area.
+    ///
+    /// This may only be called from one process/thread concurrently. We detect that case
+    /// and return an Error.
+    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
+        let new_size = new_size + HEADER_SIZE;
+        let shared = self.shared();
+
+        if new_size > self.max_size {
+            panic!(
+                "new size ({} is greater than max size ({})",
+                new_size, self.max_size
+            );
+        }
+        assert_eq!(self.max_size, shared.max_size);
+
+        // Lock the area by setting the bit in 'current_size'
+        //
+        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
+        // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
+        // since this is not performance-critical, better safe than sorry .
+        let mut old_size = shared.current_size.load(Ordering::Acquire);
+        loop {
+            if (old_size & RESIZE_IN_PROGRESS) != 0 {
+                return Err(Error::new(
+                    "concurrent resize detected",
+                    Errno::UnknownErrno,
+                ));
+            }
+            match shared.current_size.compare_exchange(
+                old_size,
+                new_size,
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => break,
+                Err(x) => old_size = x,
+            }
+        }
+
+        // Ok, we got the lock.
+        //
+        // NB: If anything goes wrong, we *must* clear the bit!
+        let result = {
+            use std::cmp::Ordering::{Equal, Greater, Less};
+            match new_size.cmp(&old_size) {
+                Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
+                    Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
+                }),
+                Equal => Ok(()),
+                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
+            }
+        };
+
+        // Unlock
+        shared.current_size.store(
+            if result.is_ok() { new_size } else { old_size },
+            Ordering::Release,
+        );
+
+        result
+    }
+
+    /// Returns the current user-visible size of the shared memory segment.
+    ///
+    /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
+    /// responsibility not to access the area beyond the current size.
+    pub fn current_size(&self) -> usize {
+        let total_current_size =
+            self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
+        total_current_size - HEADER_SIZE
+    }
+}
+
+impl Drop for ShmemHandle {
+    fn drop(&mut self) {
+        // SAFETY: The pointer was obtained from mmap() with the given size.
+        // We unmap the entire region.
+        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
+        // The fd is dropped automatically by OwnedFd.
+    }
+}
+
+/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
+/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
+/// development and testing, but in production we want the file to stay in memory.
+///
+/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
+#[allow(unused_variables)]
+fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
+            .map_err(|e| Error::new("memfd_create failed: {e}", e))
+    }
+    #[cfg(target_os = "macos")]
+    {
+        let file = tempfile::tempfile().map_err(|e| {
+            Error::new(
+                "could not create temporary file to back shmem area: {e}",
+                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
+            )
+        })?;
+        Ok(OwnedFd::from(file))
+    }
+}
+
+fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
+    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
+    // we don't get a segfault later when trying to actually use it.
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
+            Error::new(
+                "could not grow shmem segment, posix_fallocate failed: {e}",
+                e,
+            )
+        })
+    }
+    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
+    #[cfg(target_os = "macos")]
+    {
+        nix::unistd::ftruncate(fd, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use nix::unistd::ForkResult;
+    use std::ops::Range;
+
+    /// check that all bytes in given range have the expected value.
+    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
+        for i in range {
+            let b = unsafe { *(ptr.add(i)) };
+            assert_eq!(expected, b, "unexpected byte at offset {i}");
+        }
+    }
+
+    /// Write 'b' to all bytes in the given range
+    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
+        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
+    }
+
+    // simple single-process test of growing and shrinking
+    #[test]
+    fn test_shmem_resize() -> Result<(), Error> {
+        let max_size = 1024 * 1024;
+        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
+
+        assert_eq!(init_struct.current_size(), 0);
+
+        // Initial grow
+        let size1 = 10000;
+        init_struct.set_size(size1).unwrap();
+        assert_eq!(init_struct.current_size(), size1);
+
+        // Write some data
+        let data_ptr = init_struct.data_ptr.as_ptr();
+        write_range(data_ptr, 0xAA, 0..size1);
+        assert_range(data_ptr, 0xAA, 0..size1);
+
+        // Shrink
+        let size2 = 5000;
+        init_struct.set_size(size2).unwrap();
+        assert_eq!(init_struct.current_size(), size2);
+
+        // Grow again
+        let size3 = 20000;
+        init_struct.set_size(size3).unwrap();
+        assert_eq!(init_struct.current_size(), size3);
+
+        // Try to read it. The area that was shrunk and grown again should read as all zeros now
+        assert_range(data_ptr, 0xAA, 0..5000);
+        assert_range(data_ptr, 0, 5000..size1);
+
+        // Try to grow beyond max_size
+        //let size4 = max_size + 1;
+        //assert!(init_struct.set_size(size4).is_err());
+
+        // Dropping init_struct should unmap the memory
+        drop(init_struct);
+
+        Ok(())
+    }
+
+    /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
+    /// but is stored in the shared memory area and works across processes. It's implemented by
+    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
+    struct SimpleBarrier {
+        num_procs: usize,
+        count: AtomicUsize,
+    }
+
+    impl SimpleBarrier {
+        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
+            unsafe {
+                *ptr = SimpleBarrier {
+                    num_procs,
+                    count: AtomicUsize::new(0),
+                }
+            }
+        }
+
+        pub fn wait(&self) {
+            let old = self.count.fetch_add(1, Ordering::Relaxed);
+
+            let generation = old / self.num_procs;
+
+            let mut current = old + 1;
+            while current < (generation + 1) * self.num_procs {
+                std::thread::sleep(std::time::Duration::from_millis(10));
+                current = self.count.load(Ordering::Relaxed);
+            }
+        }
+    }
+
+    #[test]
+    fn test_multi_process() {
+        // Initialize
+        let max_size = 1_000_000_000_000;
+        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
+        let ptr = init_struct.data_ptr.as_ptr();
+
+        // Store the SimpleBarrier in the first 1k of the area.
+        init_struct.set_size(10000).unwrap();
+        let barrier_ptr: *mut SimpleBarrier = unsafe {
+            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
+                .cast()
+        };
+        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
+        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
+
+        // Fork another test process. The code after this runs in both processes concurrently.
+        let fork_result = unsafe { nix::unistd::fork().unwrap() };
+
+        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, 1000..2000);
+        } else {
+            write_range(ptr, 0xBB, 2000..3000);
+        }
+        barrier.wait();
+        // Verify the contents. (in both processes)
+        assert_range(ptr, 0xAA, 1000..2000);
+        assert_range(ptr, 0xBB, 2000..3000);
+
+        // Grow, from the child this time
+        let size = 10_000_000;
+        if !fork_result.is_parent() {
+            init_struct.set_size(size).unwrap();
+        }
+        barrier.wait();
+
+        // make some writes at the end
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, (size - 10)..size);
+        } else {
+            write_range(ptr, 0xBB, (size - 20)..(size - 10));
+        }
+        barrier.wait();
+
+        // Verify the contents. (This runs in both processes)
+        assert_range(ptr, 0, (size - 1000)..(size - 20));
+        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
+        assert_range(ptr, 0xAA, (size - 10)..size);
+
+        if let ForkResult::Parent { child } = fork_result {
+            nix::sys::wait::waitpid(child, None).unwrap();
+        }
+    }
+}
--- a/libs/neonart/Cargo.toml
+++ b/libs/neonart/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "neonart"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+crossbeam-utils.workspace = true
+spin.workspace = true
+tracing.workspace = true
+
+[dev-dependencies]
+rand = "0.9.1"
+rand_distr = "0.5.1"
--- a/libs/neonart/src/algorithm.rs
+++ b/libs/neonart/src/algorithm.rs
@@ -0,0 +1,599 @@
+mod lock_and_version;
+pub(crate) mod node_ptr;
+mod node_ref;
+
+use std::vec::Vec;
+
+use crate::algorithm::lock_and_version::ConcurrentUpdateError;
+use crate::algorithm::node_ptr::MAX_PREFIX_LEN;
+use crate::algorithm::node_ref::{NewNodeRef, NodeRef, ReadLockedNodeRef, WriteLockedNodeRef};
+use crate::allocator::OutOfMemoryError;
+
+use crate::TreeWriteGuard;
+use crate::UpdateAction;
+use crate::allocator::ArtAllocator;
+use crate::epoch::EpochPin;
+use crate::{Key, Value};
+
+pub(crate) type RootPtr<V> = node_ptr::NodePtr<V>;
+
+#[derive(Debug)]
+pub enum ArtError {
+    ConcurrentUpdate, // need to retry
+    OutOfMemory,
+}
+
+impl From<ConcurrentUpdateError> for ArtError {
+    fn from(_: ConcurrentUpdateError) -> ArtError {
+        ArtError::ConcurrentUpdate
+    }
+}
+
+impl From<OutOfMemoryError> for ArtError {
+    fn from(_: OutOfMemoryError) -> ArtError {
+        ArtError::OutOfMemory
+    }
+}
+
+pub fn new_root<V: Value>(
+    allocator: &impl ArtAllocator<V>,
+) -> Result<RootPtr<V>, OutOfMemoryError> {
+    node_ptr::new_root(allocator)
+}
+
+pub(crate) fn search<'e, K: Key, V: Value>(
+    key: &K,
+    root: RootPtr<V>,
+    epoch_pin: &'e EpochPin,
+) -> Option<&'e V> {
+    loop {
+        let root_ref = NodeRef::from_root_ptr(root);
+        if let Ok(result) = lookup_recurse(key.as_bytes(), root_ref, None, epoch_pin) {
+            break result;
+        }
+        // retry
+    }
+}
+
+pub(crate) fn iter_next<'e, V: Value>(
+    key: &[u8],
+    root: RootPtr<V>,
+    epoch_pin: &'e EpochPin,
+) -> Option<(Vec<u8>, &'e V)> {
+    loop {
+        let mut path = Vec::new();
+        let root_ref = NodeRef::from_root_ptr(root);
+
+        match next_recurse(key, &mut path, root_ref, epoch_pin) {
+            Ok(Some(v)) => {
+                assert_eq!(path.len(), key.len());
+                break Some((path, v));
+            }
+            Ok(None) => break None,
+            Err(ConcurrentUpdateError()) => {
+                // retry
+                continue;
+            }
+        }
+    }
+}
+
+pub(crate) fn update_fn<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>, F>(
+    key: &K,
+    value_fn: F,
+    root: RootPtr<V>,
+    guard: &'g mut TreeWriteGuard<'e, K, V, A>,
+) -> Result<(), OutOfMemoryError>
+where
+    F: FnOnce(Option<&V>) -> UpdateAction<V>,
+{
+    let value_fn_cell = std::cell::Cell::new(Some(value_fn));
+    loop {
+        let root_ref = NodeRef::from_root_ptr(root);
+        let this_value_fn = |arg: Option<&V>| value_fn_cell.take().unwrap()(arg);
+        let key_bytes = key.as_bytes();
+
+        match update_recurse(
+            key_bytes,
+            this_value_fn,
+            root_ref,
+            None,
+            None,
+            guard,
+            0,
+            key_bytes,
+        ) {
+            Ok(()) => break Ok(()),
+            Err(ArtError::ConcurrentUpdate) => {
+                continue; // retry
+            }
+            Err(ArtError::OutOfMemory) => break Err(OutOfMemoryError()),
+        }
+    }
+}
+
+// Error means you must retry.
+//
+// This corresponds to the 'lookupOpt' function in the paper
+#[allow(clippy::only_used_in_recursion)]
+fn lookup_recurse<'e, V: Value>(
+    key: &[u8],
+    node: NodeRef<'e, V>,
+    parent: Option<ReadLockedNodeRef<V>>,
+    epoch_pin: &'e EpochPin,
+) -> Result<Option<&'e V>, ConcurrentUpdateError> {
+    let rnode = node.read_lock_or_restart()?;
+    if let Some(parent) = parent {
+        parent.read_unlock_or_restart()?;
+    }
+
+    // check if the prefix matches, may increment level
+    let prefix_len = if let Some(prefix_len) = rnode.prefix_matches(key) {
+        prefix_len
+    } else {
+        rnode.read_unlock_or_restart()?;
+        return Ok(None);
+    };
+
+    if rnode.is_leaf() {
+        assert_eq!(key.len(), prefix_len);
+        let vptr = rnode.get_leaf_value_ptr()?;
+        // safety: It's OK to return a ref of the pointer because we checked the version
+        // and the lifetime of 'epoch_pin' enforces that the reference is only accessible
+        // as long as the epoch is pinned.
+        let v = unsafe { vptr.as_ref().unwrap() };
+        return Ok(Some(v));
+    }
+
+    let key = &key[prefix_len..];
+
+    // find child (or leaf value)
+    let next_node = rnode.find_child_or_restart(key[0])?;
+
+    match next_node {
+        None => Ok(None), // key not found
+        Some(child) => lookup_recurse(&key[1..], child, Some(rnode), epoch_pin),
+    }
+}
+
+#[allow(clippy::only_used_in_recursion)]
+fn next_recurse<'e, V: Value>(
+    min_key: &[u8],
+    path: &mut Vec<u8>,
+    node: NodeRef<'e, V>,
+    epoch_pin: &'e EpochPin,
+) -> Result<Option<&'e V>, ConcurrentUpdateError> {
+    let rnode = node.read_lock_or_restart()?;
+    let prefix = rnode.get_prefix();
+    if !prefix.is_empty() {
+        path.extend_from_slice(prefix);
+    }
+
+    use std::cmp::Ordering;
+    let comparison = path.as_slice().cmp(&min_key[0..path.len()]);
+    if comparison == Ordering::Less {
+        rnode.read_unlock_or_restart()?;
+        return Ok(None);
+    }
+
+    if rnode.is_leaf() {
+        assert_eq!(path.len(), min_key.len());
+        let vptr = rnode.get_leaf_value_ptr()?;
+        // safety: It's OK to return a ref of the pointer because we checked the version
+        // and the lifetime of 'epoch_pin' enforces that the reference is only accessible
+        // as long as the epoch is pinned.
+        let v = unsafe { vptr.as_ref().unwrap() };
+        return Ok(Some(v));
+    }
+
+    let mut min_key_byte = match comparison {
+        Ordering::Less => unreachable!(), // checked this above already
+        Ordering::Equal => min_key[path.len()],
+        Ordering::Greater => 0,
+    };
+
+    loop {
+        match rnode.find_next_child_or_restart(min_key_byte)? {
+            None => {
+                return Ok(None);
+            }
+            Some((key_byte, child_ref)) => {
+                let path_len = path.len();
+                path.push(key_byte);
+                let result = next_recurse(min_key, path, child_ref, epoch_pin)?;
+                if result.is_some() {
+                    return Ok(result);
+                }
+                if key_byte == u8::MAX {
+                    return Ok(None);
+                }
+                path.truncate(path_len);
+                min_key_byte = key_byte + 1;
+            }
+        }
+    }
+}
+
+// This corresponds to the 'insertOpt' function in the paper
+#[allow(clippy::only_used_in_recursion)]
+#[allow(clippy::too_many_arguments)]
+pub(crate) fn update_recurse<'e, K: Key, V: Value, A: ArtAllocator<V>, F>(
+    key: &[u8],
+    value_fn: F,
+    node: NodeRef<'e, V>,
+    rparent: Option<(ReadLockedNodeRef<V>, u8)>,
+    rgrandparent: Option<(ReadLockedNodeRef<V>, u8)>,
+    guard: &'_ mut TreeWriteGuard<'e, K, V, A>,
+    level: usize,
+    orig_key: &[u8],
+) -> Result<(), ArtError>
+where
+    F: FnOnce(Option<&V>) -> UpdateAction<V>,
+{
+    let rnode = node.read_lock_or_restart()?;
+
+    let prefix_match_len = rnode.prefix_matches(key);
+    if prefix_match_len.is_none() {
+        let (rparent, parent_key) = rparent.expect("direct children of the root have no prefix");
+        let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
+        let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
+
+        match value_fn(None) {
+            UpdateAction::Nothing => {}
+            UpdateAction::Insert(new_value) => {
+                insert_split_prefix(key, new_value, &mut wnode, &mut wparent, parent_key, guard)?;
+            }
+            UpdateAction::Remove => {
+                panic!("unexpected Remove action on insertion");
+            }
+        }
+        wnode.write_unlock();
+        wparent.write_unlock();
+        return Ok(());
+    }
+    let prefix_match_len = prefix_match_len.unwrap();
+    let key = &key[prefix_match_len..];
+    let level = level + prefix_match_len;
+
+    if rnode.is_leaf() {
+        assert_eq!(key.len(), 0);
+        let (rparent, parent_key) = rparent.expect("root cannot be leaf");
+        let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
+        let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
+
+        // safety: Now that we have acquired the write lock, we have exclusive access to the
+        // value. XXX: There might be concurrent reads though?
+        let value_mut = wnode.get_leaf_value_mut();
+
+        match value_fn(Some(value_mut)) {
+            UpdateAction::Nothing => {
+                wparent.write_unlock();
+                wnode.write_unlock();
+            }
+            UpdateAction::Insert(_) => panic!("cannot insert over existing value"),
+            UpdateAction::Remove => {
+                guard.remember_obsolete_node(wnode.as_ptr());
+                wparent.delete_child(parent_key);
+                wnode.write_unlock_obsolete();
+
+                if let Some(rgrandparent) = rgrandparent {
+                    // FIXME: Ignore concurrency error. It doesn't lead to
+                    // corruption, but it means we might leak something. Until
+                    // another update cleans it up.
+                    let _ = cleanup_parent(wparent, rgrandparent, guard);
+                }
+            }
+        }
+
+        return Ok(());
+    }
+
+    let next_node = rnode.find_child_or_restart(key[0])?;
+
+    if next_node.is_none() {
+        if rnode.is_full() {
+            let (rparent, parent_key) = rparent.expect("root node cannot become full");
+            let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
+            let wnode = rnode.upgrade_to_write_lock_or_restart()?;
+
+            match value_fn(None) {
+                UpdateAction::Nothing => {
+                    wnode.write_unlock();
+                    wparent.write_unlock();
+                }
+                UpdateAction::Insert(new_value) => {
+                    insert_and_grow(key, new_value, wnode, &mut wparent, parent_key, guard)?;
+                    wparent.write_unlock();
+                }
+                UpdateAction::Remove => {
+                    panic!("unexpected Remove action on insertion");
+                }
+            };
+        } else {
+            let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
+            if let Some((rparent, _)) = rparent {
+                rparent.read_unlock_or_restart()?;
+            }
+            match value_fn(None) {
+                UpdateAction::Nothing => {}
+                UpdateAction::Insert(new_value) => {
+                    insert_to_node(&mut wnode, key, new_value, guard)?;
+                }
+                UpdateAction::Remove => {
+                    panic!("unexpected Remove action on insertion");
+                }
+            };
+            wnode.write_unlock();
+        }
+        Ok(())
+    } else {
+        let next_child = next_node.unwrap(); // checked above it's not None
+        if let Some((ref rparent, _)) = rparent {
+            rparent.check_or_restart()?;
+        }
+
+        // recurse to next level
+        update_recurse(
+            &key[1..],
+            value_fn,
+            next_child,
+            Some((rnode, key[0])),
+            rparent,
+            guard,
+            level + 1,
+            orig_key,
+        )
+    }
+}
+
+#[derive(Clone)]
+enum PathElement {
+    Prefix(Vec<u8>),
+    KeyByte(u8),
+}
+
+impl std::fmt::Debug for PathElement {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        match self {
+            PathElement::Prefix(prefix) => write!(fmt, "{prefix:?}"),
+            PathElement::KeyByte(key_byte) => write!(fmt, "{key_byte}"),
+        }
+    }
+}
+
+pub(crate) fn dump_tree<V: Value + std::fmt::Debug>(
+    root: RootPtr<V>,
+    epoch_pin: &'_ EpochPin,
+    dst: &mut dyn std::io::Write,
+) {
+    let root_ref = NodeRef::from_root_ptr(root);
+
+    let _ = dump_recurse(&[], root_ref, epoch_pin, 0, dst);
+}
+
+// TODO: return an Err if writeln!() returns error, instead of unwrapping
+#[allow(clippy::only_used_in_recursion)]
+fn dump_recurse<'e, V: Value + std::fmt::Debug>(
+    path: &[PathElement],
+    node: NodeRef<'e, V>,
+    epoch_pin: &'e EpochPin,
+    level: usize,
+    dst: &mut dyn std::io::Write,
+) -> Result<(), ConcurrentUpdateError> {
+    let indent = str::repeat(" ", level);
+
+    let rnode = node.read_lock_or_restart()?;
+    let mut path = Vec::from(path);
+    let prefix = rnode.get_prefix();
+    if !prefix.is_empty() {
+        path.push(PathElement::Prefix(Vec::from(prefix)));
+    }
+
+    if rnode.is_leaf() {
+        let vptr = rnode.get_leaf_value_ptr()?;
+        // safety: It's OK to return a ref of the pointer because we checked the version
+        // and the lifetime of 'epoch_pin' enforces that the reference is only accessible
+        // as long as the epoch is pinned.
+        let val = unsafe { vptr.as_ref().unwrap() };
+        writeln!(dst, "{indent} {path:?}: {val:?}").unwrap();
+        return Ok(());
+    }
+
+    for key_byte in 0..=u8::MAX {
+        match rnode.find_child_or_restart(key_byte)? {
+            None => continue,
+            Some(child_ref) => {
+                let rchild = child_ref.read_lock_or_restart()?;
+                writeln!(
+                    dst,
+                    "{} {:?}, {}: prefix {:?}",
+                    indent,
+                    &path,
+                    key_byte,
+                    rchild.get_prefix()
+                )
+                .unwrap();
+
+                let mut child_path = path.clone();
+                child_path.push(PathElement::KeyByte(key_byte));
+
+                dump_recurse(&child_path, child_ref, epoch_pin, level + 1, dst)?;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+///```text
+///        [fooba]r -> value
+///
+/// [foo]b -> [a]r  -> value
+///      e -> [ls]e -> value
+///```
+fn insert_split_prefix<K: Key, V: Value, A: ArtAllocator<V>>(
+    key: &[u8],
+    value: V,
+    node: &mut WriteLockedNodeRef<V>,
+    parent: &mut WriteLockedNodeRef<V>,
+    parent_key: u8,
+    guard: &'_ TreeWriteGuard<K, V, A>,
+) -> Result<(), OutOfMemoryError> {
+    let old_node = node;
+    let old_prefix = old_node.get_prefix();
+    let common_prefix_len = common_prefix(key, old_prefix);
+
+    // Allocate a node for the new value.
+    let new_value_node = allocate_node_for_value(
+        &key[common_prefix_len + 1..],
+        value,
+        guard.tree_writer.allocator,
+    )?;
+
+    // Allocate a new internal node with the common prefix
+    // FIXME: deallocate 'new_value_node' on OOM
+    let mut prefix_node =
+        node_ref::new_internal(&key[..common_prefix_len], guard.tree_writer.allocator)?;
+
+    // Add the old node and the new nodes to the new internal node
+    prefix_node.insert_old_child(old_prefix[common_prefix_len], old_node);
+    prefix_node.insert_new_child(key[common_prefix_len], new_value_node);
+
+    // Modify the prefix of the old child in place
+    old_node.truncate_prefix(old_prefix.len() - common_prefix_len - 1);
+
+    // replace the pointer in the parent
+    parent.replace_child(parent_key, prefix_node.into_ptr());
+
+    Ok(())
+}
+
+fn insert_to_node<K: Key, V: Value, A: ArtAllocator<V>>(
+    wnode: &mut WriteLockedNodeRef<V>,
+    key: &[u8],
+    value: V,
+    guard: &'_ TreeWriteGuard<K, V, A>,
+) -> Result<(), OutOfMemoryError> {
+    let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
+    wnode.insert_child(key[0], value_child.into_ptr());
+    Ok(())
+}
+
+// On entry: 'parent' and 'node' are locked
+fn insert_and_grow<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
+    key: &[u8],
+    value: V,
+    wnode: WriteLockedNodeRef<V>,
+    parent: &mut WriteLockedNodeRef<V>,
+    parent_key_byte: u8,
+    guard: &'g mut TreeWriteGuard<'e, K, V, A>,
+) -> Result<(), ArtError> {
+    let mut bigger_node = wnode.grow(guard.tree_writer.allocator)?;
+
+    // FIXME: deallocate 'bigger_node' on OOM
+    let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
+    bigger_node.insert_new_child(key[0], value_child);
+
+    // Replace the pointer in the parent
+    parent.replace_child(parent_key_byte, bigger_node.into_ptr());
+
+    guard.remember_obsolete_node(wnode.as_ptr());
+    wnode.write_unlock_obsolete();
+
+    Ok(())
+}
+
+fn cleanup_parent<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
+    wparent: WriteLockedNodeRef<V>,
+    rgrandparent: (ReadLockedNodeRef<V>, u8),
+    guard: &'g mut TreeWriteGuard<'e, K, V, A>,
+) -> Result<(), ArtError> {
+    let (rgrandparent, grandparent_key_byte) = rgrandparent;
+
+    // If the parent becomes completely empty after the deletion, remove the parent from the
+    // grandparent. (This case is possible because we reserve only 8 bytes for the prefix.)
+    // TODO: not implemented.
+
+    // If the parent has only one child, replace the parent with the remaining child. (This is not
+    // possible if the child's prefix field cannot absorb the parent's)
+    if wparent.num_children() == 1 {
+        // Try to lock the remaining child. This can fail if the child is updated
+        // concurrently.
+        let (key_byte, remaining_child) = wparent.find_remaining_child();
+
+        let mut wremaining_child = remaining_child.write_lock_or_restart()?;
+
+        if 1 + wremaining_child.get_prefix().len() + wparent.get_prefix().len() <= MAX_PREFIX_LEN {
+            let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
+
+            // Ok, we have locked the leaf, the parent, the grandparent, and the parent's only
+            // remaining leaf. Proceed with the updates.
+
+            // Update the prefix on the remaining leaf
+            wremaining_child.prepend_prefix(wparent.get_prefix(), key_byte);
+
+            // Replace the pointer in the grandparent to point directly to the remaining leaf
+            wgrandparent.replace_child(grandparent_key_byte, wremaining_child.as_ptr());
+
+            // Mark the parent as deleted.
+            guard.remember_obsolete_node(wparent.as_ptr());
+            wparent.write_unlock_obsolete();
+            return Ok(());
+        }
+    }
+
+    // If the parent's children would fit on a smaller node type after the deletion, replace it with
+    // a smaller node.
+    if wparent.can_shrink() {
+        let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
+        let smaller_node = wparent.shrink(guard.tree_writer.allocator)?;
+
+        // Replace the pointer in the grandparent
+        wgrandparent.replace_child(grandparent_key_byte, smaller_node.into_ptr());
+
+        guard.remember_obsolete_node(wparent.as_ptr());
+        wparent.write_unlock_obsolete();
+        return Ok(());
+    }
+
+    // nothing to do
+    wparent.write_unlock();
+    Ok(())
+}
+
+// Allocate a new leaf node to hold 'value'. If the key is long, we
+// may need to allocate new internal nodes to hold it too
+fn allocate_node_for_value<'a, V: Value, A: ArtAllocator<V>>(
+    key: &[u8],
+    value: V,
+    allocator: &'a A,
+) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError> {
+    let mut prefix_off = key.len().saturating_sub(MAX_PREFIX_LEN);
+
+    let leaf_node = node_ref::new_leaf(&key[prefix_off..key.len()], value, allocator)?;
+
+    let mut node = leaf_node;
+    while prefix_off > 0 {
+        // Need another internal node
+        let remain_prefix = &key[0..prefix_off];
+
+        prefix_off = remain_prefix.len().saturating_sub(MAX_PREFIX_LEN + 1);
+        let mut internal_node = node_ref::new_internal(
+            &remain_prefix[prefix_off..remain_prefix.len() - 1],
+            allocator,
+        )?;
+        internal_node.insert_new_child(*remain_prefix.last().unwrap(), node);
+        node = internal_node;
+    }
+
+    Ok(node)
+}
+
+fn common_prefix(a: &[u8], b: &[u8]) -> usize {
+    for i in 0..MAX_PREFIX_LEN {
+        if a[i] != b[i] {
+            return i;
+        }
+    }
+    panic!("prefixes are equal");
+}
--- a/libs/neonart/src/algorithm/lock_and_version.rs
+++ b/libs/neonart/src/algorithm/lock_and_version.rs
@@ -0,0 +1,117 @@
+//! Each node in the tree has contains one atomic word that stores three things:
+//!
+//! Bit 0: set if the node is "obsolete". An obsolete node has been removed from the tree,
+//!        but might still be accessed by concurrent readers until the epoch expires.
+//! Bit 1: set if the node is currently write-locked. Used as a spinlock.
+//! Bits 2-63: Version number, incremented every time the node is modified.
+//!
+//! AtomicLockAndVersion represents that.
+
+use std::sync::atomic::{AtomicU64, Ordering};
+
+pub(crate) struct ConcurrentUpdateError();
+
+pub(crate) struct AtomicLockAndVersion {
+    inner: AtomicU64,
+}
+
+impl AtomicLockAndVersion {
+    pub(crate) fn new() -> AtomicLockAndVersion {
+        AtomicLockAndVersion {
+            inner: AtomicU64::new(0),
+        }
+    }
+}
+
+impl AtomicLockAndVersion {
+    pub(crate) fn read_lock_or_restart(&self) -> Result<u64, ConcurrentUpdateError> {
+        let version = self.await_node_unlocked();
+        if is_obsolete(version) {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(version)
+    }
+
+    pub(crate) fn check_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
+        self.read_unlock_or_restart(version)
+    }
+
+    pub(crate) fn read_unlock_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
+        if self.inner.load(Ordering::Acquire) != version {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(())
+    }
+
+    pub(crate) fn upgrade_to_write_lock_or_restart(
+        &self,
+        version: u64,
+    ) -> Result<(), ConcurrentUpdateError> {
+        if self
+            .inner
+            .compare_exchange(
+                version,
+                set_locked_bit(version),
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            )
+            .is_err()
+        {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(())
+    }
+
+    pub(crate) fn write_lock_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
+        let old = self.inner.load(Ordering::Relaxed);
+        if is_obsolete(old) || is_locked(old) {
+            return Err(ConcurrentUpdateError());
+        }
+        if self
+            .inner
+            .compare_exchange(
+                old,
+                set_locked_bit(old),
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            )
+            .is_err()
+        {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(())
+    }
+
+    pub(crate) fn write_unlock(&self) {
+        // reset locked bit and overflow into version
+        self.inner.fetch_add(2, Ordering::Release);
+    }
+
+    pub(crate) fn write_unlock_obsolete(&self) {
+        // set obsolete, reset locked, overflow into version
+        self.inner.fetch_add(3, Ordering::Release);
+    }
+
+    // Helper functions
+    fn await_node_unlocked(&self) -> u64 {
+        let mut version = self.inner.load(Ordering::Acquire);
+        while is_locked(version) {
+            // spinlock
+            std::thread::yield_now();
+            version = self.inner.load(Ordering::Acquire)
+        }
+        version
+    }
+}
+
+fn set_locked_bit(version: u64) -> u64 {
+    version + 2
+}
+
+fn is_obsolete(version: u64) -> bool {
+    (version & 1) == 1
+}
+
+fn is_locked(version: u64) -> bool {
+    (version & 2) == 2
+}
--- a/libs/neonart/src/algorithm/node_ptr.rs
+++ b/libs/neonart/src/algorithm/node_ptr.rs
--- a/libs/neonart/src/algorithm/node_ref.rs
+++ b/libs/neonart/src/algorithm/node_ref.rs
@@ -0,0 +1,349 @@
+use std::fmt::Debug;
+use std::marker::PhantomData;
+
+use super::node_ptr;
+use super::node_ptr::NodePtr;
+use crate::EpochPin;
+use crate::Value;
+use crate::algorithm::lock_and_version::AtomicLockAndVersion;
+use crate::algorithm::lock_and_version::ConcurrentUpdateError;
+use crate::allocator::ArtAllocator;
+use crate::allocator::OutOfMemoryError;
+
+pub struct NodeRef<'e, V> {
+    ptr: NodePtr<V>,
+
+    phantom: PhantomData<&'e EpochPin<'e>>,
+}
+
+impl<'e, V> Debug for NodeRef<'e, V> {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        write!(fmt, "{:?}", self.ptr)
+    }
+}
+
+impl<'e, V: Value> NodeRef<'e, V> {
+    pub(crate) fn from_root_ptr(root_ptr: NodePtr<V>) -> NodeRef<'e, V> {
+        NodeRef {
+            ptr: root_ptr,
+            phantom: PhantomData,
+        }
+    }
+
+    pub(crate) fn read_lock_or_restart(
+        &self,
+    ) -> Result<ReadLockedNodeRef<'e, V>, ConcurrentUpdateError> {
+        let version = self.lockword().read_lock_or_restart()?;
+        Ok(ReadLockedNodeRef {
+            ptr: self.ptr,
+            version,
+            phantom: self.phantom,
+        })
+    }
+
+    pub(crate) fn write_lock_or_restart(
+        &self,
+    ) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
+        self.lockword().write_lock_or_restart()?;
+        Ok(WriteLockedNodeRef {
+            ptr: self.ptr,
+            phantom: self.phantom,
+        })
+    }
+
+    fn lockword(&self) -> &AtomicLockAndVersion {
+        self.ptr.lockword()
+    }
+}
+
+/// A reference to a node that has been optimistically read-locked. The functions re-check
+/// the version after each read.
+pub struct ReadLockedNodeRef<'e, V> {
+    ptr: NodePtr<V>,
+    version: u64,
+
+    phantom: PhantomData<&'e EpochPin<'e>>,
+}
+
+impl<'e, V: Value> ReadLockedNodeRef<'e, V> {
+    pub(crate) fn is_leaf(&self) -> bool {
+        self.ptr.is_leaf()
+    }
+
+    pub(crate) fn is_full(&self) -> bool {
+        self.ptr.is_full()
+    }
+
+    pub(crate) fn get_prefix(&self) -> &[u8] {
+        self.ptr.get_prefix()
+    }
+
+    /// Note: because we're only holding a read lock, the prefix can change concurrently.
+    /// You must be prepared to restart, if read_unlock() returns error later.
+    ///
+    /// Returns the length of the prefix, or None if it's not a match
+    pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
+        self.ptr.prefix_matches(key)
+    }
+
+    pub(crate) fn find_child_or_restart(
+        &self,
+        key_byte: u8,
+    ) -> Result<Option<NodeRef<'e, V>>, ConcurrentUpdateError> {
+        let child_or_value = self.ptr.find_child(key_byte);
+        self.ptr.lockword().check_or_restart(self.version)?;
+
+        match child_or_value {
+            None => Ok(None),
+            Some(child_ptr) => Ok(Some(NodeRef {
+                ptr: child_ptr,
+                phantom: self.phantom,
+            })),
+        }
+    }
+
+    pub(crate) fn find_next_child_or_restart(
+        &self,
+        min_key_byte: u8,
+    ) -> Result<Option<(u8, NodeRef<'e, V>)>, ConcurrentUpdateError> {
+        let child_or_value = self.ptr.find_next_child(min_key_byte);
+        self.ptr.lockword().check_or_restart(self.version)?;
+
+        match child_or_value {
+            None => Ok(None),
+            Some((k, child_ptr)) => Ok(Some((
+                k,
+                NodeRef {
+                    ptr: child_ptr,
+                    phantom: self.phantom,
+                },
+            ))),
+        }
+    }
+
+    pub(crate) fn get_leaf_value_ptr(&self) -> Result<*const V, ConcurrentUpdateError> {
+        let result = self.ptr.get_leaf_value();
+        self.ptr.lockword().check_or_restart(self.version)?;
+
+        // Extend the lifetime.
+        let result = std::ptr::from_ref(result);
+
+        Ok(result)
+    }
+
+    pub(crate) fn upgrade_to_write_lock_or_restart(
+        self,
+    ) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
+        self.ptr
+            .lockword()
+            .upgrade_to_write_lock_or_restart(self.version)?;
+
+        Ok(WriteLockedNodeRef {
+            ptr: self.ptr,
+            phantom: self.phantom,
+        })
+    }
+
+    pub(crate) fn read_unlock_or_restart(self) -> Result<(), ConcurrentUpdateError> {
+        self.ptr.lockword().check_or_restart(self.version)?;
+        Ok(())
+    }
+
+    pub(crate) fn check_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
+        self.ptr.lockword().check_or_restart(self.version)?;
+        Ok(())
+    }
+}
+
+/// A reference to a node that has been optimistically read-locked. The functions re-check
+/// the version after each read.
+pub struct WriteLockedNodeRef<'e, V> {
+    ptr: NodePtr<V>,
+    phantom: PhantomData<&'e EpochPin<'e>>,
+}
+
+impl<'e, V: Value> WriteLockedNodeRef<'e, V> {
+    pub(crate) fn can_shrink(&self) -> bool {
+        self.ptr.can_shrink()
+    }
+
+    pub(crate) fn num_children(&self) -> usize {
+        self.ptr.num_children()
+    }
+
+    pub(crate) fn write_unlock(mut self) {
+        self.ptr.lockword().write_unlock();
+        self.ptr = NodePtr::null();
+    }
+
+    pub(crate) fn write_unlock_obsolete(mut self) {
+        self.ptr.lockword().write_unlock_obsolete();
+        self.ptr = NodePtr::null();
+    }
+
+    pub(crate) fn get_prefix(&self) -> &[u8] {
+        self.ptr.get_prefix()
+    }
+
+    pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
+        self.ptr.truncate_prefix(new_prefix_len)
+    }
+
+    pub(crate) fn prepend_prefix(&mut self, prefix: &[u8], prefix_byte: u8) {
+        self.ptr.prepend_prefix(prefix, prefix_byte)
+    }
+
+    pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
+        self.ptr.insert_child(key_byte, child)
+    }
+
+    pub(crate) fn get_leaf_value_mut(&mut self) -> &mut V {
+        self.ptr.get_leaf_value_mut()
+    }
+
+    pub(crate) fn grow<'a, A>(
+        &self,
+        allocator: &'a A,
+    ) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+    where
+        A: ArtAllocator<V>,
+    {
+        let new_node = self.ptr.grow(allocator)?;
+        Ok(NewNodeRef {
+            ptr: new_node,
+            allocator,
+            extra_nodes: Vec::new(),
+        })
+    }
+
+    pub(crate) fn shrink<'a, A>(
+        &self,
+        allocator: &'a A,
+    ) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+    where
+        A: ArtAllocator<V>,
+    {
+        let new_node = self.ptr.shrink(allocator)?;
+        Ok(NewNodeRef {
+            ptr: new_node,
+            allocator,
+            extra_nodes: Vec::new(),
+        })
+    }
+
+    pub(crate) fn as_ptr(&self) -> NodePtr<V> {
+        self.ptr
+    }
+
+    pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
+        self.ptr.replace_child(key_byte, replacement);
+    }
+
+    pub(crate) fn delete_child(&mut self, key_byte: u8) {
+        self.ptr.delete_child(key_byte);
+    }
+
+    pub(crate) fn find_remaining_child(&self) -> (u8, NodeRef<'e, V>) {
+        assert_eq!(self.num_children(), 1);
+        let child_or_value = self.ptr.find_next_child(0);
+
+        match child_or_value {
+            None => panic!("could not find only child in node"),
+            Some((k, child_ptr)) => (
+                k,
+                NodeRef {
+                    ptr: child_ptr,
+                    phantom: self.phantom,
+                },
+            ),
+        }
+    }
+}
+
+impl<'e, V> Drop for WriteLockedNodeRef<'e, V> {
+    fn drop(&mut self) {
+        if !self.ptr.is_null() {
+            self.ptr.lockword().write_unlock();
+        }
+    }
+}
+
+pub(crate) struct NewNodeRef<'a, V, A>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    ptr: NodePtr<V>,
+    allocator: &'a A,
+
+    extra_nodes: Vec<NodePtr<V>>,
+}
+
+impl<'a, V, A> NewNodeRef<'a, V, A>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    pub(crate) fn insert_old_child(&mut self, key_byte: u8, child: &WriteLockedNodeRef<V>) {
+        self.ptr.insert_child(key_byte, child.as_ptr())
+    }
+
+    pub(crate) fn into_ptr(mut self) -> NodePtr<V> {
+        let ptr = self.ptr;
+        self.ptr = NodePtr::null();
+        ptr
+    }
+
+    pub(crate) fn insert_new_child(&mut self, key_byte: u8, child: NewNodeRef<'a, V, A>) {
+        let child_ptr = child.into_ptr();
+        self.ptr.insert_child(key_byte, child_ptr);
+        self.extra_nodes.push(child_ptr);
+    }
+}
+
+impl<'a, V, A> Drop for NewNodeRef<'a, V, A>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    /// This drop implementation deallocates the newly allocated node, if into_ptr() was not called.
+    fn drop(&mut self) {
+        if !self.ptr.is_null() {
+            self.ptr.deallocate(self.allocator);
+            for p in self.extra_nodes.iter() {
+                p.deallocate(self.allocator);
+            }
+        }
+    }
+}
+
+pub(crate) fn new_internal<'a, V, A>(
+    prefix: &[u8],
+    allocator: &'a A,
+) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    Ok(NewNodeRef {
+        ptr: node_ptr::new_internal(prefix, allocator)?,
+        allocator,
+        extra_nodes: Vec::new(),
+    })
+}
+
+pub(crate) fn new_leaf<'a, V, A>(
+    prefix: &[u8],
+    value: V,
+    allocator: &'a A,
+) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    Ok(NewNodeRef {
+        ptr: node_ptr::new_leaf(prefix, value, allocator)?,
+        allocator,
+        extra_nodes: Vec::new(),
+    })
+}
--- a/libs/neonart/src/allocator.rs
+++ b/libs/neonart/src/allocator.rs
@@ -0,0 +1,156 @@
+pub mod block;
+mod multislab;
+mod slab;
+pub mod r#static;
+
+use std::alloc::Layout;
+use std::marker::PhantomData;
+use std::mem::MaybeUninit;
+use std::sync::atomic::Ordering;
+
+use crate::allocator::multislab::MultiSlabAllocator;
+use crate::allocator::r#static::alloc_from_slice;
+
+use spin;
+
+use crate::Tree;
+pub use crate::algorithm::node_ptr::{
+    NodeInternal4, NodeInternal16, NodeInternal48, NodeInternal256, NodeLeaf,
+};
+
+#[derive(Debug)]
+pub struct OutOfMemoryError();
+
+pub trait ArtAllocator<V: crate::Value> {
+    fn alloc_tree(&self) -> *mut Tree<V>;
+
+    fn alloc_node_internal4(&self) -> *mut NodeInternal4<V>;
+    fn alloc_node_internal16(&self) -> *mut NodeInternal16<V>;
+    fn alloc_node_internal48(&self) -> *mut NodeInternal48<V>;
+    fn alloc_node_internal256(&self) -> *mut NodeInternal256<V>;
+    fn alloc_node_leaf(&self) -> *mut NodeLeaf<V>;
+
+    fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>);
+    fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>);
+    fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>);
+    fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>);
+    fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>);
+}
+
+pub struct ArtMultiSlabAllocator<'t, V>
+where
+    V: crate::Value,
+{
+    tree_area: spin::Mutex<Option<&'t mut MaybeUninit<Tree<V>>>>,
+
+    pub(crate) inner: MultiSlabAllocator<'t, 5>,
+
+    phantom_val: PhantomData<V>,
+}
+
+impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
+    const LAYOUTS: [Layout; 5] = [
+        Layout::new::<NodeInternal4<V>>(),
+        Layout::new::<NodeInternal16<V>>(),
+        Layout::new::<NodeInternal48<V>>(),
+        Layout::new::<NodeInternal256<V>>(),
+        Layout::new::<NodeLeaf<V>>(),
+    ];
+
+    pub fn new(area: &'t mut [MaybeUninit<u8>]) -> &'t mut ArtMultiSlabAllocator<'t, V> {
+        let (allocator_area, remain) = alloc_from_slice::<ArtMultiSlabAllocator<V>>(area);
+        let (tree_area, remain) = alloc_from_slice::<Tree<V>>(remain);
+
+        allocator_area.write(ArtMultiSlabAllocator {
+            tree_area: spin::Mutex::new(Some(tree_area)),
+            inner: MultiSlabAllocator::new(remain, &Self::LAYOUTS),
+            phantom_val: PhantomData,
+        })
+    }
+}
+
+impl<'t, V: crate::Value> ArtAllocator<V> for ArtMultiSlabAllocator<'t, V> {
+    fn alloc_tree(&self) -> *mut Tree<V> {
+        let mut t = self.tree_area.lock();
+        if let Some(tree_area) = t.take() {
+            return tree_area.as_mut_ptr().cast();
+        }
+        panic!("cannot allocate more than one tree");
+    }
+
+    fn alloc_node_internal4(&self) -> *mut NodeInternal4<V> {
+        self.inner.alloc_slab(0).cast()
+    }
+    fn alloc_node_internal16(&self) -> *mut NodeInternal16<V> {
+        self.inner.alloc_slab(1).cast()
+    }
+    fn alloc_node_internal48(&self) -> *mut NodeInternal48<V> {
+        self.inner.alloc_slab(2).cast()
+    }
+    fn alloc_node_internal256(&self) -> *mut NodeInternal256<V> {
+        self.inner.alloc_slab(3).cast()
+    }
+    fn alloc_node_leaf(&self) -> *mut NodeLeaf<V> {
+        self.inner.alloc_slab(4).cast()
+    }
+
+    fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>) {
+        self.inner.dealloc_slab(0, ptr.cast())
+    }
+
+    fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>) {
+        self.inner.dealloc_slab(1, ptr.cast())
+    }
+    fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>) {
+        self.inner.dealloc_slab(2, ptr.cast())
+    }
+    fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>) {
+        self.inner.dealloc_slab(3, ptr.cast())
+    }
+    fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>) {
+        self.inner.dealloc_slab(4, ptr.cast())
+    }
+}
+
+impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
+    pub(crate) fn get_statistics(&self) -> ArtMultiSlabStats {
+        ArtMultiSlabStats {
+            num_internal4: self.inner.slab_descs[0]
+                .num_allocated
+                .load(Ordering::Relaxed),
+            num_internal16: self.inner.slab_descs[1]
+                .num_allocated
+                .load(Ordering::Relaxed),
+            num_internal48: self.inner.slab_descs[2]
+                .num_allocated
+                .load(Ordering::Relaxed),
+            num_internal256: self.inner.slab_descs[3]
+                .num_allocated
+                .load(Ordering::Relaxed),
+            num_leaf: self.inner.slab_descs[4]
+                .num_allocated
+                .load(Ordering::Relaxed),
+
+            num_blocks_internal4: self.inner.slab_descs[0].num_blocks.load(Ordering::Relaxed),
+            num_blocks_internal16: self.inner.slab_descs[1].num_blocks.load(Ordering::Relaxed),
+            num_blocks_internal48: self.inner.slab_descs[2].num_blocks.load(Ordering::Relaxed),
+            num_blocks_internal256: self.inner.slab_descs[3].num_blocks.load(Ordering::Relaxed),
+            num_blocks_leaf: self.inner.slab_descs[4].num_blocks.load(Ordering::Relaxed),
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct ArtMultiSlabStats {
+    pub num_internal4: u64,
+    pub num_internal16: u64,
+    pub num_internal48: u64,
+    pub num_internal256: u64,
+    pub num_leaf: u64,
+
+    pub num_blocks_internal4: u64,
+    pub num_blocks_internal16: u64,
+    pub num_blocks_internal48: u64,
+    pub num_blocks_internal256: u64,
+    pub num_blocks_leaf: u64,
+}
--- a/libs/neonart/src/allocator/block.rs
+++ b/libs/neonart/src/allocator/block.rs
@@ -0,0 +1,191 @@
+//! Simple allocator of fixed-size blocks
+
+use std::mem::MaybeUninit;
+use std::sync::atomic::{AtomicU64, Ordering};
+
+use spin;
+
+pub const BLOCK_SIZE: usize = 16 * 1024;
+
+const INVALID_BLOCK: u64 = u64::MAX;
+
+pub(crate) struct BlockAllocator<'t> {
+    blocks_ptr: &'t [MaybeUninit<u8>],
+    num_blocks: u64,
+    num_initialized: AtomicU64,
+
+    freelist_head: spin::Mutex<u64>,
+}
+
+struct FreeListBlock {
+    inner: spin::Mutex<FreeListBlockInner>,
+}
+
+struct FreeListBlockInner {
+    next: u64,
+
+    num_free_blocks: u64,
+    free_blocks: [u64; 100], // FIXME: fill the rest of the block
+}
+
+impl<'t> BlockAllocator<'t> {
+    pub(crate) fn new(area: &'t mut [MaybeUninit<u8>]) -> Self {
+        // Use all the space for the blocks
+        let padding = area.as_ptr().align_offset(BLOCK_SIZE);
+        let remain = &mut area[padding..];
+
+        let num_blocks = (remain.len() / BLOCK_SIZE) as u64;
+
+        BlockAllocator {
+            blocks_ptr: remain,
+            num_blocks,
+            num_initialized: AtomicU64::new(0),
+            freelist_head: spin::Mutex::new(INVALID_BLOCK),
+        }
+    }
+
+    /// safety: you must hold a lock on the pointer to this block, otherwise it might get
+    /// reused for another kind of block
+    fn read_freelist_block(&self, blkno: u64) -> &FreeListBlock {
+        let ptr: *const FreeListBlock = self.get_block_ptr(blkno).cast();
+        unsafe { ptr.as_ref().unwrap() }
+    }
+
+    fn get_block_ptr(&self, blkno: u64) -> *mut u8 {
+        assert!(blkno < self.num_blocks);
+        unsafe {
+            self.blocks_ptr
+                .as_ptr()
+                .byte_offset(blkno as isize * BLOCK_SIZE as isize)
+        }
+        .cast_mut()
+        .cast()
+    }
+
+    #[allow(clippy::mut_from_ref)]
+    pub(crate) fn alloc_block(&self) -> &mut [MaybeUninit<u8>] {
+        // FIXME: handle OOM
+        let blkno = self.alloc_block_internal();
+        if blkno == INVALID_BLOCK {
+            panic!("out of memory");
+        }
+
+        let ptr: *mut MaybeUninit<u8> = self.get_block_ptr(blkno).cast();
+        unsafe { std::slice::from_raw_parts_mut(ptr, BLOCK_SIZE) }
+    }
+
+    fn alloc_block_internal(&self) -> u64 {
+        //  check the free list.
+        {
+            let mut freelist_head = self.freelist_head.lock();
+            if *freelist_head != INVALID_BLOCK {
+                let freelist_block = self.read_freelist_block(*freelist_head);
+
+                // acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
+                let mut g = freelist_block.inner.lock();
+
+                if g.num_free_blocks > 0 {
+                    g.num_free_blocks -= 1;
+                    let result = g.free_blocks[g.num_free_blocks as usize];
+                    return result;
+                } else {
+                    // consume the freelist block itself
+                    let result = *freelist_head;
+                    *freelist_head = g.next;
+                    // This freelist block is now unlinked and can be repurposed
+                    drop(g);
+                    return result;
+                }
+            }
+        }
+
+        // If there are some blocks left that we've never used, pick next such block
+        let mut next_uninitialized = self.num_initialized.load(Ordering::Relaxed);
+        while next_uninitialized < self.num_blocks {
+            match self.num_initialized.compare_exchange(
+                next_uninitialized,
+                next_uninitialized + 1,
+                Ordering::Relaxed,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => {
+                    return next_uninitialized;
+                }
+                Err(old) => {
+                    next_uninitialized = old;
+                    continue;
+                }
+            }
+        }
+
+        // out of blocks
+        INVALID_BLOCK
+    }
+
+    // TODO: this is currently unused. The slab allocator never releases blocks
+    #[allow(dead_code)]
+    pub(crate) fn release_block(&self, block_ptr: *mut u8) {
+        let blockno = unsafe { block_ptr.byte_offset_from(self.blocks_ptr) / BLOCK_SIZE as isize };
+        self.release_block_internal(blockno as u64);
+    }
+
+    fn release_block_internal(&self, blockno: u64) {
+        let mut freelist_head = self.freelist_head.lock();
+        if *freelist_head != INVALID_BLOCK {
+            let freelist_block = self.read_freelist_block(*freelist_head);
+
+            // acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
+            let mut g = freelist_block.inner.lock();
+
+            let num_free_blocks = g.num_free_blocks;
+            if num_free_blocks < g.free_blocks.len() as u64 {
+                g.free_blocks[num_free_blocks as usize] = blockno;
+                g.num_free_blocks += 1;
+                return;
+            }
+        }
+
+        // Convert the block into a new freelist block
+        let block_ptr: *mut FreeListBlock = self.get_block_ptr(blockno).cast();
+        let init = FreeListBlock {
+            inner: spin::Mutex::new(FreeListBlockInner {
+                next: *freelist_head,
+                num_free_blocks: 0,
+                free_blocks: [INVALID_BLOCK; 100],
+            }),
+        };
+        unsafe { (*block_ptr) = init };
+        *freelist_head = blockno;
+    }
+
+    // for debugging
+    pub(crate) fn get_statistics(&self) -> BlockAllocatorStats {
+        let mut num_free_blocks = 0;
+
+        let mut _prev_lock = None;
+        let head_lock = self.freelist_head.lock();
+        let mut next_blk = *head_lock;
+        let mut _head_lock = Some(head_lock);
+        while next_blk != INVALID_BLOCK {
+            let freelist_block = self.read_freelist_block(next_blk);
+            let lock = freelist_block.inner.lock();
+            num_free_blocks += lock.num_free_blocks;
+            next_blk = lock.next;
+            _prev_lock = Some(lock); // hold the lock until we've read the next block
+            _head_lock = None;
+        }
+
+        BlockAllocatorStats {
+            num_blocks: self.num_blocks,
+            num_initialized: self.num_initialized.load(Ordering::Relaxed),
+            num_free_blocks,
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct BlockAllocatorStats {
+    pub num_blocks: u64,
+    pub num_initialized: u64,
+    pub num_free_blocks: u64,
+}
--- a/libs/neonart/src/allocator/multislab.rs
+++ b/libs/neonart/src/allocator/multislab.rs
@@ -0,0 +1,33 @@
+use std::alloc::Layout;
+use std::mem::MaybeUninit;
+
+use crate::allocator::block::BlockAllocator;
+use crate::allocator::slab::SlabDesc;
+
+pub struct MultiSlabAllocator<'t, const N: usize> {
+    pub(crate) block_allocator: BlockAllocator<'t>,
+
+    pub(crate) slab_descs: [SlabDesc; N],
+}
+
+impl<'t, const N: usize> MultiSlabAllocator<'t, N> {
+    pub(crate) fn new(
+        area: &'t mut [MaybeUninit<u8>],
+        layouts: &[Layout; N],
+    ) -> MultiSlabAllocator<'t, N> {
+        let block_allocator = BlockAllocator::new(area);
+        MultiSlabAllocator {
+            block_allocator,
+
+            slab_descs: std::array::from_fn(|i| SlabDesc::new(&layouts[i])),
+        }
+    }
+
+    pub(crate) fn alloc_slab(&self, slab_idx: usize) -> *mut u8 {
+        self.slab_descs[slab_idx].alloc_chunk(&self.block_allocator)
+    }
+
+    pub(crate) fn dealloc_slab(&self, slab_idx: usize, ptr: *mut u8) {
+        self.slab_descs[slab_idx].dealloc_chunk(ptr, &self.block_allocator)
+    }
+}
--- a/libs/neonart/src/allocator/slab.rs
+++ b/libs/neonart/src/allocator/slab.rs
@@ -0,0 +1,433 @@
+//! A slab allocator that carves out fixed-size chunks from larger blocks.
+//!
+//!
+
+use std::alloc::Layout;
+use std::mem::MaybeUninit;
+use std::ops::Deref;
+use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
+
+use spin;
+
+use super::alloc_from_slice;
+use super::block::BlockAllocator;
+
+use crate::allocator::block::BLOCK_SIZE;
+
+pub(crate) struct SlabDesc {
+    pub(crate) layout: Layout,
+
+    block_lists: spin::RwLock<BlockLists>,
+
+    pub(crate) num_blocks: AtomicU64,
+    pub(crate) num_allocated: AtomicU64,
+}
+
+// FIXME: Not sure if SlabDesc is really Sync or Send. It probably is when it's empty, but
+// 'block_lists' contains pointers when it's not empty. In the current use as part of the
+// the art tree, SlabDescs are only moved during initialization.
+unsafe impl Sync for SlabDesc {}
+unsafe impl Send for SlabDesc {}
+
+#[derive(Default, Debug)]
+struct BlockLists {
+    full_blocks: BlockList,
+    nonfull_blocks: BlockList,
+}
+
+impl BlockLists {
+    // Unlink a node. It must be in either one of the two lists.
+    unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
+        let list = unsafe {
+            if (*elem).next.is_null() {
+                if self.full_blocks.tail == elem {
+                    Some(&mut self.full_blocks)
+                } else {
+                    Some(&mut self.nonfull_blocks)
+                }
+            } else if (*elem).prev.is_null() {
+                if self.full_blocks.head == elem {
+                    Some(&mut self.full_blocks)
+                } else {
+                    Some(&mut self.nonfull_blocks)
+                }
+            } else {
+                None
+            }
+        };
+        unsafe { unlink_slab_block(list, elem) };
+    }
+}
+
+unsafe fn unlink_slab_block(mut list: Option<&mut BlockList>, elem: *mut SlabBlockHeader) {
+    unsafe {
+        if (*elem).next.is_null() {
+            assert_eq!(list.as_ref().unwrap().tail, elem);
+            list.as_mut().unwrap().tail = (*elem).prev;
+        } else {
+            assert_eq!((*(*elem).next).prev, elem);
+            (*(*elem).next).prev = (*elem).prev;
+        }
+        if (*elem).prev.is_null() {
+            assert_eq!(list.as_ref().unwrap().head, elem);
+            list.as_mut().unwrap().head = (*elem).next;
+        } else {
+            assert_eq!((*(*elem).prev).next, elem);
+            (*(*elem).prev).next = (*elem).next;
+        }
+    }
+}
+
+#[derive(Debug)]
+struct BlockList {
+    head: *mut SlabBlockHeader,
+    tail: *mut SlabBlockHeader,
+}
+
+impl Default for BlockList {
+    fn default() -> Self {
+        BlockList {
+            head: std::ptr::null_mut(),
+            tail: std::ptr::null_mut(),
+        }
+    }
+}
+
+impl BlockList {
+    unsafe fn push_head(&mut self, elem: *mut SlabBlockHeader) {
+        unsafe {
+            if self.is_empty() {
+                self.tail = elem;
+                (*elem).next = std::ptr::null_mut();
+            } else {
+                (*elem).next = self.head;
+                (*self.head).prev = elem;
+            }
+            (*elem).prev = std::ptr::null_mut();
+            self.head = elem;
+        }
+    }
+
+    fn is_empty(&self) -> bool {
+        self.head.is_null()
+    }
+
+    unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
+        unsafe { unlink_slab_block(Some(self), elem) }
+    }
+
+    #[cfg(test)]
+    fn dump(&self) {
+        let mut next = self.head;
+
+        while !next.is_null() {
+            let n = unsafe { next.as_ref() }.unwrap();
+            eprintln!(
+                "  blk {:?} (free {}/{})",
+                next,
+                n.num_free_chunks.load(Ordering::Relaxed),
+                n.num_chunks
+            );
+            next = n.next;
+        }
+    }
+}
+
+impl SlabDesc {
+    pub(crate) fn new(layout: &Layout) -> SlabDesc {
+        SlabDesc {
+            layout: *layout,
+            block_lists: spin::RwLock::new(BlockLists::default()),
+            num_allocated: AtomicU64::new(0),
+            num_blocks: AtomicU64::new(0),
+        }
+    }
+}
+
+#[derive(Debug)]
+struct SlabBlockHeader {
+    free_chunks_head: spin::Mutex<*mut FreeChunk>,
+    num_free_chunks: AtomicU32,
+    num_chunks: u32, // this is really a constant for a given Layout
+
+    // these fields are protected by the lock on the BlockLists
+    prev: *mut SlabBlockHeader,
+    next: *mut SlabBlockHeader,
+}
+
+struct FreeChunk {
+    next: *mut FreeChunk,
+}
+
+enum ReadOrWriteGuard<'a, T> {
+    Read(spin::RwLockReadGuard<'a, T>),
+    Write(spin::RwLockWriteGuard<'a, T>),
+}
+
+impl<'a, T> Deref for ReadOrWriteGuard<'a, T> {
+    type Target = T;
+
+    fn deref(&self) -> &<Self as Deref>::Target {
+        match self {
+            ReadOrWriteGuard::Read(g) => g.deref(),
+            ReadOrWriteGuard::Write(g) => g.deref(),
+        }
+    }
+}
+
+impl SlabDesc {
+    pub fn alloc_chunk(&self, block_allocator: &BlockAllocator) -> *mut u8 {
+        // Are there any free chunks?
+        let mut acquire_write = false;
+        'outer: loop {
+            let mut block_lists_guard = if acquire_write {
+                ReadOrWriteGuard::Write(self.block_lists.write())
+            } else {
+                ReadOrWriteGuard::Read(self.block_lists.read())
+            };
+            'inner: loop {
+                let block_ptr = block_lists_guard.nonfull_blocks.head;
+                if block_ptr.is_null() {
+                    break 'outer;
+                }
+                unsafe {
+                    let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
+                    if !(*free_chunks_head).is_null() {
+                        let result = *free_chunks_head;
+                        (*free_chunks_head) = (*result).next;
+                        let _old = (*block_ptr).num_free_chunks.fetch_sub(1, Ordering::Relaxed);
+
+                        self.num_allocated.fetch_add(1, Ordering::Relaxed);
+                        return result.cast();
+                    }
+                }
+
+                // The block at the head of the list was full. Grab write lock and retry
+                match block_lists_guard {
+                    ReadOrWriteGuard::Read(_) => {
+                        acquire_write = true;
+                        continue 'outer;
+                    }
+                    ReadOrWriteGuard::Write(ref mut g) => {
+                        // move the node to the list of full blocks
+                        unsafe {
+                            g.nonfull_blocks.unlink(block_ptr);
+                            g.full_blocks.push_head(block_ptr);
+                        };
+                        continue 'inner;
+                    }
+                }
+            }
+        }
+
+        // no free chunks. Allocate a new block (and the chunk from that)
+        let (new_block, new_chunk) = self.alloc_block_and_chunk(block_allocator);
+        self.num_blocks.fetch_add(1, Ordering::Relaxed);
+
+        // Add the block to the list in the SlabDesc
+        unsafe {
+            let mut block_lists_guard = self.block_lists.write();
+            block_lists_guard.nonfull_blocks.push_head(new_block);
+        }
+        self.num_allocated.fetch_add(1, Ordering::Relaxed);
+        new_chunk
+    }
+
+    pub fn dealloc_chunk(&self, chunk_ptr: *mut u8, _block_allocator: &BlockAllocator) {
+        // Find the block it belongs to. You can find the block from the address. (And knowing the
+        // layout, you could calculate the chunk number too.)
+        let block_ptr: *mut SlabBlockHeader = {
+            let block_addr = (chunk_ptr.addr() / BLOCK_SIZE) * BLOCK_SIZE;
+            chunk_ptr.with_addr(block_addr).cast()
+        };
+        let chunk_ptr: *mut FreeChunk = chunk_ptr.cast();
+
+        // Mark the chunk as free in 'freechunks' list
+        let num_chunks;
+        let num_free_chunks;
+        unsafe {
+            let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
+            (*chunk_ptr).next = *free_chunks_head;
+            *free_chunks_head = chunk_ptr;
+
+            num_free_chunks = (*block_ptr).num_free_chunks.fetch_add(1, Ordering::Relaxed) + 1;
+            num_chunks = (*block_ptr).num_chunks;
+        }
+
+        if num_free_chunks == 1 {
+            // If the block was full previously, add it to the nonfull blocks list. Note that
+            // we're not holding the lock anymore, so it can immediately become full again.
+            // That's harmless, it will be moved back to the full list again when a call
+            // to alloc_chunk() sees it.
+            let mut block_lists = self.block_lists.write();
+            unsafe {
+                block_lists.unlink(block_ptr);
+                block_lists.nonfull_blocks.push_head(block_ptr);
+            };
+        } else if num_free_chunks == num_chunks {
+            // If the block became completely empty, move it to the free list
+            // TODO
+            // FIXME: we're still holding the spinlock. It's not exactly safe to return it to
+            // the free blocks list, is it? Defer it as garbage to wait out concurrent updates?
+            //block_allocator.release_block()
+        }
+
+        // update stats
+        self.num_allocated.fetch_sub(1, Ordering::Relaxed);
+    }
+
+    fn alloc_block_and_chunk(
+        &self,
+        block_allocator: &BlockAllocator,
+    ) -> (*mut SlabBlockHeader, *mut u8) {
+        // fixme: handle OOM
+        let block_slice: &mut [MaybeUninit<u8>] = block_allocator.alloc_block();
+        let (block_header, remain) = alloc_from_slice::<SlabBlockHeader>(block_slice);
+
+        let padding = remain.as_ptr().align_offset(self.layout.align());
+
+        let num_chunks = (remain.len() - padding) / self.layout.size();
+
+        let first_chunk_ptr: *mut FreeChunk = remain[padding..].as_mut_ptr().cast();
+
+        unsafe {
+            let mut chunk_ptr = first_chunk_ptr;
+            for _ in 0..num_chunks - 1 {
+                let next_chunk_ptr = chunk_ptr.byte_add(self.layout.size());
+                (*chunk_ptr).next = next_chunk_ptr;
+                chunk_ptr = next_chunk_ptr;
+            }
+            (*chunk_ptr).next = std::ptr::null_mut();
+
+            let result_chunk = first_chunk_ptr;
+
+            let block_header = block_header.write(SlabBlockHeader {
+                free_chunks_head: spin::Mutex::new((*first_chunk_ptr).next),
+                prev: std::ptr::null_mut(),
+                next: std::ptr::null_mut(),
+                num_chunks: num_chunks as u32,
+                num_free_chunks: AtomicU32::new(num_chunks as u32 - 1),
+            });
+
+            (block_header, result_chunk.cast())
+        }
+    }
+
+    #[cfg(test)]
+    fn dump(&self) {
+        eprintln!(
+            "slab dump ({} blocks, {} allocated chunks)",
+            self.num_blocks.load(Ordering::Relaxed),
+            self.num_allocated.load(Ordering::Relaxed)
+        );
+        let lists = self.block_lists.read();
+
+        eprintln!("nonfull blocks:");
+        lists.nonfull_blocks.dump();
+        eprintln!("full blocks:");
+        lists.full_blocks.dump();
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use rand::Rng;
+    use rand_distr::Zipf;
+
+    struct TestObject {
+        val: usize,
+        _dummy: [u8; BLOCK_SIZE / 4],
+    }
+
+    struct TestObjectSlab<'a>(SlabDesc, BlockAllocator<'a>);
+    impl<'a> TestObjectSlab<'a> {
+        fn new(block_allocator: BlockAllocator) -> TestObjectSlab {
+            TestObjectSlab(SlabDesc::new(&Layout::new::<TestObject>()), block_allocator)
+        }
+
+        fn alloc(&self, val: usize) -> *mut TestObject {
+            let obj: *mut TestObject = self.0.alloc_chunk(&self.1).cast();
+            unsafe { (*obj).val = val };
+            obj
+        }
+
+        fn dealloc(&self, obj: *mut TestObject) {
+            self.0.dealloc_chunk(obj.cast(), &self.1)
+        }
+    }
+
+    #[test]
+    fn test_slab_alloc() {
+        const MEM_SIZE: usize = 100000000;
+        let mut area = Box::new_uninit_slice(MEM_SIZE);
+        let block_allocator = BlockAllocator::new(&mut area);
+
+        let slab = TestObjectSlab::new(block_allocator);
+
+        let mut all: Vec<*mut TestObject> = Vec::new();
+        for i in 0..11 {
+            all.push(slab.alloc(i));
+        }
+        #[allow(clippy::needless_range_loop)]
+        for i in 0..11 {
+            assert!(unsafe { (*all[i]).val == i });
+        }
+
+        let distribution = Zipf::new(10.0, 1.1).unwrap();
+        let mut rng = rand::rng();
+        for _ in 0..100000 {
+            slab.0.dump();
+            let idx = rng.sample(distribution) as usize;
+            let ptr: *mut TestObject = all[idx];
+            if !ptr.is_null() {
+                assert_eq!(unsafe { (*ptr).val }, idx);
+                slab.dealloc(ptr);
+                all[idx] = std::ptr::null_mut();
+            } else {
+                all[idx] = slab.alloc(idx);
+            }
+        }
+    }
+
+    fn new_test_blk(i: u32) -> *mut SlabBlockHeader {
+        Box::into_raw(Box::new(SlabBlockHeader {
+            free_chunks_head: spin::Mutex::new(std::ptr::null_mut()),
+            num_free_chunks: AtomicU32::new(0),
+            num_chunks: i,
+            prev: std::ptr::null_mut(),
+            next: std::ptr::null_mut(),
+        }))
+    }
+
+    #[test]
+    fn test_block_linked_list() {
+        // note: these are leaked, but that's OK for tests
+        let a = new_test_blk(0);
+        let b = new_test_blk(1);
+
+        let mut list = BlockList::default();
+        assert!(list.is_empty());
+
+        unsafe {
+            list.push_head(a);
+            assert!(!list.is_empty());
+            list.unlink(a);
+        }
+        assert!(list.is_empty());
+
+        unsafe {
+            list.push_head(b);
+            list.push_head(a);
+            assert_eq!(list.head, a);
+            assert_eq!((*a).next, b);
+            assert_eq!((*b).prev, a);
+            assert_eq!(list.tail, b);
+
+            list.unlink(a);
+            list.unlink(b);
+            assert!(list.is_empty());
+        }
+    }
+}
--- a/libs/neonart/src/allocator/static.rs
+++ b/libs/neonart/src/allocator/static.rs
@@ -0,0 +1,44 @@
+use std::mem::MaybeUninit;
+
+pub fn alloc_from_slice<T>(
+    area: &mut [MaybeUninit<u8>],
+) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
+    let layout = std::alloc::Layout::new::<T>();
+
+    let area_start = area.as_mut_ptr();
+
+    // pad to satisfy alignment requirements
+    let padding = area_start.align_offset(layout.align());
+    if padding + layout.size() > area.len() {
+        panic!("out of memory");
+    }
+    let area = &mut area[padding..];
+    let (result_area, remain) = area.split_at_mut(layout.size());
+
+    let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
+    let result = unsafe { result_ptr.as_mut().unwrap() };
+
+    (result, remain)
+}
+
+pub fn alloc_array_from_slice<T>(
+    area: &mut [MaybeUninit<u8>],
+    len: usize,
+) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
+    let layout = std::alloc::Layout::new::<T>();
+
+    let area_start = area.as_mut_ptr();
+
+    // pad to satisfy alignment requirements
+    let padding = area_start.align_offset(layout.align());
+    if padding + layout.size() * len > area.len() {
+        panic!("out of memory");
+    }
+    let area = &mut area[padding..];
+    let (result_area, remain) = area.split_at_mut(layout.size() * len);
+
+    let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
+    let result = unsafe { std::slice::from_raw_parts_mut(result_ptr.as_mut().unwrap(), len) };
+
+    (result, remain)
+}
--- a/libs/neonart/src/epoch.rs
+++ b/libs/neonart/src/epoch.rs
@@ -0,0 +1,142 @@
+//! This is similar to crossbeam_epoch crate, but works in shared memory
+
+use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
+
+use crossbeam_utils::CachePadded;
+
+const NUM_SLOTS: usize = 1000;
+
+/// This is the struct that is stored in shmem
+///
+/// bit 0: is it pinned or not?
+/// rest of the bits are the epoch counter.
+pub struct EpochShared {
+    global_epoch: AtomicU64,
+    participants: [CachePadded<AtomicU64>; NUM_SLOTS],
+
+    broadcast_lock: spin::Mutex<()>,
+}
+
+impl EpochShared {
+    pub fn new() -> EpochShared {
+        EpochShared {
+            global_epoch: AtomicU64::new(2),
+            participants: [const { CachePadded::new(AtomicU64::new(2)) }; NUM_SLOTS],
+            broadcast_lock: spin::Mutex::new(()),
+        }
+    }
+
+    pub fn register(&self) -> LocalHandle {
+        LocalHandle {
+            global: self,
+            last_slot: AtomicUsize::new(0), // todo: choose more intelligently
+        }
+    }
+
+    fn release_pin(&self, slot: usize, _epoch: u64) {
+        let global_epoch = self.global_epoch.load(Ordering::Relaxed);
+        self.participants[slot].store(global_epoch, Ordering::Relaxed);
+    }
+
+    fn pin_internal(&self, slot_hint: usize) -> (usize, u64) {
+        // pick a slot
+        let mut slot = slot_hint;
+        let epoch = loop {
+            let old = self.participants[slot].fetch_or(1, Ordering::Relaxed);
+            if old & 1 == 0 {
+                // Got this slot
+                break old;
+            }
+
+            // the slot was busy by another thread / process. try a different slot
+            slot += 1;
+            if slot == NUM_SLOTS {
+                slot = 0;
+            }
+            continue;
+        };
+        (slot, epoch)
+    }
+
+    pub(crate) fn advance(&self) -> u64 {
+        // Advance the global epoch
+        let old_epoch = self.global_epoch.fetch_add(2, Ordering::Relaxed);
+        // Anyone that release their pin after this will update their slot.
+        old_epoch + 2
+    }
+
+    pub(crate) fn broadcast(&self) {
+        let Some(_guard) = self.broadcast_lock.try_lock() else {
+            return;
+        };
+
+        let epoch = self.global_epoch.load(Ordering::Relaxed);
+        let old_epoch = epoch.wrapping_sub(2);
+
+        // Update all free slots.
+        for i in 0..NUM_SLOTS {
+            // TODO: check result, as a sanity check. It should either be the old epoch, or pinned
+            let _ = self.participants[i].compare_exchange(
+                old_epoch,
+                epoch,
+                Ordering::Relaxed,
+                Ordering::Relaxed,
+            );
+        }
+
+        // FIXME: memory fence here, since we used Relaxed?
+    }
+
+    pub(crate) fn get_oldest(&self) -> u64 {
+        // Read all slots.
+        let now = self.global_epoch.load(Ordering::Relaxed);
+        let mut oldest = now;
+        for i in 0..NUM_SLOTS {
+            let this_epoch = self.participants[i].load(Ordering::Relaxed);
+            let delta = now.wrapping_sub(this_epoch);
+            if delta > u64::MAX / 2 {
+                // this is very recent
+            } else if delta > now.wrapping_sub(oldest) {
+                oldest = this_epoch;
+            }
+        }
+        oldest
+    }
+
+    pub(crate) fn get_current(&self) -> u64 {
+        self.global_epoch.load(Ordering::Relaxed)
+    }
+}
+
+pub(crate) struct EpochPin<'e> {
+    slot: usize,
+    pub(crate) epoch: u64,
+
+    handle: &'e LocalHandle<'e>,
+}
+
+impl<'e> Drop for EpochPin<'e> {
+    fn drop(&mut self) {
+        self.handle.global.release_pin(self.slot, self.epoch);
+    }
+}
+
+pub struct LocalHandle<'g> {
+    global: &'g EpochShared,
+
+    last_slot: AtomicUsize,
+}
+
+impl<'g> LocalHandle<'g> {
+    pub fn pin(&self) -> EpochPin {
+        let (slot, epoch) = self
+            .global
+            .pin_internal(self.last_slot.load(Ordering::Relaxed));
+        self.last_slot.store(slot, Ordering::Relaxed);
+        EpochPin {
+            handle: self,
+            epoch,
+            slot,
+        }
+    }
+}
--- a/libs/neonart/src/lib.rs
+++ b/libs/neonart/src/lib.rs
@@ -0,0 +1,583 @@
+//! Adaptive Radix Tree (ART) implementation, with Optimistic Lock Coupling.
+//!
+//! The data structure is described in these two papers:
+//!
+//! [1] Leis, V. & Kemper, Alfons & Neumann, Thomas. (2013).
+//!     The adaptive radix tree: ARTful indexing for main-memory databases.
+//!     Proceedings - International Conference on Data Engineering. 38-49. 10.1109/ICDE.2013.6544812.
+//!     https://db.in.tum.de/~leis/papers/ART.pdf
+//!
+//! [2] Leis, Viktor & Scheibner, Florian & Kemper, Alfons & Neumann, Thomas. (2016).
+//!     The ART of practical synchronization.
+//!     1-8. 10.1145/2933349.2933352.
+//!     https://db.in.tum.de/~leis/papers/artsync.pdf
+//!
+//! [1] describes the base data structure, and [2] describes the Optimistic Lock Coupling that we
+//! use.
+//!
+//! The papers mention a few different variants. We have made the following choices in this
+//! implementation:
+//!
+//! - All keys have the same length
+//!
+//! - Single-value leaves.
+//!
+//! - For collapsing inner nodes, we use the Pessimistic approach, where each inner node stores a
+//!   variable length "prefix", which stores the keys of all the one-way nodes which have been
+//!   removed. However, similar to the "hybrid" approach described in the paper, each node only has
+//!   space for a constant-size prefix of 8 bytes. If a node would have a longer prefix, then we
+//!   create create one-way nodes to store them. (There was no particular reason for this choice,
+//!   the "hybrid" approach described in the paper might be better.)
+//!
+//! - For concurrency, we use Optimistic Lock Coupling. The paper [2] also describes another method,
+//!   ROWEX, which generally performs better when there is contention, but that is not important
+//!   for use and Optimisic Lock Coupling is simpler to implement.
+//!
+//! ## Requirements
+//!
+//! This data structure is currently used for the integrated LFC, relsize and last-written LSN cache
+//! in the compute communicator, part of the 'neon' Postgres extension. We have some unique
+//! requirements, which is why we had to write our own. Namely:
+//!
+//! - The data structure has to live in fixed-sized shared memory segment. That rules out any
+//!   built-in Rust collections and most crates. (Except possibly with the 'allocator_api' rust
+//!   feature, which still nightly-only experimental as of this writing).
+//!
+//! - The data structure is accessed from multiple processes. Only one process updates the data
+//!   structure, but other processes perform reads. That rules out using built-in Rust locking
+//!   primitives like Mutex and RwLock, and most crates too.
+//!
+//! - Within the one process with write-access, multiple threads can perform updates concurrently.
+//!   That rules out using PostgreSQL LWLocks for the locking.
+//!
+//! The implementation is generic, and doesn't depend on any PostgreSQL specifics, but it has been
+//! written with that usage and the above constraints in mind. Some noteworthy assumptions:
+//!
+//! - Contention is assumed to be rare. In the integrated cache in PostgreSQL, there's higher level
+//!   locking in the PostgreSQL buffer manager, which ensures that two backends should not try to
+//!   read / write the same page at the same time. (Prefetching can conflict with actual reads,
+//!   however.)
+//!
+//!  - The keys in the integrated cache are 17 bytes long.
+//!
+//! ## Usage
+//!
+//! Because this is designed to be used as a Postgres shared memory data structure, initialization
+//! happens in three stages:
+//!
+//! 0. A fixed area of shared memory is allocated at postmaster startup.
+//!
+//! 1. TreeInitStruct::new() is called to initialize it, still in Postmaster process, before any
+//!    other process or thread is running. It returns a TreeInitStruct, which is inherited by all
+//!    the processes through fork().
+//!
+//! 2. One process may have write-access to the struct, by calling
+//!    [TreeInitStruct::attach_writer]. (That process is the communicator process.)
+//!
+//! 3. Other processes get read-access to the struct, by calling [TreeInitStruct::attach_reader]
+//!
+//! "Write access" means that you can insert / update / delete values in the tree.
+//!
+//! NOTE: The Values stored in the tree are sometimes moved, when a leaf node fills up and a new
+//! larger node needs to be allocated. The versioning and epoch-based allocator ensure that the data
+//! structure stays consistent, but if the Value has interior mutability, like atomic fields,
+//! updates to such fields might be lost if the leaf node is concurrently moved! If that becomes a
+//! problem, the version check could be passed up to the caller, so that the caller could detect the
+//! lost updates and retry the operation.
+//!
+//! ## Implementation
+//!
+//! node_ptr: Provides low-level implementations of the four different node types (eight actually,
+//! since there is an Internal and Leaf variant of each)
+//!
+//! lock_and_version.rs: Provides an abstraction for the combined lock and version counter on each
+//! node.
+//!
+//! node_ref.rs: The code in node_ptr.rs deals with raw pointers. node_ref.rs provides more type-safe
+//!   abstractions on top.
+//!
+//! algorithm.rs: Contains the functions to implement lookups and updates in the tree
+//!
+//! allocator.rs: Provides a facility to allocate memory for the tree nodes. (We must provide our
+//!   own abstraction for that because we need the data structure to live in a pre-allocated shared
+//!   memory segment).
+//!
+//! epoch.rs: The data structure requires that when a node is removed from the tree, it is not
+//!   immediately deallocated, but stays around for as long as concurrent readers might still have
+//!   pointers to them. This is enforced by an epoch system. This is similar to
+//!   e.g. crossbeam_epoch, but we couldn't use that either because it has to work across processes
+//!   communicating over the shared memory segment.
+//!
+//! ## See also
+//!
+//! There are some existing Rust ART implementations out there, but none of them filled all
+//! the requirements:
+//!
+//! - https://github.com/XiangpengHao/congee
+//! - https://github.com/declanvk/blart
+//!
+//! ## TODO
+//!
+//! - Removing values has not been implemented
+
+mod algorithm;
+pub mod allocator;
+mod epoch;
+
+use algorithm::RootPtr;
+use algorithm::node_ptr::NodePtr;
+
+use std::collections::VecDeque;
+use std::fmt::Debug;
+use std::marker::PhantomData;
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicBool, Ordering};
+
+use crate::epoch::EpochPin;
+
+#[cfg(test)]
+mod tests;
+
+use allocator::ArtAllocator;
+pub use allocator::ArtMultiSlabAllocator;
+pub use allocator::OutOfMemoryError;
+
+/// Fixed-length key type.
+///
+pub trait Key: Debug {
+    const KEY_LEN: usize;
+
+    fn as_bytes(&self) -> &[u8];
+}
+
+/// Values stored in the tree
+///
+/// Values need to be Cloneable, because when a node "grows", the value is copied to a new node and
+/// the old sticks around until all readers that might see the old value are gone.
+// fixme obsolete, no longer needs Clone
+pub trait Value {}
+
+const MAX_GARBAGE: usize = 1024;
+
+/// The root of the tree, plus other tree-wide data. This is stored in the shared memory.
+pub struct Tree<V: Value> {
+    /// For simplicity, so that we never need to grow or shrink the root, the root node is always an
+    /// Internal256 node. Also, it never has a prefix (that's actually a bit wasteful, incurring one
+    /// indirection to every lookup)
+    root: RootPtr<V>,
+
+    writer_attached: AtomicBool,
+
+    epoch: epoch::EpochShared,
+}
+
+unsafe impl<V: Value + Sync> Sync for Tree<V> {}
+unsafe impl<V: Value + Send> Send for Tree<V> {}
+
+struct GarbageQueue<V>(VecDeque<(NodePtr<V>, u64)>);
+
+unsafe impl<V: Value + Sync> Sync for GarbageQueue<V> {}
+unsafe impl<V: Value + Send> Send for GarbageQueue<V> {}
+
+impl<V> GarbageQueue<V> {
+    fn new() -> GarbageQueue<V> {
+        GarbageQueue(VecDeque::with_capacity(MAX_GARBAGE))
+    }
+
+    fn remember_obsolete_node(&mut self, ptr: NodePtr<V>, epoch: u64) {
+        self.0.push_front((ptr, epoch));
+    }
+
+    fn next_obsolete(&mut self, cutoff_epoch: u64) -> Option<NodePtr<V>> {
+        if let Some(back) = self.0.back() {
+            if back.1 < cutoff_epoch {
+                return Some(self.0.pop_back().unwrap().0);
+            }
+        }
+        None
+    }
+}
+
+/// Struct created at postmaster startup
+pub struct TreeInitStruct<'t, K: Key, V: Value, A: ArtAllocator<V>> {
+    tree: &'t Tree<V>,
+
+    allocator: &'t A,
+
+    phantom_key: PhantomData<K>,
+}
+
+/// The worker process has a reference to this. The write operations are only safe
+/// from the worker process
+pub struct TreeWriteAccess<'t, K: Key, V: Value, A: ArtAllocator<V>>
+where
+    K: Key,
+    V: Value,
+{
+    tree: &'t Tree<V>,
+
+    pub allocator: &'t A,
+
+    epoch_handle: epoch::LocalHandle<'t>,
+
+    phantom_key: PhantomData<K>,
+
+    /// Obsolete nodes that cannot be recycled until their epoch expires.
+    garbage: spin::Mutex<GarbageQueue<V>>,
+}
+
+/// The backends have a reference to this. It cannot be used to modify the tree
+pub struct TreeReadAccess<'t, K: Key, V: Value>
+where
+    K: Key,
+    V: Value,
+{
+    tree: &'t Tree<V>,
+
+    epoch_handle: epoch::LocalHandle<'t>,
+
+    phantom_key: PhantomData<K>,
+}
+
+impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeInitStruct<'t, K, V, A> {
+    pub fn new(allocator: &'t A) -> TreeInitStruct<'t, K, V, A> {
+        let tree_ptr = allocator.alloc_tree();
+        let tree_ptr = NonNull::new(tree_ptr).expect("out of memory");
+        let init = Tree {
+            root: algorithm::new_root(allocator).expect("out of memory"),
+            writer_attached: AtomicBool::new(false),
+            epoch: epoch::EpochShared::new(),
+        };
+        unsafe { tree_ptr.write(init) };
+
+        TreeInitStruct {
+            tree: unsafe { tree_ptr.as_ref() },
+            allocator,
+            phantom_key: PhantomData,
+        }
+    }
+
+    pub fn attach_writer(self) -> TreeWriteAccess<'t, K, V, A> {
+        let previously_attached = self.tree.writer_attached.swap(true, Ordering::Relaxed);
+        if previously_attached {
+            panic!("writer already attached");
+        }
+        TreeWriteAccess {
+            tree: self.tree,
+            allocator: self.allocator,
+            phantom_key: PhantomData,
+            epoch_handle: self.tree.epoch.register(),
+            garbage: spin::Mutex::new(GarbageQueue::new()),
+        }
+    }
+
+    pub fn attach_reader(self) -> TreeReadAccess<'t, K, V> {
+        TreeReadAccess {
+            tree: self.tree,
+            phantom_key: PhantomData,
+            epoch_handle: self.tree.epoch.register(),
+        }
+    }
+}
+
+impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteAccess<'t, K, V, A> {
+    pub fn start_write<'g>(&'t self) -> TreeWriteGuard<'g, K, V, A>
+    where
+        't: 'g,
+    {
+        TreeWriteGuard {
+            tree_writer: self,
+            epoch_pin: self.epoch_handle.pin(),
+            phantom_key: PhantomData,
+            created_garbage: false,
+        }
+    }
+
+    pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
+        TreeReadGuard {
+            tree: self.tree,
+            epoch_pin: self.epoch_handle.pin(),
+            phantom_key: PhantomData,
+        }
+    }
+}
+
+impl<'t, K: Key, V: Value> TreeReadAccess<'t, K, V> {
+    pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
+        TreeReadGuard {
+            tree: self.tree,
+            epoch_pin: self.epoch_handle.pin(),
+            phantom_key: PhantomData,
+        }
+    }
+}
+
+pub struct TreeReadGuard<'e, K, V>
+where
+    K: Key,
+    V: Value,
+{
+    tree: &'e Tree<V>,
+
+    epoch_pin: EpochPin<'e>,
+    phantom_key: PhantomData<K>,
+}
+
+impl<'e, K: Key, V: Value> TreeReadGuard<'e, K, V> {
+    pub fn get(&'e self, key: &K) -> Option<&'e V> {
+        algorithm::search(key, self.tree.root, &self.epoch_pin)
+    }
+}
+
+pub struct TreeWriteGuard<'e, K, V, A>
+where
+    K: Key,
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    tree_writer: &'e TreeWriteAccess<'e, K, V, A>,
+
+    epoch_pin: EpochPin<'e>,
+    phantom_key: PhantomData<K>,
+
+    created_garbage: bool,
+}
+
+pub enum UpdateAction<V> {
+    Nothing,
+    Insert(V),
+    Remove,
+}
+
+impl<'e, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
+    /// Get a value
+    pub fn get(&'e mut self, key: &K) -> Option<&'e V> {
+        algorithm::search(key, self.tree_writer.tree.root, &self.epoch_pin)
+    }
+
+    /// Insert a value
+    pub fn insert(self, key: &K, value: V) -> Result<bool, OutOfMemoryError> {
+        let mut success = None;
+
+        self.update_with_fn(key, |existing| {
+            if existing.is_some() {
+                success = Some(false);
+                UpdateAction::Nothing
+            } else {
+                success = Some(true);
+                UpdateAction::Insert(value)
+            }
+        })?;
+        Ok(success.expect("value_fn not called"))
+    }
+
+    /// Remove value. Returns true if it existed
+    pub fn remove(self, key: &K) -> bool {
+        let mut result = false;
+        // FIXME: It's not clear if OOM is expected while removing. It seems
+        // not nice, but shrinking a node can OOM. Then again, we could opt
+        // to not shrink a node if we cannot allocate, to live a little longer.
+        self.update_with_fn(key, |existing| match existing {
+            Some(_) => {
+                result = true;
+                UpdateAction::Remove
+            }
+            None => UpdateAction::Nothing,
+        })
+        .expect("out of memory while removing");
+        result
+    }
+
+    /// Try to remove value and return the old value.
+    pub fn remove_and_return(self, key: &K) -> Option<V>
+    where
+        V: Clone,
+    {
+        let mut old = None;
+        self.update_with_fn(key, |existing| {
+            old = existing.cloned();
+            UpdateAction::Remove
+        })
+        .expect("out of memory while removing");
+        old
+    }
+
+    /// Update key using the given function. All the other modifying operations are based on this.
+    ///
+    /// The function is passed a reference to the existing value, if any. If the function
+    /// returns None, the value is removed from the tree (or if there was no existing value,
+    /// does nothing). If the function returns Some, the existing value is replaced, of if there
+    /// was no existing value, it is inserted. FIXME: update comment
+    pub fn update_with_fn<F>(mut self, key: &K, value_fn: F) -> Result<(), OutOfMemoryError>
+    where
+        F: FnOnce(Option<&V>) -> UpdateAction<V>,
+    {
+        algorithm::update_fn(key, value_fn, self.tree_writer.tree.root, &mut self)?;
+
+        if self.created_garbage {
+            let _ = self.collect_garbage();
+        }
+        Ok(())
+    }
+
+    fn remember_obsolete_node(&mut self, ptr: NodePtr<V>) {
+        self.tree_writer
+            .garbage
+            .lock()
+            .remember_obsolete_node(ptr, self.epoch_pin.epoch);
+        self.created_garbage = true;
+    }
+
+    // returns number of nodes recycled
+    fn collect_garbage(&self) -> usize {
+        self.tree_writer.tree.epoch.advance();
+        self.tree_writer.tree.epoch.broadcast();
+
+        let cutoff_epoch = self.tree_writer.tree.epoch.get_oldest();
+
+        let mut result = 0;
+        let mut garbage_queue = self.tree_writer.garbage.lock();
+        while let Some(ptr) = garbage_queue.next_obsolete(cutoff_epoch) {
+            ptr.deallocate(self.tree_writer.allocator);
+            result += 1;
+        }
+        result
+    }
+}
+
+pub struct TreeIterator<K>
+where
+    K: Key + for<'a> From<&'a [u8]>,
+{
+    done: bool,
+    pub next_key: Vec<u8>,
+    max_key: Option<Vec<u8>>,
+
+    phantom_key: PhantomData<K>,
+}
+
+impl<K> TreeIterator<K>
+where
+    K: Key + for<'a> From<&'a [u8]>,
+{
+    pub fn new_wrapping() -> TreeIterator<K> {
+        TreeIterator {
+            done: false,
+            next_key: vec![0; K::KEY_LEN],
+            max_key: None,
+            phantom_key: PhantomData,
+        }
+    }
+
+    pub fn new(range: &std::ops::Range<K>) -> TreeIterator<K> {
+        let result = TreeIterator {
+            done: false,
+            next_key: Vec::from(range.start.as_bytes()),
+            max_key: Some(Vec::from(range.end.as_bytes())),
+            phantom_key: PhantomData,
+        };
+        assert_eq!(result.next_key.len(), K::KEY_LEN);
+        assert_eq!(result.max_key.as_ref().unwrap().len(), K::KEY_LEN);
+
+        result
+    }
+
+    pub fn next<'g, V>(&mut self, read_guard: &'g TreeReadGuard<'g, K, V>) -> Option<(K, &'g V)>
+    where
+        V: Value,
+    {
+        if self.done {
+            return None;
+        }
+
+        let mut wrapped_around = false;
+        loop {
+            assert_eq!(self.next_key.len(), K::KEY_LEN);
+            if let Some((k, v)) =
+                algorithm::iter_next(&self.next_key, read_guard.tree.root, &read_guard.epoch_pin)
+            {
+                assert_eq!(k.len(), K::KEY_LEN);
+                assert_eq!(self.next_key.len(), K::KEY_LEN);
+
+                // Check if we reached the end of the range
+                if let Some(max_key) = &self.max_key {
+                    if k.as_slice() >= max_key.as_slice() {
+                        self.done = true;
+                        break None;
+                    }
+                }
+
+                // increment the key
+                self.next_key = k.clone();
+                increment_key(self.next_key.as_mut_slice());
+                let k = k.as_slice().into();
+
+                break Some((k, v));
+            } else {
+                if self.max_key.is_some() {
+                    self.done = true;
+                } else {
+                    // Start from beginning
+                    if !wrapped_around {
+                        for i in 0..K::KEY_LEN {
+                            self.next_key[i] = 0;
+                        }
+                        wrapped_around = true;
+                        continue;
+                    } else {
+                        // The tree is completely empty
+                        // FIXME: perhaps we should remember the starting point instead.
+                        // Currently this will scan some ranges twice.
+                        break None;
+                    }
+                }
+                break None;
+            }
+        }
+    }
+}
+
+fn increment_key(key: &mut [u8]) -> bool {
+    for i in (0..key.len()).rev() {
+        let (byte, overflow) = key[i].overflowing_add(1);
+        key[i] = byte;
+        if !overflow {
+            return false;
+        }
+    }
+    true
+}
+
+// Debugging functions
+impl<'e, K: Key, V: Value + Debug, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
+    pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
+        algorithm::dump_tree(self.tree_writer.tree.root, &self.epoch_pin, dst)
+    }
+}
+impl<'e, K: Key, V: Value + Debug> TreeReadGuard<'e, K, V> {
+    pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
+        algorithm::dump_tree(self.tree.root, &self.epoch_pin, dst)
+    }
+}
+impl<'e, K: Key, V: Value> TreeWriteAccess<'e, K, V, ArtMultiSlabAllocator<'e, V>> {
+    pub fn get_statistics(&self) -> ArtTreeStatistics {
+        self.allocator.get_statistics();
+        ArtTreeStatistics {
+            blocks: self.allocator.inner.block_allocator.get_statistics(),
+            slabs: self.allocator.get_statistics(),
+            epoch: self.tree.epoch.get_current(),
+            oldest_epoch: self.tree.epoch.get_oldest(),
+            num_garbage: self.garbage.lock().0.len() as u64,
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+pub struct ArtTreeStatistics {
+    pub blocks: allocator::block::BlockAllocatorStats,
+    pub slabs: allocator::ArtMultiSlabStats,
+
+    pub epoch: u64,
+    pub oldest_epoch: u64,
+    pub num_garbage: u64,
+}
--- a/libs/neonart/src/tests.rs
+++ b/libs/neonart/src/tests.rs
@@ -0,0 +1,236 @@
+use std::collections::BTreeMap;
+use std::collections::HashSet;
+use std::fmt::{Debug, Formatter};
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use crate::ArtAllocator;
+use crate::ArtMultiSlabAllocator;
+use crate::TreeInitStruct;
+use crate::TreeIterator;
+use crate::TreeWriteAccess;
+use crate::UpdateAction;
+
+use crate::{Key, Value};
+
+use rand::Rng;
+use rand::seq::SliceRandom;
+use rand_distr::Zipf;
+
+const TEST_KEY_LEN: usize = 16;
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+struct TestKey([u8; TEST_KEY_LEN]);
+
+impl TestKey {
+    const MIN: TestKey = TestKey([0; TEST_KEY_LEN]);
+    const MAX: TestKey = TestKey([u8::MAX; TEST_KEY_LEN]);
+}
+
+impl Key for TestKey {
+    const KEY_LEN: usize = TEST_KEY_LEN;
+    fn as_bytes(&self) -> &[u8] {
+        &self.0
+    }
+}
+
+impl From<&TestKey> for u128 {
+    fn from(val: &TestKey) -> u128 {
+        u128::from_be_bytes(val.0)
+    }
+}
+
+impl From<u128> for TestKey {
+    fn from(val: u128) -> TestKey {
+        TestKey(val.to_be_bytes())
+    }
+}
+
+impl<'a> From<&'a [u8]> for TestKey {
+    fn from(bytes: &'a [u8]) -> TestKey {
+        TestKey(bytes.try_into().unwrap())
+    }
+}
+
+impl Value for usize {}
+
+fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
+    const MEM_SIZE: usize = 10000000;
+    let mut area = Box::new_uninit_slice(MEM_SIZE);
+
+    let allocator = ArtMultiSlabAllocator::new(&mut area);
+
+    let init_struct = TreeInitStruct::<TestKey, usize, _>::new(allocator);
+    let tree_writer = init_struct.attach_writer();
+
+    for (idx, k) in keys.iter().enumerate() {
+        let w = tree_writer.start_write();
+        let res = w.insert(&(*k).into(), idx);
+        assert!(res.is_ok());
+    }
+
+    for (idx, k) in keys.iter().enumerate() {
+        let r = tree_writer.start_read();
+        let value = r.get(&(*k).into());
+        assert_eq!(value, Some(idx).as_ref());
+    }
+
+    eprintln!("stats: {:?}", tree_writer.get_statistics());
+}
+
+#[test]
+fn dense() {
+    // This exercises splitting a node with prefix
+    let keys: &[u128] = &[0, 1, 2, 3, 256];
+    test_inserts(keys);
+
+    // Dense keys
+    let mut keys: Vec<u128> = (0..10000).collect();
+    test_inserts(&keys);
+
+    // Do the same in random orders
+    for _ in 1..10 {
+        keys.shuffle(&mut rand::rng());
+        test_inserts(&keys);
+    }
+}
+
+#[test]
+fn sparse() {
+    // sparse keys
+    let mut keys: Vec<TestKey> = Vec::new();
+    let mut used_keys = HashSet::new();
+    for _ in 0..10000 {
+        loop {
+            let key = rand::random::<u128>();
+            if used_keys.contains(&key) {
+                continue;
+            }
+            used_keys.insert(key);
+            keys.push(key.into());
+            break;
+        }
+    }
+    test_inserts(&keys);
+}
+
+struct TestValue(AtomicUsize);
+
+impl TestValue {
+    fn new(val: usize) -> TestValue {
+        TestValue(AtomicUsize::new(val))
+    }
+
+    fn load(&self) -> usize {
+        self.0.load(Ordering::Relaxed)
+    }
+}
+
+impl Value for TestValue {}
+
+impl Clone for TestValue {
+    fn clone(&self) -> TestValue {
+        TestValue::new(self.load())
+    }
+}
+
+impl Debug for TestValue {
+    fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
+        write!(fmt, "{:?}", self.load())
+    }
+}
+
+#[derive(Clone, Debug)]
+struct TestOp(TestKey, Option<usize>);
+
+fn apply_op<A: ArtAllocator<TestValue>>(
+    op: &TestOp,
+    tree: &TreeWriteAccess<TestKey, TestValue, A>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    eprintln!("applying op: {op:?}");
+
+    // apply the change to the shadow tree first
+    let shadow_existing = if let Some(v) = op.1 {
+        shadow.insert(op.0, v)
+    } else {
+        shadow.remove(&op.0)
+    };
+
+    // apply to Art tree
+    let w = tree.start_write();
+    w.update_with_fn(&op.0, |existing| {
+        assert_eq!(existing.map(TestValue::load), shadow_existing);
+
+        match (existing, op.1) {
+            (None, None) => UpdateAction::Nothing,
+            (None, Some(new_val)) => UpdateAction::Insert(TestValue::new(new_val)),
+            (Some(_old_val), None) => UpdateAction::Remove,
+            (Some(old_val), Some(new_val)) => {
+                old_val.0.store(new_val, Ordering::Relaxed);
+                UpdateAction::Nothing
+            }
+        }
+    })
+    .expect("out of memory");
+}
+
+fn test_iter<A: ArtAllocator<TestValue>>(
+    tree: &TreeWriteAccess<TestKey, TestValue, A>,
+    shadow: &BTreeMap<TestKey, usize>,
+) {
+    let mut shadow_iter = shadow.iter();
+    let mut iter = TreeIterator::new(&(TestKey::MIN..TestKey::MAX));
+
+    loop {
+        let shadow_item = shadow_iter.next().map(|(k, v)| (*k, *v));
+        let r = tree.start_read();
+        let item = iter.next(&r);
+
+        if shadow_item != item.map(|(k, v)| (k, v.load())) {
+            eprintln!("FAIL: iterator returned {item:?}, expected {shadow_item:?}");
+            tree.start_read().dump(&mut std::io::stderr());
+
+            eprintln!("SHADOW:");
+            for si in shadow {
+                eprintln!("key: {:?}, val: {}", si.0, si.1);
+            }
+            panic!("FAIL: iterator returned {item:?}, expected {shadow_item:?}");
+        }
+        if item.is_none() {
+            break;
+        }
+    }
+}
+
+#[test]
+fn random_ops() {
+    const MEM_SIZE: usize = 10000000;
+    let mut area = Box::new_uninit_slice(MEM_SIZE);
+
+    let allocator = ArtMultiSlabAllocator::new(&mut area);
+
+    let init_struct = TreeInitStruct::<TestKey, TestValue, _>::new(allocator);
+    let tree_writer = init_struct.attach_writer();
+
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+
+    let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
+    let mut rng = rand::rng();
+    for i in 0..100000 {
+        let mut key: TestKey = (rng.sample(distribution) as u128).into();
+
+        if rng.random_bool(0.10) {
+            key = TestKey::from(u128::from(&key) | 0xffffffff);
+        }
+
+        let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
+
+        apply_op(&op, &tree_writer, &mut shadow);
+
+        if i % 1000 == 0 {
+            eprintln!("{i} ops processed");
+            eprintln!("stats: {:?}", tree_writer.get_statistics());
+            test_iter(&tree_writer, &shadow);
+        }
+    }
+}
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -19,6 +19,7 @@ byteorder.workspace = true
 utils.workspace = true
 postgres_ffi_types.workspace = true
 postgres_versioninfo.workspace = true
+posthog_client_lite.workspace = true
 enum-map.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
@@ -29,12 +30,13 @@ humantime-serde.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 itertools.workspace = true
 storage_broker.workspace = true
-camino = {workspace = true, features = ["serde1"]}
+camino = { workspace = true, features = ["serde1"] }
 remote_storage.workspace = true
 postgres_backend.workspace = true
-nix = {workspace = true, optional = true}
+nix = { workspace = true, optional = true }
 reqwest.workspace = true
 rand.workspace = true
+tracing.workspace = true
 tracing-utils.workspace = true
 once_cell.workspace = true

--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -4,6 +4,7 @@ use camino::Utf8PathBuf;
 mod tests;

 use const_format::formatcp;
+use posthog_client_lite::PostHogClientConfig;
 pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
 pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
@@ -63,25 +64,66 @@ impl Display for NodeMetadata {
    }
 }

-/// PostHog integration config.
+/// PostHog integration config. This is used in pageserver, storcon, and neon_local.
+/// Ensure backward compatibility when adding new fields.
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub struct PostHogConfig {
    /// PostHog project ID
-    pub project_id: String,
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub project_id: Option<String>,
    /// Server-side (private) API key
-    pub server_api_key: String,
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub server_api_key: Option<String>,
    /// Client-side (public) API key
-    pub client_api_key: String,
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub client_api_key: Option<String>,
    /// Private API URL
-    pub private_api_url: String,
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub private_api_url: Option<String>,
    /// Public API URL
-    pub public_api_url: String,
-    /// Refresh interval for the feature flag spec
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub public_api_url: Option<String>,
+    /// Refresh interval for the feature flag spec.
+    /// The storcon will push the feature flag spec to the pageserver. If the pageserver does not receive
+    /// the spec for `refresh_interval`, it will fetch the spec from the PostHog API.
+    #[serde(default)]
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    pub refresh_interval: Option<Duration>,
 }

+impl PostHogConfig {
+    pub fn try_into_posthog_config(self) -> Result<PostHogClientConfig, &'static str> {
+        let Some(project_id) = self.project_id else {
+            return Err("project_id is required");
+        };
+        let Some(server_api_key) = self.server_api_key else {
+            return Err("server_api_key is required");
+        };
+        let Some(client_api_key) = self.client_api_key else {
+            return Err("client_api_key is required");
+        };
+        let Some(private_api_url) = self.private_api_url else {
+            return Err("private_api_url is required");
+        };
+        let Some(public_api_url) = self.public_api_url else {
+            return Err("public_api_url is required");
+        };
+        Ok(PostHogClientConfig {
+            project_id,
+            server_api_key,
+            client_api_key,
+            private_api_url,
+            public_api_url,
+        })
+    }
+}
+
 /// `pageserver.toml`
 ///
 /// We use serde derive with `#[serde(default)]` to generate a deserializer
@@ -367,6 +409,9 @@ pub struct BasebackupCacheConfig {
    // TODO(diko): support max_entry_size_bytes.
    // pub max_entry_size_bytes: u64,
    pub max_size_entries: usize,
+    /// Size of the channel used to send prepare requests to the basebackup cache worker.
+    /// If exceeded, new prepare requests will be dropped.
+    pub prepare_channel_size: usize,
 }

 impl Default for BasebackupCacheConfig {
@@ -375,7 +420,8 @@ impl Default for BasebackupCacheConfig {
            cleanup_period: Duration::from_secs(60),
            max_total_size_bytes: 1024 * 1024 * 1024, // 1 GiB
            // max_entry_size_bytes: 16 * 1024 * 1024,   // 16 MiB
-            max_size_entries: 1000,
+            max_size_entries: 10000,
+            prepare_channel_size: 100,
        }
    }
 }
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -546,6 +546,11 @@ pub struct TimelineImportRequest {
    pub sk_set: Vec<NodeId>,
 }

+#[derive(serde::Serialize, serde::Deserialize, Clone)]
+pub struct TimelineSafekeeperMigrateRequest {
+    pub new_sk_set: Vec<NodeId>,
+}
+
 #[cfg(test)]
 mod test {
    use serde_json;
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -21,7 +21,9 @@ use utils::{completion, serde_system_time};

 use crate::config::Ratio;
 use crate::key::{CompactKey, Key};
-use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
+use crate::shard::{
+    DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardStripeSize, TenantShardId,
+};

 /// The state of a tenant in this pageserver.
 ///
@@ -475,7 +477,7 @@ pub struct TenantShardSplitResponse {
 }

 /// Parameters that apply to all shards in a tenant.  Used during tenant creation.
-#[derive(Serialize, Deserialize, Debug)]
+#[derive(Clone, Copy, Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct ShardParameters {
    pub count: ShardCount,
@@ -497,6 +499,15 @@ impl Default for ShardParameters {
    }
 }

+impl From<ShardIdentity> for ShardParameters {
+    fn from(identity: ShardIdentity) -> Self {
+        Self {
+            count: identity.count,
+            stripe_size: identity.stripe_size,
+        }
+    }
+}
+
 #[derive(Debug, Default, Clone, Eq, PartialEq)]
 pub enum FieldPatch<T> {
    Upsert(T),
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -37,6 +37,7 @@ use std::hash::{Hash, Hasher};
 pub use ::utils::shard::*;
 use postgres_ffi_types::forknum::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};
+use utils::critical;

 use crate::key::Key;
 use crate::models::ShardParameters;
@@ -179,7 +180,7 @@ impl ShardIdentity {

    /// For use when creating ShardIdentity instances for new shards, where a creation request
    /// specifies the ShardParameters that apply to all shards.
-    pub fn from_params(number: ShardNumber, params: &ShardParameters) -> Self {
+    pub fn from_params(number: ShardNumber, params: ShardParameters) -> Self {
        Self {
            number,
            count: params.count,
@@ -188,6 +189,17 @@ impl ShardIdentity {
        }
    }

+    /// Asserts that the given shard identities are equal. Changes to shard parameters will likely
+    /// result in data corruption.
+    pub fn assert_equal(&self, other: ShardIdentity) {
+        if self != &other {
+            // TODO: for now, we're conservative and just log errors in production. Turn this into a
+            // real assertion when we're confident it doesn't misfire, and also reject requests that
+            // attempt to change it with an error response.
+            critical!("shard identity mismatch: {self:?} != {other:?}");
+        }
+    }
+
    fn is_broken(&self) -> bool {
        self.layout == LAYOUT_BROKEN
    }
--- a/libs/posthog_client_lite/src/background_loop.rs
+++ b/libs/posthog_client_lite/src/background_loop.rs
@@ -1,17 +1,22 @@
 //! A background loop that fetches feature flags from PostHog and updates the feature store.

-use std::{sync::Arc, time::Duration};
+use std::{
+    sync::Arc,
+    time::{Duration, SystemTime},
+};

 use arc_swap::ArcSwap;
 use tokio_util::sync::CancellationToken;
 use tracing::{Instrument, info_span};

-use crate::{CaptureEvent, FeatureStore, PostHogClient, PostHogClientConfig};
+use crate::{
+    CaptureEvent, FeatureStore, LocalEvaluationResponse, PostHogClient, PostHogClientConfig,
+};

 /// A background loop that fetches feature flags from PostHog and updates the feature store.
 pub struct FeatureResolverBackgroundLoop {
    posthog_client: PostHogClient,
-    feature_store: ArcSwap<FeatureStore>,
+    feature_store: ArcSwap<(SystemTime, Arc<FeatureStore>)>,
    cancel: CancellationToken,
 }

@@ -19,11 +24,35 @@ impl FeatureResolverBackgroundLoop {
    pub fn new(config: PostHogClientConfig, shutdown_pageserver: CancellationToken) -> Self {
        Self {
            posthog_client: PostHogClient::new(config),
-            feature_store: ArcSwap::new(Arc::new(FeatureStore::new())),
+            feature_store: ArcSwap::new(Arc::new((
+                SystemTime::UNIX_EPOCH,
+                Arc::new(FeatureStore::new()),
+            ))),
            cancel: shutdown_pageserver,
        }
    }

+    /// Update the feature store with a new feature flag spec bypassing the normal refresh loop.
+    pub fn update(&self, spec: String) -> anyhow::Result<()> {
+        let resp: LocalEvaluationResponse = serde_json::from_str(&spec)?;
+        self.update_feature_store_nofail(resp, "http_propagate");
+        Ok(())
+    }
+
+    fn update_feature_store_nofail(&self, resp: LocalEvaluationResponse, source: &'static str) {
+        let project_id = self.posthog_client.config.project_id.parse::<u64>().ok();
+        match FeatureStore::new_with_flags(resp.flags, project_id) {
+            Ok(feature_store) => {
+                self.feature_store
+                    .store(Arc::new((SystemTime::now(), Arc::new(feature_store))));
+                tracing::info!("Feature flag updated from {}", source);
+            }
+            Err(e) => {
+                tracing::warn!("Cannot process feature flag spec from {}: {}", source, e);
+            }
+        }
+    }
+
    pub fn spawn(
        self: Arc<Self>,
        handle: &tokio::runtime::Handle,
@@ -47,6 +76,17 @@ impl FeatureResolverBackgroundLoop {
                        _ = ticker.tick() => {}
                        _ = cancel.cancelled() => break
                    }
+                    {
+                        let last_update = this.feature_store.load().0;
+                        if let Ok(elapsed) = last_update.elapsed() {
+                            if elapsed < refresh_period {
+                                tracing::debug!(
+                                    "Skipping feature flag refresh because it's too soon"
+                                );
+                                continue;
+                            }
+                        }
+                    }
                    let resp = match this
                        .posthog_client
                        .get_feature_flags_local_evaluation()
@@ -58,16 +98,7 @@ impl FeatureResolverBackgroundLoop {
                            continue;
                        }
                    };
-                    let project_id = this.posthog_client.config.project_id.parse::<u64>().ok();
-                    match FeatureStore::new_with_flags(resp.flags, project_id) {
-                        Ok(feature_store) => {
-                            this.feature_store.store(Arc::new(feature_store));
-                            tracing::info!("Feature flag updated");
-                        }
-                        Err(e) => {
-                            tracing::warn!("Cannot process feature flag spec: {}", e);
-                        }
-                    }
+                    this.update_feature_store_nofail(resp, "refresh_loop");
                }
                tracing::info!("PostHog feature resolver stopped");
            }
@@ -92,6 +123,6 @@ impl FeatureResolverBackgroundLoop {
    }

    pub fn feature_store(&self) -> Arc<FeatureStore> {
-        self.feature_store.load_full()
+        self.feature_store.load().1.clone()
    }
 }
--- a/libs/posthog_client_lite/src/lib.rs
+++ b/libs/posthog_client_lite/src/lib.rs
@@ -544,17 +544,8 @@ impl PostHogClient {
        self.config.server_api_key.starts_with("phs_")
    }

-    /// Fetch the feature flag specs from the server.
-    ///
-    /// This is unfortunately an undocumented API at:
-    /// - <https://posthog.com/docs/api/feature-flags#get-api-projects-project_id-feature_flags-local_evaluation>
-    /// - <https://posthog.com/docs/feature-flags/local-evaluation>
-    ///
-    /// The handling logic in [`FeatureStore`] mostly follows the Python API implementation.
-    /// See `_compute_flag_locally` in <https://github.com/PostHog/posthog-python/blob/master/posthog/client.py>
-    pub async fn get_feature_flags_local_evaluation(
-        &self,
-    ) -> anyhow::Result<LocalEvaluationResponse> {
+    /// Get the raw JSON spec, same as `get_feature_flags_local_evaluation` but without parsing.
+    pub async fn get_feature_flags_local_evaluation_raw(&self) -> anyhow::Result<String> {
        // BASE_URL/api/projects/:project_id/feature_flags/local_evaluation
        // with bearer token of self.server_api_key
        // OR
@@ -588,7 +579,22 @@ impl PostHogClient {
                body
            ));
        }
-        Ok(serde_json::from_str(&body)?)
+        Ok(body)
+    }
+
+    /// Fetch the feature flag specs from the server.
+    ///
+    /// This is unfortunately an undocumented API at:
+    /// - <https://posthog.com/docs/api/feature-flags#get-api-projects-project_id-feature_flags-local_evaluation>
+    /// - <https://posthog.com/docs/feature-flags/local-evaluation>
+    ///
+    /// The handling logic in [`FeatureStore`] mostly follows the Python API implementation.
+    /// See `_compute_flag_locally` in <https://github.com/PostHog/posthog-python/blob/master/posthog/client.py>
+    pub async fn get_feature_flags_local_evaluation(
+        &self,
+    ) -> Result<LocalEvaluationResponse, anyhow::Error> {
+        let raw = self.get_feature_flags_local_evaluation_raw().await?;
+        Ok(serde_json::from_str(&raw)?)
    }

    /// Capture an event. This will only be used to report the feature flag usage back to PostHog, though
--- a/libs/proxy/tokio-postgres2/src/config.rs
+++ b/libs/proxy/tokio-postgres2/src/config.rs
@@ -12,7 +12,9 @@ use tokio::net::TcpStream;

 use crate::connect::connect;
 use crate::connect_raw::{RawConnection, connect_raw};
-use crate::tls::{MakeTlsConnect, TlsConnect};
+use crate::connect_tls::connect_tls;
+use crate::maybe_tls_stream::MaybeTlsStream;
+use crate::tls::{MakeTlsConnect, TlsConnect, TlsStream};
 use crate::{Client, Connection, Error};

 /// TLS configuration.
@@ -238,7 +240,7 @@ impl Config {
        connect(tls, self).await
    }

-    pub async fn connect_raw<S, T>(
+    pub async fn tls_and_authenticate<S, T>(
        &self,
        stream: S,
        tls: T,
@@ -247,7 +249,19 @@ impl Config {
        S: AsyncRead + AsyncWrite + Unpin,
        T: TlsConnect<S>,
    {
-        connect_raw(stream, tls, self).await
+        let stream = connect_tls(stream, self.ssl_mode, tls).await?;
+        connect_raw(stream, self).await
+    }
+
+    pub async fn authenticate<S, T>(
+        &self,
+        stream: MaybeTlsStream<S, T>,
+    ) -> Result<RawConnection<S, T>, Error>
+    where
+        S: AsyncRead + AsyncWrite + Unpin,
+        T: TlsStream + Unpin,
+    {
+        connect_raw(stream, self).await
    }
 }

--- a/libs/proxy/tokio-postgres2/src/connect.rs
+++ b/libs/proxy/tokio-postgres2/src/connect.rs
@@ -9,6 +9,7 @@ use crate::codec::BackendMessage;
 use crate::config::Host;
 use crate::connect_raw::connect_raw;
 use crate::connect_socket::connect_socket;
+use crate::connect_tls::connect_tls;
 use crate::tls::{MakeTlsConnect, TlsConnect};
 use crate::{Client, Config, Connection, Error, RawConnection};

@@ -44,13 +45,14 @@ where
    T: TlsConnect<TcpStream>,
 {
    let socket = connect_socket(host_addr, host, port, config.connect_timeout).await?;
+    let stream = connect_tls(socket, config.ssl_mode, tls).await?;
    let RawConnection {
        stream,
        parameters,
        delayed_notice,
        process_id,
        secret_key,
-    } = connect_raw(socket, tls, config).await?;
+    } = connect_raw(stream, config).await?;

    let socket_config = SocketConfig {
        host_addr,
--- a/libs/proxy/tokio-postgres2/src/connect_raw.rs
+++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs
@@ -16,9 +16,8 @@ use tokio_util::codec::Framed;
 use crate::Error;
 use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec};
 use crate::config::{self, AuthKeys, Config};
-use crate::connect_tls::connect_tls;
 use crate::maybe_tls_stream::MaybeTlsStream;
-use crate::tls::{TlsConnect, TlsStream};
+use crate::tls::TlsStream;

 pub struct StartupStream<S, T> {
    inner: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
@@ -87,16 +86,13 @@ pub struct RawConnection<S, T> {
 }

 pub async fn connect_raw<S, T>(
-    stream: S,
-    tls: T,
+    stream: MaybeTlsStream<S, T>,
    config: &Config,
-) -> Result<RawConnection<S, T::Stream>, Error>
+) -> Result<RawConnection<S, T>, Error>
 where
    S: AsyncRead + AsyncWrite + Unpin,
-    T: TlsConnect<S>,
+    T: TlsStream + Unpin,
 {
-    let stream = connect_tls(stream, config.ssl_mode, tls).await?;
-
    let mut stream = StartupStream {
        inner: Framed::new(stream, PostgresCodec),
        buf: BackendMessages::empty(),
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -210,7 +210,7 @@ pub struct TimelineStatus {
 }

 /// Request to switch membership configuration.
-#[derive(Serialize, Deserialize)]
+#[derive(Clone, Serialize, Deserialize)]
 #[serde(transparent)]
 pub struct TimelineMembershipSwitchRequest {
    pub mconf: Configuration,
@@ -221,6 +221,8 @@ pub struct TimelineMembershipSwitchRequest {
 pub struct TimelineMembershipSwitchResponse {
    pub previous_conf: Configuration,
    pub current_conf: Configuration,
+    pub term: Term,
+    pub flush_lsn: Lsn,
 }

 #[derive(Clone, Copy, Serialize, Deserialize)]
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -86,6 +86,14 @@ pub enum GateError {
    GateClosed,
 }

+impl GateError {
+    pub fn is_cancel(&self) -> bool {
+        match self {
+            GateError::GateClosed => true,
+        }
+    }
+}
+
 impl Default for Gate {
    fn default() -> Self {
        Self {
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -54,6 +54,7 @@ pageserver_api.workspace = true
 pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
 pageserver_compaction.workspace = true
 pageserver_page_api.workspace = true
+peekable.workspace = true
 pem.workspace = true
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
@@ -66,6 +67,7 @@ postgres-types.workspace = true
 posthog_client_lite.workspace = true
 pprof.workspace = true
 pq_proto.workspace = true
+prost.workspace = true
 rand.workspace = true
 range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -844,4 +844,13 @@ impl Client {
            .await
            .map_err(Error::ReceiveBody)
    }
+
+    pub async fn update_feature_flag_spec(&self, spec: String) -> Result<()> {
+        let uri = format!("{}/v1/feature_flag_spec", self.mgmt_api_endpoint);
+        self.request(Method::POST, uri, spec)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
 }
--- a/pageserver/client_grpc/Cargo.toml
+++ b/pageserver/client_grpc/Cargo.toml
@@ -0,0 +1,32 @@
+[package]
+name = "pageserver_client_grpc"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+anyhow.workspace = true
+bytes.workspace = true
+futures.workspace = true
+http.workspace = true
+thiserror.workspace = true
+tonic.workspace = true
+tracing.workspace = true
+tokio = { version = "1.43.1", features = ["full", "macros", "net", "io-util", "rt", "rt-multi-thread"] }
+uuid = { version = "1", features = ["v4"] }
+tower = {  version = "0.4", features = ["timeout", "util"] }
+rand = "0.8"
+tokio-util = { version = "0.7", features = ["compat"] }
+hyper-util = "0.1.9"
+hyper = "1.6.0"
+metrics.workspace = true
+priority-queue = "2.3.1"
+scopeguard.workspace = true
+async-trait = { version = "0.1" }
+tokio-stream = "0.1"
+dashmap = "5"
+chrono = { version = "0.4", features = ["serde"] }
+
+
+pageserver_page_api.workspace = true
+pageserver_api.workspace = true
+utils.workspace = true
--- a/pageserver/client_grpc/examples/load_test.rs
+++ b/pageserver/client_grpc/examples/load_test.rs
@@ -0,0 +1,273 @@
+// examples/load_test.rs, generated by AI
+
+use std::collections::{HashMap, HashSet};
+use std::sync::{
+    Arc, Mutex,
+    atomic::{AtomicU64, AtomicUsize, Ordering},
+};
+use std::time::{Duration, Instant};
+
+use rand::Rng;
+use tokio::task;
+use tokio::time::sleep;
+use tonic::Status;
+
+// Pull in your ConnectionPool and PooledItemFactory from the pageserver_client_grpc crate.
+// Adjust these paths if necessary.
+use pageserver_client_grpc::client_cache::ConnectionPool;
+use pageserver_client_grpc::client_cache::PooledItemFactory;
+
+// --------------------------------------
+// GLOBAL COUNTERS FOR “CREATED” / “DROPPED” MockConnections
+// --------------------------------------
+static CREATED: AtomicU64 = AtomicU64::new(0);
+static DROPPED: AtomicU64 = AtomicU64::new(0);
+
+// --------------------------------------
+// MockConnection + Factory
+// --------------------------------------
+
+#[derive(Debug)]
+pub struct MockConnection {
+    pub id: u64,
+}
+
+impl Clone for MockConnection {
+    fn clone(&self) -> Self {
+        // Cloning a MockConnection does NOT count as “creating” a brand‐new connection,
+        // so we do NOT bump CREATED here. We only bump CREATED in the factory’s `create()`.
+        CREATED.fetch_add(1, Ordering::Relaxed);
+        MockConnection { id: self.id }
+    }
+}
+
+impl Drop for MockConnection {
+    fn drop(&mut self) {
+        // When a MockConnection actually gets dropped, bump the counter.
+        DROPPED.fetch_add(1, Ordering::SeqCst);
+    }
+}
+
+#[derive(Default)]
+pub struct MockConnectionFactory {
+    counter: AtomicU64,
+}
+
+#[async_trait::async_trait]
+impl PooledItemFactory<MockConnection> for MockConnectionFactory {
+    /// The trait on ConnectionPool expects:
+    ///   async fn create(&self, timeout: Duration)
+    ///       -> Result<Result<MockConnection, Status>, tokio::time::error::Elapsed>;
+    ///
+    /// On success: Ok(Ok(MockConnection))
+    /// On a simulated “gRPC” failure: Ok(Err(Status::…))
+    /// On a transport/factory error: Err(Box<…>)
+    async fn create(
+        &self,
+        _timeout: Duration,
+    ) -> Result<Result<MockConnection, Status>, tokio::time::error::Elapsed> {
+        // Simulate connection creation immediately succeeding.
+        CREATED.fetch_add(1, Ordering::SeqCst);
+        let next_id = self.counter.fetch_add(1, Ordering::Relaxed);
+        Ok(Ok(MockConnection { id: next_id }))
+    }
+}
+
+// --------------------------------------
+// CLIENT WORKER
+// --------------------------------------
+//
+// Each worker repeatedly calls `pool.get_client().await`. When it succeeds, we:
+//  1. Lock the shared Mutex<HashMap<u64, Arc<AtomicUsize>>> to fetch/insert an Arc<AtomicUsize> for this conn_id.
+//  2. Lock the shared Mutex<HashSet<u64>> to record this conn_id as “seen.”
+//  3. Drop both locks, then atomically increment that counter and assert it ≤ max_consumers.
+//  4. Sleep 10–100 ms to simulate “work.”
+//  5. Atomically decrement the counter.
+//  6. Call `pooled.finish(Ok(()))` to return to the pool.
+
+async fn client_worker(
+    pool: Arc<ConnectionPool<MockConnection>>,
+    usage_map: Arc<Mutex<HashMap<u64, Arc<AtomicUsize>>>>,
+    seen_set: Arc<Mutex<HashSet<u64>>>,
+    max_consumers: usize,
+    worker_id: usize,
+) {
+    for iteration in 0..10 {
+        match pool.clone().get_client().await {
+            Ok(pooled) => {
+                let conn: MockConnection = pooled.channel();
+                let conn_id = conn.id;
+
+                // 1. Fetch or insert the Arc<AtomicUsize> for this conn_id:
+                let counter_arc: Arc<AtomicUsize> = {
+                    let mut guard = usage_map.lock().unwrap();
+                    guard
+                        .entry(conn_id)
+                        .or_insert_with(|| Arc::new(AtomicUsize::new(0)))
+                        .clone()
+                    // MutexGuard is dropped here
+                };
+
+                // 2. Record this conn_id in the shared HashSet of “seen” IDs:
+                {
+                    let mut seen_guard = seen_set.lock().unwrap();
+                    seen_guard.insert(conn_id);
+                    // MutexGuard is dropped immediately
+                }
+
+                // 3. Atomically bump the count for this connection ID
+                let prev = counter_arc.fetch_add(1, Ordering::SeqCst);
+                let current = prev + 1;
+                assert!(
+                    current <= max_consumers,
+                    "Connection {conn_id} exceeded max_consumers (got {current})",
+                );
+
+                println!(
+                    "[worker {worker_id}][iter {iteration}] got MockConnection id={conn_id} ({current} concurrent)",
+                );
+
+                // 4. Simulate some work (10–100 ms)
+                let delay_ms = rand::thread_rng().gen_range(10..100);
+                sleep(Duration::from_millis(delay_ms)).await;
+
+                // 5. Decrement the usage counter
+                let prev2 = counter_arc.fetch_sub(1, Ordering::SeqCst);
+                let after = prev2 - 1;
+                println!(
+                    "[worker {worker_id}][iter {iteration}] returning MockConnection id={conn_id} (now {after} remain)",
+                );
+
+                // 6. Return to the pool (mark success)
+                pooled.finish(Ok(())).await;
+            }
+            Err(status) => {
+                eprintln!(
+                    "[worker {worker_id}][iter {iteration}] failed to get client: {status:?}",
+                );
+            }
+        }
+
+        // Small random pause before next iteration to spread out load
+        let pause = rand::thread_rng().gen_range(0..20);
+        sleep(Duration::from_millis(pause)).await;
+    }
+}
+
+#[tokio::main(flavor = "multi_thread", worker_threads = 8)]
+async fn main() {
+    // --------------------------------------
+    // 1. Create factory and shared instrumentation
+    // --------------------------------------
+    let factory = Arc::new(MockConnectionFactory::default());
+
+    // Shared map: connection ID → Arc<AtomicUsize>
+    let usage_map: Arc<Mutex<HashMap<u64, Arc<AtomicUsize>>>> =
+        Arc::new(Mutex::new(HashMap::new()));
+
+    // Shared set: record each unique connection ID we actually saw
+    let seen_set: Arc<Mutex<HashSet<u64>>> = Arc::new(Mutex::new(HashSet::new()));
+
+    // --------------------------------------
+    // 2. Pool parameters
+    // --------------------------------------
+    let connect_timeout = Duration::from_millis(500);
+    let connect_backoff = Duration::from_millis(100);
+    let max_consumers = 100; // test limit
+    let error_threshold = 2; // mock never fails
+    let max_idle_duration = Duration::from_secs(2);
+    let max_total_connections = 3;
+    let aggregate_metrics = None;
+
+    let pool: Arc<ConnectionPool<MockConnection>> = ConnectionPool::new(
+        factory,
+        connect_timeout,
+        connect_backoff,
+        max_consumers,
+        error_threshold,
+        max_idle_duration,
+        max_total_connections,
+        aggregate_metrics,
+    );
+
+    // --------------------------------------
+    // 3. Spawn worker tasks
+    // --------------------------------------
+    let num_workers = 10000;
+    let mut handles = Vec::with_capacity(num_workers);
+    let start_time = Instant::now();
+
+    for worker_id in 0..num_workers {
+        let pool_clone = Arc::clone(&pool);
+        let usage_clone = Arc::clone(&usage_map);
+        let seen_clone = Arc::clone(&seen_set);
+        let mc = max_consumers;
+
+        let handle = task::spawn(async move {
+            client_worker(pool_clone, usage_clone, seen_clone, mc, worker_id).await;
+        });
+        handles.push(handle);
+    }
+
+    // --------------------------------------
+    // 4. Wait for workers to finish
+    // --------------------------------------
+    for handle in handles {
+        let _ = handle.await;
+    }
+    let elapsed = Instant::now().duration_since(start_time);
+    println!("All {num_workers} workers completed in {elapsed:?}");
+
+    // --------------------------------------
+    // 5. Print the total number of unique connections seen so far
+    // --------------------------------------
+    let unique_count = {
+        let seen_guard = seen_set.lock().unwrap();
+        seen_guard.len()
+    };
+    println!("Total unique connections used by workers: {unique_count}");
+
+    // --------------------------------------
+    // 6. Sleep so the background sweeper can run (max_idle_duration = 2 s)
+    // --------------------------------------
+    sleep(Duration::from_secs(3)).await;
+
+    // --------------------------------------
+    // 7. Shutdown the pool
+    // --------------------------------------
+    let shutdown_pool = Arc::clone(&pool);
+    shutdown_pool.shutdown().await;
+    println!("Pool.shutdown() returned.");
+
+    // --------------------------------------
+    // 8. Verify that no background task still holds an Arc clone of `pool`.
+    //    If any task is still alive (sweeper/create_connection), strong_count > 1.
+    // --------------------------------------
+    sleep(Duration::from_secs(1)).await; // give tasks time to exit
+    let sc = Arc::strong_count(&pool);
+    assert!(
+        sc == 1,
+        "Pool tasks did not all terminate: Arc::strong_count = {sc} (expected 1)",
+    );
+    println!("Verified: all pool tasks have terminated (strong_count == 1).");
+
+    // --------------------------------------
+    // 9. Verify no MockConnection was leaked:
+    //    CREATED must equal DROPPED.
+    // --------------------------------------
+    let created = CREATED.load(Ordering::SeqCst);
+    let dropped = DROPPED.load(Ordering::SeqCst);
+    assert!(
+        created == dropped,
+        "Leaked connections: created={created} but dropped={dropped}",
+    );
+    println!("Verified: no connections leaked (created = {created}, dropped = {dropped}).");
+
+    // --------------------------------------
+    // 10. Because `client_worker` asserted inside that no connection
+    //     ever exceeded `max_consumers`, reaching this point means that check passed.
+    // --------------------------------------
+    println!("All per-connection usage stayed within max_consumers = {max_consumers}.");
+
+    println!("Load test complete; exiting cleanly.");
+}
--- a/pageserver/client_grpc/src/client_cache.rs
+++ b/pageserver/client_grpc/src/client_cache.rs
@@ -0,0 +1,705 @@
+use std::{
+    collections::HashMap,
+    io::{self, Error, ErrorKind},
+    sync::Arc,
+    time::{Duration, Instant},
+};
+
+use priority_queue::PriorityQueue;
+
+use tokio::{
+    io::{AsyncRead, AsyncWrite, ReadBuf},
+    net::TcpStream,
+    sync::{Mutex, OwnedSemaphorePermit, Semaphore},
+    time::sleep,
+};
+use tonic::transport::{Channel, Endpoint};
+
+use uuid;
+
+use std::{
+    pin::Pin,
+    task::{Context, Poll},
+};
+
+use futures::future;
+use rand::{Rng, SeedableRng, rngs::StdRng};
+
+use bytes::BytesMut;
+use http::Uri;
+use hyper_util::rt::TokioIo;
+use tower::service_fn;
+
+use async_trait::async_trait;
+use tokio_util::sync::CancellationToken;
+
+//
+// The "TokioTcp" is flakey TCP network for testing purposes, in order
+// to simulate network errors and delays.
+//
+
+/// Wraps a `TcpStream`, buffers incoming data, and injects a random delay per fresh read/write.
+pub struct TokioTcp {
+    tcp: TcpStream,
+    /// Maximum randomized delay in milliseconds
+    delay_ms: u64,
+
+    /// Next deadline instant for delay
+    deadline: Instant,
+    /// Internal buffer of previously-read data
+    buffer: BytesMut,
+}
+
+impl TokioTcp {
+    /// Create a new wrapper with given max delay (ms)
+    pub fn new(stream: TcpStream, delay_ms: u64) -> Self {
+        let initial = if delay_ms > 0 {
+            rand::thread_rng().gen_range(0..delay_ms)
+        } else {
+            0
+        };
+        let deadline = Instant::now() + Duration::from_millis(initial);
+        TokioTcp {
+            tcp: stream,
+            delay_ms,
+            deadline,
+            buffer: BytesMut::new(),
+        }
+    }
+}
+
+impl AsyncRead for TokioTcp {
+    fn poll_read(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &mut ReadBuf<'_>,
+    ) -> Poll<io::Result<()>> {
+        // Safe because TokioTcp is Unpin
+        let this = self.get_mut();
+
+        // 1) Drain any buffered data
+        if !this.buffer.is_empty() {
+            let to_copy = this.buffer.len().min(buf.remaining());
+            buf.put_slice(&this.buffer.split_to(to_copy));
+            return Poll::Ready(Ok(()));
+        }
+
+        // 2) If we're still before the deadline, schedule a wake and return Pending
+        let now = Instant::now();
+        if this.delay_ms > 0 && now < this.deadline {
+            let waker = cx.waker().clone();
+            let wait = this.deadline - now;
+            tokio::spawn(async move {
+                sleep(wait).await;
+                waker.wake_by_ref();
+            });
+            return Poll::Pending;
+        }
+
+        // 3) Past deadline: compute next random deadline
+        if this.delay_ms > 0 {
+            let next_ms = rand::thread_rng().gen_range(0..=this.delay_ms);
+            this.deadline = Instant::now() + Duration::from_millis(next_ms);
+        }
+
+        // 4) Perform actual read into a temporary buffer
+        let mut tmp = [0u8; 4096];
+        let mut rb = ReadBuf::new(&mut tmp);
+        match Pin::new(&mut this.tcp).poll_read(cx, &mut rb) {
+            Poll::Pending => Poll::Pending,
+            Poll::Ready(Ok(())) => {
+                let filled = rb.filled();
+                if filled.is_empty() {
+                    // EOF or zero bytes
+                    Poll::Ready(Ok(()))
+                } else {
+                    this.buffer.extend_from_slice(filled);
+                    let to_copy = this.buffer.len().min(buf.remaining());
+                    buf.put_slice(&this.buffer.split_to(to_copy));
+                    Poll::Ready(Ok(()))
+                }
+            }
+            Poll::Ready(Err(e)) => Poll::Ready(Err(e)),
+        }
+    }
+}
+
+impl AsyncWrite for TokioTcp {
+    fn poll_write(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        data: &[u8],
+    ) -> Poll<io::Result<usize>> {
+        let this = self.get_mut();
+
+        // 1) If before deadline, schedule wake and return Pending
+        let now = Instant::now();
+        if this.delay_ms > 0 && now < this.deadline {
+            let waker = cx.waker().clone();
+            let wait = this.deadline - now;
+            tokio::spawn(async move {
+                sleep(wait).await;
+                waker.wake_by_ref();
+            });
+            return Poll::Pending;
+        }
+
+        // 2) Past deadline: compute next random deadline
+        if this.delay_ms > 0 {
+            let next_ms = rand::thread_rng().gen_range(0..=this.delay_ms);
+            this.deadline = Instant::now() + Duration::from_millis(next_ms);
+        }
+
+        // 3) Actual write
+        Pin::new(&mut this.tcp).poll_write(cx, data)
+    }
+
+    fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
+        let this = self.get_mut();
+        Pin::new(&mut this.tcp).poll_flush(cx)
+    }
+
+    fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
+        let this = self.get_mut();
+        Pin::new(&mut this.tcp).poll_shutdown(cx)
+    }
+}
+
+#[async_trait]
+pub trait PooledItemFactory<T>: Send + Sync + 'static {
+    /// Create a new pooled item.
+    async fn create(
+        &self,
+        connect_timeout: Duration,
+    ) -> Result<Result<T, tonic::Status>, tokio::time::error::Elapsed>;
+}
+
+pub struct ChannelFactory {
+    endpoint: String,
+    max_delay_ms: u64,
+    drop_rate: f64,
+    hang_rate: f64,
+}
+
+impl ChannelFactory {
+    pub fn new(endpoint: String, max_delay_ms: u64, drop_rate: f64, hang_rate: f64) -> Self {
+        ChannelFactory {
+            endpoint,
+            max_delay_ms,
+            drop_rate,
+            hang_rate,
+        }
+    }
+}
+
+#[async_trait]
+impl PooledItemFactory<Channel> for ChannelFactory {
+    async fn create(
+        &self,
+        connect_timeout: Duration,
+    ) -> Result<Result<Channel, tonic::Status>, tokio::time::error::Elapsed> {
+        let max_delay_ms = self.max_delay_ms;
+        let drop_rate = self.drop_rate;
+        let hang_rate = self.hang_rate;
+
+        // This is a custom connector that inserts delays and errors, for
+        // testing purposes. It would normally be disabled by the config.
+        let connector = service_fn(move |uri: Uri| {
+            let drop_rate = drop_rate;
+            let hang_rate = hang_rate;
+            async move {
+                let mut rng = StdRng::from_entropy();
+                // Simulate an indefinite hang
+                if hang_rate > 0.0 && rng.gen_bool(hang_rate) {
+                    // never completes, to test timeout
+                    return future::pending::<Result<TokioIo<TokioTcp>, std::io::Error>>().await;
+                }
+
+                // Random drop (connect error)
+                if drop_rate > 0.0 && rng.gen_bool(drop_rate) {
+                    return Err(std::io::Error::other("simulated connect drop"));
+                }
+
+                // Otherwise perform real TCP connect
+                let addr = match (uri.host(), uri.port()) {
+                    // host + explicit port
+                    (Some(host), Some(port)) => format!("{}:{}", host, port.as_str()),
+                    // host only (no port)
+                    (Some(host), None) => host.to_string(),
+                    // neither? error out
+                    _ => return Err(Error::new(ErrorKind::InvalidInput, "no host or port")),
+                };
+
+                let tcp = TcpStream::connect(addr).await?;
+                let tcpwrapper = TokioTcp::new(tcp, max_delay_ms);
+                Ok(TokioIo::new(tcpwrapper))
+            }
+        });
+
+        let attempt = tokio::time::timeout(
+            connect_timeout,
+            Endpoint::from_shared(self.endpoint.clone())
+                .expect("invalid endpoint")
+                .timeout(connect_timeout)
+                .connect_with_connector(connector),
+        )
+        .await;
+        match attempt {
+            Ok(Ok(channel)) => {
+                // Connection succeeded
+                Ok(Ok(channel))
+            }
+            Ok(Err(e)) => Ok(Err(tonic::Status::new(
+                tonic::Code::Unavailable,
+                format!("Failed to connect: {e}"),
+            ))),
+            Err(e) => Err(e),
+        }
+    }
+}
+
+/// A pooled gRPC client with capacity tracking and error handling.
+pub struct ConnectionPool<T> {
+    inner: Mutex<Inner<T>>,
+
+    fact: Arc<dyn PooledItemFactory<T> + Send + Sync>,
+
+    connect_timeout: Duration,
+    connect_backoff: Duration,
+    /// The maximum number of consumers that can use a single connection.
+    max_consumers: usize,
+    /// The number of consecutive errors before a connection is removed from the pool.
+    error_threshold: usize,
+    /// The maximum duration a connection can be idle before being removed.
+    max_idle_duration: Duration,
+    max_total_connections: usize,
+
+    channel_semaphore: Arc<Semaphore>,
+
+    shutdown_token: CancellationToken,
+    aggregate_metrics: Option<Arc<crate::PageserverClientAggregateMetrics>>,
+}
+
+struct Inner<T> {
+    entries: HashMap<uuid::Uuid, ConnectionEntry<T>>,
+    pq: PriorityQueue<uuid::Uuid, usize>,
+    // This is updated when a connection is dropped, or we fail
+    // to create a new connection.
+    last_connect_failure: Option<Instant>,
+    waiters: usize,
+    in_progress: usize,
+}
+struct ConnectionEntry<T> {
+    channel: T,
+    active_consumers: usize,
+    consecutive_errors: usize,
+    last_used: Instant,
+}
+
+/// A client borrowed from the pool.
+pub struct PooledClient<T> {
+    pub channel: T,
+    pool: Arc<ConnectionPool<T>>,
+    is_ok: bool,
+    id: uuid::Uuid,
+    permit: OwnedSemaphorePermit,
+}
+
+impl<T: Clone + Send + 'static> ConnectionPool<T> {
+    #[allow(clippy::too_many_arguments)]
+    pub fn new(
+        fact: Arc<dyn PooledItemFactory<T> + Send + Sync>,
+        connect_timeout: Duration,
+        connect_backoff: Duration,
+        max_consumers: usize,
+        error_threshold: usize,
+        max_idle_duration: Duration,
+        max_total_connections: usize,
+        aggregate_metrics: Option<Arc<crate::PageserverClientAggregateMetrics>>,
+    ) -> Arc<Self> {
+        let shutdown_token = CancellationToken::new();
+        let pool = Arc::new(Self {
+            inner: Mutex::new(Inner::<T> {
+                entries: HashMap::new(),
+                pq: PriorityQueue::new(),
+                last_connect_failure: None,
+                waiters: 0,
+                in_progress: 0,
+            }),
+            fact: Arc::clone(&fact),
+            connect_timeout,
+            connect_backoff,
+            max_consumers,
+            error_threshold,
+            max_idle_duration,
+            max_total_connections,
+            channel_semaphore: Arc::new(Semaphore::new(0)),
+            shutdown_token: shutdown_token.clone(),
+            aggregate_metrics: aggregate_metrics.clone(),
+        });
+
+        // Cancelable background task to sweep idle connections
+        let sweeper_token = shutdown_token.clone();
+        let sweeper_pool = Arc::clone(&pool);
+        tokio::spawn(async move {
+            loop {
+                tokio::select! {
+                    _ = sweeper_token.cancelled() => break,
+                    _ = async {
+                        sweeper_pool.sweep_idle_connections().await;
+                        sleep(Duration::from_secs(5)).await;
+                    } => {}
+                }
+            }
+        });
+
+        pool
+    }
+
+    pub async fn shutdown(self: Arc<Self>) {
+        self.shutdown_token.cancel();
+
+        loop {
+            let all_idle = {
+                let inner = self.inner.lock().await;
+                inner.entries.values().all(|e| e.active_consumers == 0)
+            };
+            if all_idle {
+                break;
+            }
+            sleep(Duration::from_millis(100)).await;
+        }
+
+        // 4. Remove all entries
+        let mut inner = self.inner.lock().await;
+        inner.entries.clear();
+    }
+
+    /// Sweep and remove idle connections safely, burning their permits.
+    async fn sweep_idle_connections(self: &Arc<Self>) {
+        let mut ids_to_remove = Vec::new();
+        let now = Instant::now();
+
+        // Remove idle entries. First collect permits for those connections so that
+        // no consumer will reserve them, then remove them from the pool.
+        {
+            let mut inner = self.inner.lock().await;
+            inner.entries.retain(|id, entry| {
+                if entry.active_consumers == 0
+                    && now.duration_since(entry.last_used) > self.max_idle_duration
+                {
+                    // metric
+                    if let Some(ref metrics) = self.aggregate_metrics {
+                        metrics
+                            .retry_counters
+                            .with_label_values(&["connection_swept"])
+                            .inc();
+                    }
+                    ids_to_remove.push(*id);
+                    return false; // remove this entry
+                }
+                true
+            });
+            // Remove the entries from the priority queue
+            for id in ids_to_remove {
+                inner.pq.remove(&id);
+            }
+        }
+    }
+
+    // If we have a permit already, get a connection out of the heap
+    async fn get_conn_with_permit(
+        self: Arc<Self>,
+        permit: OwnedSemaphorePermit,
+    ) -> Option<PooledClient<T>> {
+        let mut inner = self.inner.lock().await;
+
+        // Pop the highest-active-consumers connection. There are no connections
+        // in the heap that have more than max_consumers active consumers.
+        if let Some((id, _cons)) = inner.pq.pop() {
+            let entry = inner
+                .entries
+                .get_mut(&id)
+                .expect("pq and entries got out of sync");
+
+            let mut active_consumers = entry.active_consumers;
+            entry.active_consumers += 1;
+            entry.last_used = Instant::now();
+
+            let client = PooledClient::<T> {
+                channel: entry.channel.clone(),
+                pool: Arc::clone(&self),
+                is_ok: true,
+                id,
+                permit,
+            };
+
+            // re‐insert with updated priority
+            active_consumers += 1;
+            if active_consumers < self.max_consumers {
+                inner.pq.push(id, active_consumers as usize);
+            }
+            Some(client)
+        } else {
+            // If there is no connection to take, it is because permits for a connection
+            // need to drain. This can happen if a connection is removed because it has
+            // too many errors. It is taken out of the heap/hash table in this case, but
+            // we can't remove it's permits until now.
+            //
+            // Just forget the permit and retry.
+            permit.forget();
+            None
+        }
+    }
+
+    pub async fn get_client(self: Arc<Self>) -> Result<PooledClient<T>, tonic::Status> {
+        // The pool is shutting down. Don't accept new connections.
+        if self.shutdown_token.is_cancelled() {
+            return Err(tonic::Status::unavailable("Pool is shutting down"));
+        }
+
+        // A loop is necessary because when a connection is draining, we have to return
+        // a permit and retry.
+        loop {
+            let self_clone = Arc::clone(&self);
+            let mut semaphore = Arc::clone(&self_clone.channel_semaphore);
+
+            match semaphore.try_acquire_owned() {
+                Ok(permit_) => {
+                    // We got a permit, so check the heap for a connection
+                    // we can use.
+                    let pool_conn = self_clone.get_conn_with_permit(permit_).await;
+                    match pool_conn {
+                        Some(pool_conn_) => {
+                            return Ok(pool_conn_);
+                        }
+                        None => {
+                            // No connection available. Forget the permit and retry.
+                            continue;
+                        }
+                    }
+                }
+                Err(_) => {
+                    if let Some(ref metrics) = self_clone.aggregate_metrics {
+                        metrics
+                            .retry_counters
+                            .with_label_values(&["sema_acquire_success"])
+                            .inc();
+                    }
+
+                    {
+                        //
+                        // This is going to generate enough connections to handle a burst,
+                        // but it may generate up to twice the number of connections needed
+                        // in the worst case. Extra connections will go idle and be cleaned
+                        // up.
+                        //
+                        let mut inner = self_clone.inner.lock().await;
+                        inner.waiters += 1;
+                        if inner.waiters > (inner.in_progress * self_clone.max_consumers)
+                            && (inner.entries.len() + inner.in_progress)
+                                < self_clone.max_total_connections
+                        {
+                            let self_clone_spawn = Arc::clone(&self_clone);
+                            tokio::task::spawn(async move {
+                                self_clone_spawn.create_connection().await;
+                            });
+                            inner.in_progress += 1;
+                        }
+                    }
+                    // Wait for a connection to become available, either because it
+                    // was created or because a connection was returned to the pool
+                    // by another consumer.
+                    semaphore = Arc::clone(&self_clone.channel_semaphore);
+                    let conn_permit = semaphore.acquire_owned().await.unwrap();
+                    {
+                        let mut inner = self_clone.inner.lock().await;
+                        inner.waiters -= 1;
+                    }
+                    // We got a permit, check the heap for a connection.
+                    let pool_conn = self_clone.get_conn_with_permit(conn_permit).await;
+                    match pool_conn {
+                        Some(pool_conn_) => {
+                            return Ok(pool_conn_);
+                        }
+                        None => {
+                            // No connection was found, forget the permit and retry.
+                            continue;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    async fn create_connection(&self) {
+        // Generate a random backoff to add some jitter so that connections
+        // don't all retry at the same time.
+        let mut backoff_delay = Duration::from_millis(
+            rand::thread_rng().gen_range(0..=self.connect_backoff.as_millis() as u64),
+        );
+
+        loop {
+            if self.shutdown_token.is_cancelled() {
+                return;
+            }
+
+            // Back off.
+            // Loop because failure can occur while we are sleeping, so wait
+            // until the failure stopped for at least one backoff period. Backoff
+            // period includes some jitter, so that if multiple connections are
+            // failing, they don't all retry at the same time.
+            while let Some(delay) = {
+                let inner = self.inner.lock().await;
+                inner.last_connect_failure.and_then(|at| {
+                    (at.elapsed() < backoff_delay).then(|| backoff_delay - at.elapsed())
+                })
+            } {
+                sleep(delay).await;
+            }
+
+            //
+            // Create a new connection.
+            //
+            // The connect timeout is also the timeout for an individual gRPC request
+            // on this connection. (Requests made later on this channel will time out
+            // with the same timeout.)
+            //
+            if let Some(ref metrics) = self.aggregate_metrics {
+                metrics
+                    .retry_counters
+                    .with_label_values(&["connection_attempt"])
+                    .inc();
+            }
+
+            let attempt = self.fact.create(self.connect_timeout).await;
+
+            match attempt {
+                // Connection succeeded
+                Ok(Ok(channel)) => {
+                    {
+                        if let Some(ref metrics) = self.aggregate_metrics {
+                            metrics
+                                .retry_counters
+                                .with_label_values(&["connection_success"])
+                                .inc();
+                        }
+                        let mut inner = self.inner.lock().await;
+                        let id = uuid::Uuid::new_v4();
+                        inner.entries.insert(
+                            id,
+                            ConnectionEntry::<T> {
+                                channel: channel.clone(),
+                                active_consumers: 0,
+                                consecutive_errors: 0,
+                                last_used: Instant::now(),
+                            },
+                        );
+                        inner.pq.push(id, 0);
+                        inner.in_progress -= 1;
+                        self.channel_semaphore.add_permits(self.max_consumers);
+                        return;
+                    };
+                }
+                // Connection failed, back off and retry
+                Ok(Err(_)) | Err(_) => {
+                    if let Some(ref metrics) = self.aggregate_metrics {
+                        metrics
+                            .retry_counters
+                            .with_label_values(&["connect_failed"])
+                            .inc();
+                    }
+                    let mut inner = self.inner.lock().await;
+                    inner.last_connect_failure = Some(Instant::now());
+                    // Add some jitter so that every connection doesn't retry at once
+                    let jitter = rand::thread_rng().gen_range(0..=backoff_delay.as_millis() as u64);
+                    backoff_delay =
+                        Duration::from_millis(backoff_delay.as_millis() as u64 + jitter);
+
+                    // Do not backoff longer than one minute
+                    if backoff_delay > Duration::from_secs(60) {
+                        backoff_delay = Duration::from_secs(60);
+                    }
+                    // continue the loop to retry
+                }
+            }
+        }
+    }
+
+    /// Return client to the pool, indicating success or error.
+    pub async fn return_client(&self, id: uuid::Uuid, success: bool, permit: OwnedSemaphorePermit) {
+        let mut inner = self.inner.lock().await;
+        if let Some(entry) = inner.entries.get_mut(&id) {
+            entry.last_used = Instant::now();
+            if entry.active_consumers == 0 {
+                panic!("A consumer completed when active_consumers was zero!")
+            }
+            entry.active_consumers -= 1;
+            if success {
+                if entry.consecutive_errors < self.error_threshold {
+                    entry.consecutive_errors = 0;
+                }
+            } else {
+                entry.consecutive_errors += 1;
+                if entry.consecutive_errors == self.error_threshold {
+                    if let Some(ref metrics) = self.aggregate_metrics {
+                        metrics
+                            .retry_counters
+                            .with_label_values(&["connection_dropped"])
+                            .inc();
+                    }
+                }
+            }
+
+            //
+            // Too many errors on this connection. If there are no active users,
+            // remove it. Otherwise just wait for active_consumers to go to zero.
+            // This connection will not be selected for new consumers.
+            //
+            let active_consumers = entry.active_consumers;
+            if entry.consecutive_errors >= self.error_threshold {
+                // too many errors, remove the connection permanently. Once it drains,
+                // it will be dropped.
+                if inner.pq.get_priority(&id).is_some() {
+                    inner.pq.remove(&id);
+                }
+
+                // remove from entries
+                // check if entry is in inner
+                if inner.entries.contains_key(&id) {
+                    inner.entries.remove(&id);
+                }
+                inner.last_connect_failure = Some(Instant::now());
+
+                // The connection has been removed, it's permits will be
+                // drained because if we look for a connection and it's not there
+                // we just forget the permit. However, this process can be a little
+                // bit faster if we just forget permits as the connections are returned.
+                permit.forget();
+            } else {
+                // update its priority in the queue
+                if inner.pq.get_priority(&id).is_some() {
+                    inner.pq.change_priority(&id, active_consumers);
+                } else {
+                    // This connection is not in the heap, but it has space
+                    // for more consumers. Put it back in the heap.
+                    if active_consumers < self.max_consumers {
+                        inner.pq.push(id, active_consumers);
+                    }
+                }
+            }
+        }
+    }
+}
+
+impl<T: Clone + Send + 'static> PooledClient<T> {
+    pub fn channel(&self) -> T {
+        self.channel.clone()
+    }
+    pub async fn finish(mut self, result: Result<(), tonic::Status>) {
+        self.is_ok = result.is_ok();
+        self.pool
+            .return_client(self.id, self.is_ok, self.permit)
+            .await;
+    }
+}
--- a/pageserver/client_grpc/src/lib.rs
+++ b/pageserver/client_grpc/src/lib.rs
@@ -0,0 +1,450 @@
+//! Pageserver Data API client
+//!
+//! - Manage connections to pageserver
+//! - Send requests to correct shards
+//!
+use std::collections::HashMap;
+use std::fmt::Debug;
+use std::sync::Arc;
+use std::sync::RwLock;
+use std::time::Duration;
+
+use bytes::Bytes;
+use futures::{Stream, StreamExt};
+use thiserror::Error;
+use tonic::metadata::AsciiMetadataValue;
+use tonic::transport::Channel;
+
+use pageserver_page_api::proto;
+use pageserver_page_api::proto::PageServiceClient;
+use pageserver_page_api::*;
+use utils::shard::ShardIndex;
+
+pub mod client_cache;
+pub mod pool;
+pub mod request_tracker;
+
+use metrics::{IntCounterVec, core::Collector};
+
+#[derive(Error, Debug)]
+pub enum PageserverClientError {
+    #[error("could not connect to service: {0}")]
+    ConnectError(#[from] tonic::transport::Error),
+    #[error("could not perform request: {0}`")]
+    RequestError(#[from] tonic::Status),
+    #[error("protocol error: {0}")]
+    ProtocolError(#[from] ProtocolError),
+
+    #[error("could not perform request: {0}`")]
+    InvalidUri(#[from] http::uri::InvalidUri),
+
+    #[error("could not perform request: {0}`")]
+    Other(String),
+}
+
+#[derive(Clone, Debug)]
+pub struct PageserverClientAggregateMetrics {
+    pub request_counters: IntCounterVec,
+    pub retry_counters: IntCounterVec,
+}
+
+impl Default for PageserverClientAggregateMetrics {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl PageserverClientAggregateMetrics {
+    pub fn new() -> Self {
+        let request_counters = IntCounterVec::new(
+            metrics::core::Opts::new(
+                "backend_requests_total",
+                "Number of requests from backends.",
+            ),
+            &["request_kind"],
+        )
+        .unwrap();
+
+        let retry_counters = IntCounterVec::new(
+            metrics::core::Opts::new(
+                "backend_requests_retries_total",
+                "Number of retried requests from backends.",
+            ),
+            &["request_kind"],
+        )
+        .unwrap();
+        Self {
+            request_counters,
+            retry_counters,
+        }
+    }
+
+    pub fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
+        let mut metrics = Vec::new();
+        metrics.append(&mut self.request_counters.collect());
+        metrics.append(&mut self.retry_counters.collect());
+        metrics
+    }
+}
+
+pub struct PageserverClient {
+    _tenant_id: String,
+    _timeline_id: String,
+
+    _auth_token: Option<String>,
+
+    shard_map: HashMap<ShardIndex, String>,
+
+    channels: RwLock<HashMap<ShardIndex, Arc<client_cache::ConnectionPool<Channel>>>>,
+
+    auth_interceptor: AuthInterceptor,
+
+    client_cache_options: ClientCacheOptions,
+
+    aggregate_metrics: Option<Arc<PageserverClientAggregateMetrics>>,
+}
+#[derive(Clone)]
+pub struct ClientCacheOptions {
+    pub max_consumers: usize,
+    pub error_threshold: usize,
+    pub connect_timeout: Duration,
+    pub connect_backoff: Duration,
+    pub max_idle_duration: Duration,
+    pub max_total_connections: usize,
+    pub max_delay_ms: u64,
+    pub drop_rate: f64,
+    pub hang_rate: f64,
+}
+
+impl PageserverClient {
+    /// TODO: this doesn't currently react to changes in the shard map.
+    pub fn new(
+        tenant_id: &str,
+        timeline_id: &str,
+        auth_token: &Option<String>,
+        shard_map: HashMap<ShardIndex, String>,
+    ) -> Self {
+        let options = ClientCacheOptions {
+            max_consumers: 5000,
+            error_threshold: 5,
+            connect_timeout: Duration::from_secs(5),
+            connect_backoff: Duration::from_secs(1),
+            max_idle_duration: Duration::from_secs(60),
+            max_total_connections: 100000,
+            max_delay_ms: 0,
+            drop_rate: 0.0,
+            hang_rate: 0.0,
+        };
+        Self::new_with_config(tenant_id, timeline_id, auth_token, shard_map, options, None)
+    }
+    pub fn new_with_config(
+        tenant_id: &str,
+        timeline_id: &str,
+        auth_token: &Option<String>,
+        shard_map: HashMap<ShardIndex, String>,
+        options: ClientCacheOptions,
+        metrics: Option<Arc<PageserverClientAggregateMetrics>>,
+    ) -> Self {
+        Self {
+            _tenant_id: tenant_id.to_string(),
+            _timeline_id: timeline_id.to_string(),
+            _auth_token: auth_token.clone(),
+            shard_map,
+            channels: RwLock::new(HashMap::new()),
+            auth_interceptor: AuthInterceptor::new(tenant_id, timeline_id, auth_token.as_deref()),
+            client_cache_options: options,
+            aggregate_metrics: metrics,
+        }
+    }
+    pub async fn process_check_rel_exists_request(
+        &self,
+        request: CheckRelExistsRequest,
+    ) -> Result<bool, PageserverClientError> {
+        // Current sharding model assumes that all metadata is present only at shard 0.
+        let shard = ShardIndex::unsharded();
+        let pooled_client = self.get_client(shard).await;
+        let chan = pooled_client.channel();
+
+        let mut client =
+            PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
+
+        let request = proto::CheckRelExistsRequest::from(request);
+        let response = client.check_rel_exists(tonic::Request::new(request)).await;
+
+        match response {
+            Err(status) => {
+                pooled_client.finish(Err(status.clone())).await; // Pass error to finish
+                Err(PageserverClientError::RequestError(status))
+            }
+            Ok(resp) => {
+                pooled_client.finish(Ok(())).await; // Pass success to finish
+                Ok(resp.get_ref().exists)
+            }
+        }
+    }
+
+    pub async fn process_get_rel_size_request(
+        &self,
+        request: GetRelSizeRequest,
+    ) -> Result<u32, PageserverClientError> {
+        // Current sharding model assumes that all metadata is present only at shard 0.
+        let shard = ShardIndex::unsharded();
+        let pooled_client = self.get_client(shard).await;
+        let chan = pooled_client.channel();
+
+        let mut client =
+            PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
+
+        let request = proto::GetRelSizeRequest::from(request);
+        let response = client.get_rel_size(tonic::Request::new(request)).await;
+
+        match response {
+            Err(status) => {
+                pooled_client.finish(Err(status.clone())).await; // Pass error to finish
+                Err(PageserverClientError::RequestError(status))
+            }
+            Ok(resp) => {
+                pooled_client.finish(Ok(())).await; // Pass success to finish
+                Ok(resp.get_ref().num_blocks)
+            }
+        }
+    }
+
+    // Request a single batch of pages
+    //
+    // TODO: This opens a new gRPC stream for every request, which is extremely inefficient
+    pub async fn get_page(
+        &self,
+        request: GetPageRequest,
+    ) -> Result<Vec<Bytes>, PageserverClientError> {
+        // FIXME: calculate the shard number correctly
+        let shard = ShardIndex::unsharded();
+        let pooled_client = self.get_client(shard).await;
+        let chan = pooled_client.channel();
+
+        let mut client =
+            PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
+
+        let request = proto::GetPageRequest::from(request);
+
+        let request_stream = futures::stream::once(std::future::ready(request));
+
+        let mut response_stream = client
+            .get_pages(tonic::Request::new(request_stream))
+            .await?
+            .into_inner();
+
+        let Some(response) = response_stream.next().await else {
+            return Err(PageserverClientError::Other(
+                "no response received for getpage request".to_string(),
+            ));
+        };
+
+        if let Some(ref metrics) = self.aggregate_metrics {
+            metrics
+                .request_counters
+                .with_label_values(&["get_page"])
+                .inc();
+        }
+
+        match response {
+            Err(status) => {
+                pooled_client.finish(Err(status.clone())).await; // Pass error to finish
+                Err(PageserverClientError::RequestError(status))
+            }
+            Ok(resp) => {
+                pooled_client.finish(Ok(())).await; // Pass success to finish
+                let response: GetPageResponse = resp.into();
+                Ok(response.page_images.to_vec())
+            }
+        }
+    }
+
+    // Open a stream for requesting pages
+    //
+    // TODO: This is a pretty low level interface, the caller should not need to be concerned
+    // with streams. But 'get_page' is currently very naive and inefficient.
+    pub async fn get_pages(
+        &self,
+        requests: impl Stream<Item = proto::GetPageRequest> + Send + 'static,
+    ) -> std::result::Result<
+        tonic::Response<tonic::codec::Streaming<proto::GetPageResponse>>,
+        PageserverClientError,
+    > {
+        // FIXME: calculate the shard number correctly
+        let shard = ShardIndex::unsharded();
+        let pooled_client = self.get_client(shard).await;
+        let chan = pooled_client.channel();
+
+        let mut client =
+            PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
+
+        let response = client.get_pages(tonic::Request::new(requests)).await;
+
+        match response {
+            Err(status) => {
+                pooled_client.finish(Err(status.clone())).await; // Pass error to finish
+                Err(PageserverClientError::RequestError(status))
+            }
+            Ok(resp) => Ok(resp),
+        }
+    }
+
+    /// Process a request to get the size of a database.
+    pub async fn process_get_dbsize_request(
+        &self,
+        request: GetDbSizeRequest,
+    ) -> Result<u64, PageserverClientError> {
+        // Current sharding model assumes that all metadata is present only at shard 0.
+        let shard = ShardIndex::unsharded();
+        let pooled_client = self.get_client(shard).await;
+        let chan = pooled_client.channel();
+
+        let mut client =
+            PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
+
+        let request = proto::GetDbSizeRequest::from(request);
+        let response = client.get_db_size(tonic::Request::new(request)).await;
+
+        match response {
+            Err(status) => {
+                pooled_client.finish(Err(status.clone())).await; // Pass error to finish
+                Err(PageserverClientError::RequestError(status))
+            }
+            Ok(resp) => {
+                pooled_client.finish(Ok(())).await; // Pass success to finish
+                Ok(resp.get_ref().num_bytes)
+            }
+        }
+    }
+    /// Process a request to get the size of a database.
+    pub async fn get_base_backup(
+        &self,
+        request: GetBaseBackupRequest,
+        gzip: bool,
+    ) -> std::result::Result<
+        tonic::Response<tonic::codec::Streaming<proto::GetBaseBackupResponseChunk>>,
+        PageserverClientError,
+    > {
+        // Current sharding model assumes that all metadata is present only at shard 0.
+        let shard = ShardIndex::unsharded();
+        let pooled_client = self.get_client(shard).await;
+        let chan = pooled_client.channel();
+
+        let mut client =
+            PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
+
+        if gzip {
+            client = client.accept_compressed(tonic::codec::CompressionEncoding::Gzip);
+        }
+
+        let request = proto::GetBaseBackupRequest::from(request);
+        let response = client.get_base_backup(tonic::Request::new(request)).await;
+
+        match response {
+            Err(status) => {
+                pooled_client.finish(Err(status.clone())).await; // Pass error to finish
+                Err(PageserverClientError::RequestError(status))
+            }
+            Ok(resp) => {
+                pooled_client.finish(Ok(())).await; // Pass success to finish
+                Ok(resp)
+            }
+        }
+    }
+    /// Get a client for given shard
+    ///
+    /// Get a client from the pool for this shard, also creating the pool if it doesn't exist.
+    ///
+    async fn get_client(&self, shard: ShardIndex) -> client_cache::PooledClient<Channel> {
+        let reused_pool: Option<Arc<client_cache::ConnectionPool<Channel>>> = {
+            let channels = self.channels.read().unwrap();
+            channels.get(&shard).cloned()
+        };
+
+        let usable_pool = match reused_pool {
+            Some(pool) => {
+                let pooled_client = pool.get_client().await.unwrap();
+                return pooled_client;
+            }
+            None => {
+                // Create a new pool using client_cache_options
+                // declare new_pool
+
+                let channel_fact = Arc::new(client_cache::ChannelFactory::new(
+                    self.shard_map.get(&shard).unwrap().clone(),
+                    self.client_cache_options.max_delay_ms,
+                    self.client_cache_options.drop_rate,
+                    self.client_cache_options.hang_rate,
+                ));
+                let new_pool = client_cache::ConnectionPool::new(
+                    channel_fact,
+                    self.client_cache_options.connect_timeout,
+                    self.client_cache_options.connect_backoff,
+                    self.client_cache_options.max_consumers,
+                    self.client_cache_options.error_threshold,
+                    self.client_cache_options.max_idle_duration,
+                    self.client_cache_options.max_total_connections,
+                    self.aggregate_metrics.clone(),
+                );
+                let mut write_pool = self.channels.write().unwrap();
+                write_pool.insert(shard, new_pool.clone());
+                new_pool.clone()
+            }
+        };
+
+        usable_pool.get_client().await.unwrap()
+    }
+}
+
+/// Inject tenant_id, timeline_id and authentication token to all pageserver requests.
+#[derive(Clone)]
+pub struct AuthInterceptor {
+    tenant_id: AsciiMetadataValue,
+    shard_id: Option<AsciiMetadataValue>,
+    timeline_id: AsciiMetadataValue,
+
+    auth_header: Option<AsciiMetadataValue>, // including "Bearer " prefix
+}
+
+impl AuthInterceptor {
+    pub fn new(tenant_id: &str, timeline_id: &str, auth_token: Option<&str>) -> Self {
+        Self {
+            tenant_id: tenant_id.parse().expect("could not parse tenant id"),
+            shard_id: None,
+            timeline_id: timeline_id.parse().expect("could not parse timeline id"),
+            auth_header: auth_token
+                .map(|t| format!("Bearer {t}"))
+                .map(|t| t.parse().expect("could not parse auth token")),
+        }
+    }
+
+    fn for_shard(&self, shard_id: ShardIndex) -> Self {
+        let mut with_shard = self.clone();
+        with_shard.shard_id = Some(
+            shard_id
+                .to_string()
+                .parse()
+                .expect("could not parse shard id"),
+        );
+        with_shard
+    }
+}
+
+impl tonic::service::Interceptor for AuthInterceptor {
+    fn call(&mut self, mut req: tonic::Request<()>) -> Result<tonic::Request<()>, tonic::Status> {
+        req.metadata_mut()
+            .insert("neon-tenant-id", self.tenant_id.clone());
+        if let Some(shard_id) = &self.shard_id {
+            req.metadata_mut().insert("neon-shard-id", shard_id.clone());
+        }
+        req.metadata_mut()
+            .insert("neon-timeline-id", self.timeline_id.clone());
+        if let Some(auth_header) = &self.auth_header {
+            req.metadata_mut()
+                .insert("authorization", auth_header.clone());
+        }
+
+        Ok(req)
+    }
+}
--- a/pageserver/client_grpc/src/pool.rs
+++ b/pageserver/client_grpc/src/pool.rs
@@ -0,0 +1,548 @@
+//! This module provides various Pageserver gRPC client resource pools.
+//!
+//! These pools are designed to reuse gRPC resources (connections, clients, and streams) across
+//! multiple callers (i.e. Postgres backends). This avoids the resource cost and latency of creating
+//! a dedicated TCP connection and server task for every Postgres backend.
+//!
+//! Each resource has its own, nested pool. The pools are custom-built for the properties of each
+//! resource -- these are different enough that a generic pool isn't suitable.
+//!
+//! * ChannelPool: manages gRPC channels (TCP connections) to a single Pageserver. Multiple clients
+//!   can acquire and use the same channel concurrently (via HTTP/2 stream multiplexing), up to a
+//!   per-channel limit. Channels may be closed when they are no longer used by any clients.
+//!
+//! * ClientPool: manages gRPC clients for a single tenant shard. Each client acquires a (shared)
+//!   channel from the ChannelPool for client's lifetime. A client can only be acquired by a single
+//!   caller at a time, and is returned to the pool when dropped. Idle clients may be removed from
+//!   the pool after some time, to free up the channel.
+//!
+//! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from
+//!   the ClientPool for the stream's lifetime. Internal streams are not exposed to callers;
+//!   instead, callers submit individual GetPage requests to the pool and await a response.
+//!   Internally, the pool will reuse or spin up a suitable stream for the request, possibly
+//!   pipelining multiple requests from multiple callers on the same stream (up to some queue
+//!   depth), and route the response back to the original caller. Idle streams may be removed from
+//!   the pool after some time, to free up the client.
+
+use std::collections::{BTreeMap, HashMap};
+use std::ops::{Deref, DerefMut};
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::{Arc, Mutex, Weak};
+
+use futures::StreamExt as _;
+use scopeguard::defer;
+use tokio::sync::mpsc::{Receiver, Sender};
+use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot};
+use tonic::transport::{Channel, Endpoint};
+use tracing::warn;
+
+use pageserver_page_api::{self as page_api, GetPageRequest, GetPageResponse};
+use utils::id::{TenantId, TimelineId};
+use utils::shard::ShardIndex;
+
+// TODO: tune these constants, and consider making them configurable.
+
+/// Max number of concurrent clients per channel.
+///
+/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels
+/// with only streams.
+const CLIENTS_PER_CHANNEL: usize = 16;
+
+/// Maximum number of concurrent clients per `ClientPool`. This bounds the number of channels as
+/// CLIENT_LIMIT / CLIENTS_PER_CHANNEL.
+const CLIENT_LIMIT: usize = 64;
+
+/// Max number of pipelined requests per gRPC GetPage stream.
+const STREAM_QUEUE_DEPTH: usize = 2;
+
+/// A gRPC channel pool, for a single Pageserver. A channel is shared by many clients (via HTTP/2
+/// stream multiplexing), up to `CLIENTS_PER_CHANNEL`. The pool does not limit the number of
+/// channels, and instead relies on `ClientPool` to limit the number of concurrent clients.
+///
+/// The pool is always wrapped in an outer `Arc`, to allow long-lived references from guards.
+///
+/// Tonic will automatically retry the underlying connection if it fails, so there is no need
+/// to re-establish connections on errors.
+///
+/// TODO: reap idle channels.
+/// TODO: consider adding a circuit breaker for errors and fail fast.
+pub struct ChannelPool {
+    /// Pageserver endpoint to connect to.
+    endpoint: Endpoint,
+    /// Open channels.
+    channels: Mutex<BTreeMap<ChannelID, ChannelEntry>>,
+}
+
+type ChannelID = usize;
+
+struct ChannelEntry {
+    /// The gRPC channel (i.e. TCP connection). Shared by multiple clients.
+    channel: Channel,
+    /// Number of clients using this channel.
+    clients: usize,
+}
+
+impl ChannelPool {
+    /// Creates a new channel pool for the given Pageserver endpoint.
+    pub fn new<E>(endpoint: E) -> anyhow::Result<Arc<Self>>
+    where
+        E: TryInto<Endpoint> + Send + Sync + 'static,
+        <E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
+    {
+        Ok(Arc::new(Self {
+            endpoint: endpoint.try_into()?,
+            channels: Default::default(),
+        }))
+    }
+
+    /// Acquires a gRPC channel for a client. Multiple clients may acquire the same channel.
+    ///
+    /// This never blocks (except for sync mutex acquisition). The channel is connected lazily on
+    /// first use, and the `ChannelPool` does not have a channel limit.
+    ///
+    /// Callers should not clone the returned channel, and must hold onto the returned guard as long
+    /// as the channel is in use. It is unfortunately not possible to enforce this: the Protobuf
+    /// client requires an owned `Channel` and we don't have access to the channel's internal
+    /// refcount.
+    ///
+    /// NB: this is not very performance-sensitive. It is only called when creating a new client,
+    /// and clients are cached and reused by ClientPool. The total number of channels will also be
+    /// small. O(n) performance is therefore okay.
+    pub fn get(self: &Arc<Self>) -> anyhow::Result<ChannelGuard> {
+        let mut channels = self.channels.lock().unwrap();
+
+        // Try to find an existing channel with available capacity. We check entries in BTreeMap
+        // order, to fill up the lower-ordered channels first. The ClientPool also uses clients with
+        // lower-ordered channel IDs first. This will cluster clients in lower-ordered channels, and
+        // free up higher-ordered channels such that they can be reaped.
+        for (&id, entry) in channels.iter_mut() {
+            assert!(entry.clients <= CLIENTS_PER_CHANNEL, "channel overflow");
+            if entry.clients < CLIENTS_PER_CHANNEL {
+                entry.clients += 1;
+                return Ok(ChannelGuard {
+                    pool: Arc::downgrade(self),
+                    id,
+                    channel: Some(entry.channel.clone()),
+                });
+            }
+        }
+
+        // Create a new channel. We connect lazily on the first use, such that we don't block here
+        // and other clients can join onto the same channel while it's connecting.
+        let channel = self.endpoint.connect_lazy();
+
+        let id = channels.keys().last().copied().unwrap_or_default();
+        let entry = ChannelEntry {
+            channel: channel.clone(),
+            clients: 1, // we're returning the guard below
+        };
+        channels.insert(id, entry);
+
+        Ok(ChannelGuard {
+            pool: Arc::downgrade(self),
+            id,
+            channel: Some(channel.clone()),
+        })
+    }
+}
+
+/// Tracks a channel acquired from the pool. The owned inner channel can be obtained with `take()`.
+/// However, the caller must hold onto the guard as long as it's using the channel, and should not
+/// clone it.
+pub struct ChannelGuard {
+    pool: Weak<ChannelPool>,
+    id: ChannelID,
+    channel: Option<Channel>,
+}
+
+impl ChannelGuard {
+    /// Returns the inner channel. Panics if called more than once. The caller must hold onto the
+    /// guard as long as the channel is in use, and should not clone it.
+    pub fn take(&mut self) -> Channel {
+        self.channel.take().expect("channel already taken")
+    }
+}
+
+/// Returns the channel to the pool.
+impl Drop for ChannelGuard {
+    fn drop(&mut self) {
+        let Some(pool) = self.pool.upgrade() else {
+            return; // pool was dropped
+        };
+        let mut channels = pool.channels.lock().unwrap();
+        let entry = channels.get_mut(&self.id).expect("unknown channel");
+        assert!(entry.clients > 0, "channel underflow");
+        entry.clients -= 1;
+    }
+}
+
+/// A pool of gRPC clients for a single tenant shard. Each client acquires a channel from the inner
+/// `ChannelPool`. A client is only acquired by a single caller at a time. The pool limits the total
+/// number of concurrent clients to `CLIENT_LIMIT` via semaphore.
+///
+/// The pool is always wrapped in an outer `Arc`, to allow long-lived references from guards.
+///
+/// TODO: reap idle clients.
+/// TODO: error handling (but channel will be reconnected automatically).
+/// TODO: rate limiting.
+pub struct ClientPool {
+    /// Tenant ID.
+    tenant_id: TenantId,
+    /// Timeline ID.
+    timeline_id: TimelineId,
+    /// Shard ID.
+    shard_id: ShardIndex,
+    /// Authentication token, if any.
+    auth_token: Option<String>,
+    /// Channel pool to acquire channels from.
+    channel_pool: Arc<ChannelPool>,
+    /// Limits the max number of concurrent clients for this pool.
+    limiter: Arc<Semaphore>,
+    /// Idle pooled clients. Acquired clients are removed from here and returned on drop.
+    ///
+    /// The first client in the map will be acquired next. The map is sorted by client ID, which in
+    /// turn is sorted by the channel ID, such that we prefer acquiring idle clients from
+    /// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle
+    /// clients are reaped.
+    idle: Mutex<BTreeMap<ClientID, ClientEntry>>,
+    /// Unique client ID generator.
+    next_client_id: AtomicUsize,
+}
+
+type ClientID = (ChannelID, usize);
+
+struct ClientEntry {
+    client: page_api::Client,
+    channel_guard: ChannelGuard,
+}
+
+impl ClientPool {
+    /// Creates a new client pool for the given tenant shard. Channels are acquired from the given
+    /// `ChannelPool`, which must point to a Pageserver that hosts the tenant shard.
+    pub fn new(
+        channel_pool: Arc<ChannelPool>,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_id: ShardIndex,
+        auth_token: Option<String>,
+    ) -> Arc<Self> {
+        Arc::new(Self {
+            tenant_id,
+            timeline_id,
+            shard_id,
+            auth_token,
+            channel_pool,
+            idle: Mutex::default(),
+            limiter: Arc::new(Semaphore::new(CLIENT_LIMIT)),
+            next_client_id: AtomicUsize::default(),
+        })
+    }
+
+    /// Gets a client from the pool, or creates a new one if necessary. Blocks if the pool is at
+    /// `CLIENT_LIMIT`. The client is returned to the pool when the guard is dropped.
+    ///
+    /// This is moderately performance-sensitive. It is called for every unary request, but recall
+    /// that these establish a new gRPC stream per request so it's already expensive. GetPage
+    /// requests use the `StreamPool` instead.
+    pub async fn get(self: &Arc<Self>) -> anyhow::Result<ClientGuard> {
+        let permit = self
+            .limiter
+            .clone()
+            .acquire_owned()
+            .await
+            .expect("never closed");
+
+        // Fast path: acquire an idle client from the pool.
+        if let Some((id, entry)) = self.idle.lock().unwrap().pop_first() {
+            return Ok(ClientGuard {
+                pool: Arc::downgrade(self),
+                id,
+                client: Some(entry.client),
+                channel_guard: Some(entry.channel_guard),
+                permit,
+            });
+        }
+
+        // Slow path: construct a new client.
+        let mut channel_guard = self.channel_pool.get()?;
+        let client = page_api::Client::new(
+            channel_guard.take(),
+            self.tenant_id,
+            self.timeline_id,
+            self.shard_id,
+            self.auth_token.clone(),
+            None,
+        )?;
+
+        Ok(ClientGuard {
+            pool: Arc::downgrade(self),
+            id: (
+                channel_guard.id,
+                self.next_client_id.fetch_add(1, Ordering::Relaxed),
+            ),
+            client: Some(client),
+            channel_guard: Some(channel_guard),
+            permit,
+        })
+    }
+}
+
+/// A client acquired from the pool. The inner client can be accessed via derefs. The client is
+/// returned to the pool when dropped.
+pub struct ClientGuard {
+    pool: Weak<ClientPool>,
+    id: ClientID,
+    client: Option<page_api::Client>,    // Some until dropped
+    channel_guard: Option<ChannelGuard>, // Some until dropped
+    permit: OwnedSemaphorePermit,
+}
+
+impl Deref for ClientGuard {
+    type Target = page_api::Client;
+
+    fn deref(&self) -> &Self::Target {
+        self.client.as_ref().expect("not dropped")
+    }
+}
+
+impl DerefMut for ClientGuard {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.client.as_mut().expect("not dropped")
+    }
+}
+
+// Returns the client to the pool.
+impl Drop for ClientGuard {
+    fn drop(&mut self) {
+        let Some(pool) = self.pool.upgrade() else {
+            return; // pool was dropped
+        };
+        let entry = ClientEntry {
+            client: self.client.take().expect("dropped once"),
+            channel_guard: self.channel_guard.take().expect("dropped once"),
+        };
+        pool.idle.lock().unwrap().insert(self.id, entry);
+
+        // The permit will be returned by its drop handler. Tag it here for visibility.
+        _ = self.permit;
+    }
+}
+
+/// A pool of bidirectional gRPC streams. Currently only used for GetPage streams. Each stream
+/// acquires a client from the inner `ClientPool` for the stream's lifetime.
+///
+/// Individual streams are not exposed to callers -- instead, callers submit invididual requests to
+/// the pool and await a response. Internally, requests are multiplexed across streams and channels.
+///
+/// TODO: reap idle streams.
+/// TODO: error handling (but channel will be reconnected automatically).
+/// TODO: rate limiting.
+/// TODO: consider making this generic over request and response types; not currently needed.
+pub struct StreamPool {
+    /// The client pool to acquire clients from.
+    client_pool: Arc<ClientPool>,
+    /// All pooled streams.
+    ///
+    /// Incoming requests will be sent over an existing stream with available capacity, or a new
+    /// stream is spun up and added to the pool. Each stream has an associated Tokio task that
+    /// processes requests and responses.
+    streams: Arc<Mutex<HashMap<StreamID, StreamEntry>>>,
+    /// Limits the max number of concurrent requests (not streams).
+    limiter: Semaphore,
+    /// Stream ID generator.
+    next_stream_id: AtomicUsize,
+}
+
+type StreamID = usize;
+type RequestSender = Sender<(GetPageRequest, ResponseSender)>;
+type RequestReceiver = Receiver<(GetPageRequest, ResponseSender)>;
+type ResponseSender = oneshot::Sender<tonic::Result<GetPageResponse>>;
+
+struct StreamEntry {
+    /// Sends caller requests to the stream task. The stream task exits when this is dropped.
+    sender: RequestSender,
+    /// Number of in-flight requests on this stream. This is an atomic to allow decrementing it on
+    /// completion without acquiring the `StreamPool::streams` lock.
+    queue_depth: Arc<AtomicUsize>,
+}
+
+impl StreamPool {
+    /// Creates a new stream pool, using the given client pool.
+    pub fn new(client_pool: Arc<ClientPool>) -> Self {
+        Self {
+            client_pool,
+            streams: Arc::default(),
+            limiter: Semaphore::new(CLIENT_LIMIT * STREAM_QUEUE_DEPTH),
+            next_stream_id: AtomicUsize::default(),
+        }
+    }
+
+    /// Sends a request via the stream pool and awaits the response. Blocks if the pool is at
+    /// capacity (i.e. `CLIENT_LIMIT * STREAM_QUEUE_DEPTH` requests in flight). The
+    /// `GetPageRequest::request_id` must be unique across in-flight request.
+    ///
+    /// NB: errors are often returned as `GetPageResponse::status_code` instead of `tonic::Status`
+    /// to avoid tearing down the stream for per-request errors. Callers must check this.
+    ///
+    /// This is very performance-sensitive, as it is on the GetPage hot path.
+    ///
+    /// TODO: this must do something more sophisticated for performance. We want:
+    /// * Cheap, concurrent access in the common case where we can use a pooled stream.
+    /// * Quick acquisition of pooled streams with available capacity.
+    /// * Prefer streams that belong to lower-numbered channels, to reap idle channels.
+    /// * Prefer filling up existing streams' queue depth before spinning up new streams.
+    /// * Don't hold a lock while spinning up new streams.
+    /// * Allow concurrent clients to join onto streams while they're spun up.
+    /// * Allow spinning up multiple streams concurrently, but don't overshoot limits.
+    ///
+    /// For now, we just do something simple and functional, but very inefficient (linear scan).
+    pub async fn send(&self, req: GetPageRequest) -> tonic::Result<GetPageResponse> {
+        // Acquire a permit. For simplicity, we drop it when this method returns. This may exceed
+        // the queue depth if a caller goes away while a request is in flight, but that's okay. We
+        // do the same for queue depth tracking.
+        let _permit = self.limiter.acquire().await.expect("never closed");
+
+        // Acquire a stream sender. We increment and decrement the queue depth here instead of in
+        // the stream task to ensure we don't exceed the queue depth limit.
+        #[allow(clippy::await_holding_lock)] // TODO: Clippy doesn't understand drop()
+        let (req_tx, queue_depth) = async {
+            let mut streams = self.streams.lock().unwrap();
+
+            // Try to find an existing stream with available capacity.
+            for entry in streams.values() {
+                assert!(
+                    entry.queue_depth.load(Ordering::Relaxed) <= STREAM_QUEUE_DEPTH,
+                    "stream overflow"
+                );
+                if entry
+                    .queue_depth
+                    .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |queue_depth| {
+                        // Increment the queue depth via compare-and-swap.
+                        // TODO: review ordering.
+                        (queue_depth < STREAM_QUEUE_DEPTH).then_some(queue_depth + 1)
+                    })
+                    .is_ok()
+                {
+                    return anyhow::Ok((entry.sender.clone(), entry.queue_depth.clone()));
+                }
+            }
+
+            // No available stream, spin up a new one. We install the stream entry first and release
+            // the lock, to allow other callers to join onto this stream and also create additional
+            // streams concurrently when this fills up.
+            let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed);
+            let queue_depth = Arc::new(AtomicUsize::new(1)); // account for this request
+            let (req_tx, req_rx) = mpsc::channel(STREAM_QUEUE_DEPTH);
+            let entry = StreamEntry {
+                sender: req_tx.clone(),
+                queue_depth: queue_depth.clone(),
+            };
+            streams.insert(id, entry);
+
+            drop(streams); // drop lock before spinning up stream
+
+            let client_pool = self.client_pool.clone();
+            let streams = self.streams.clone();
+
+            tokio::spawn(async move {
+                if let Err(err) = Self::run_stream(client_pool, req_rx).await {
+                    warn!("stream failed: {err}");
+                }
+                // Remove stream from pool on exit.
+                let entry = streams.lock().unwrap().remove(&id);
+                assert!(entry.is_some(), "unknown stream ID: {id}");
+            });
+
+            anyhow::Ok((req_tx, queue_depth))
+        }
+        .await
+        .map_err(|err| tonic::Status::internal(err.to_string()))?;
+
+        // Decrement the queue depth on return. This may prematurely decrement it if the caller goes
+        // away while the request is in flight, but that's okay.
+        defer!(
+            let prev_queue_depth = queue_depth.fetch_sub(1, Ordering::SeqCst);
+            assert!(prev_queue_depth > 0, "stream underflow");
+        );
+
+        // Send the request and wait for the response.
+        let (resp_tx, resp_rx) = oneshot::channel();
+
+        req_tx
+            .send((req, resp_tx))
+            .await
+            .map_err(|_| tonic::Status::unavailable("stream closed"))?;
+
+        resp_rx
+            .await
+            .map_err(|_| tonic::Status::unavailable("stream closed"))?
+    }
+
+    /// Runs a stream task. This acquires a client from the `ClientPool` and establishes a
+    /// bidirectional GetPage stream, then forwards requests and responses between callers and the
+    /// stream. It does not track or enforce queue depths, see `send()`.
+    ///
+    /// The task exits when the request channel is closed, or on a stream error. The caller is
+    /// responsible for removing the stream from the pool on exit.
+    async fn run_stream(
+        client_pool: Arc<ClientPool>,
+        mut caller_rx: RequestReceiver,
+    ) -> anyhow::Result<()> {
+        // Acquire a client from the pool and create a stream.
+        let mut client = client_pool.get().await?;
+
+        let (req_tx, req_rx) = mpsc::channel(STREAM_QUEUE_DEPTH);
+        let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx);
+        let mut resp_stream = client.get_pages(req_stream).await?;
+
+        // Track caller response channels by request ID. If the task returns early, these response
+        // channels will be dropped and the callers will receive an error.
+        let mut callers = HashMap::with_capacity(STREAM_QUEUE_DEPTH);
+
+        // Process requests and responses.
+        loop {
+            // NB: this can trip if the server doesn't respond to a request, so only debug_assert.
+            debug_assert!(callers.len() <= STREAM_QUEUE_DEPTH, "stream overflow");
+
+            tokio::select! {
+                // Receive requests from callers and send them to the stream.
+                req = caller_rx.recv() => {
+                    // Shut down if request channel is closed.
+                    let Some((req, resp_tx)) = req else {
+                        return Ok(());
+                    };
+
+                    // Store the response channel by request ID.
+                    if callers.contains_key(&req.request_id) {
+                        // Error on request ID duplicates. Ignore callers that went away.
+                        _ = resp_tx.send(Err(tonic::Status::invalid_argument(
+                            format!("duplicate request ID: {}", req.request_id),
+                        )));
+                        continue;
+                    }
+                    callers.insert(req.request_id, resp_tx);
+
+                    // Send the request on the stream. Bail out if the send fails.
+                    req_tx.send(req).await.map_err(|_| {
+                        tonic::Status::unavailable("stream closed")
+                    })?;
+                }
+
+                // Receive responses from the stream and send them to callers.
+                resp = resp_stream.next() => {
+                    // Shut down if the stream is closed, and bail out on stream errors.
+                    let Some(resp) = resp.transpose()? else {
+                        return Ok(())
+                    };
+
+                    // Send the response to the caller. Ignore errors if the caller went away.
+                    let Some(resp_tx) = callers.remove(&resp.request_id) else {
+                        warn!("received response for unknown request ID: {}", resp.request_id);
+                        continue;
+                    };
+                    _ = resp_tx.send(Ok(resp));
+                }
+            }
+        }
+    }
+}
--- a/pageserver/client_grpc/src/request_tracker.rs
+++ b/pageserver/client_grpc/src/request_tracker.rs
@@ -0,0 +1,577 @@
+//! The request tracker dispatches GetPage- and other requests to pageservers, managing a pool of
+//! connections and gRPC streams.
+//!
+//! There is usually one global instance of ShardedRequestTracker in an application, in particular
+//! in the neon extension's communicator process. The application calls the async functions in
+//! ShardedRequestTracker, which routes them to the correct pageservers, taking sharding into
+//! account. In the future, there can be multiple pageservers per shard, and RequestTracker manages
+//! load balancing between them, but that's not implemented yet.
+
+use crate::AuthInterceptor;
+use crate::ClientCacheOptions;
+use crate::PageserverClientAggregateMetrics;
+use crate::client_cache;
+use crate::client_cache::ChannelFactory;
+use crate::client_cache::ConnectionPool;
+use pageserver_page_api::GetPageRequest;
+use pageserver_page_api::GetPageResponse;
+use pageserver_page_api::proto;
+use pageserver_page_api::*;
+use std::sync::Arc;
+use std::sync::atomic::AtomicU64;
+use tonic::{Request, transport::Channel};
+
+use utils::shard::ShardIndex;
+
+use pageserver_page_api::proto::PageServiceClient;
+use tokio_stream::wrappers::ReceiverStream;
+
+use tonic::{Code, Status};
+
+use async_trait::async_trait;
+use std::time::Duration;
+
+use client_cache::PooledItemFactory;
+
+/// StreamReturner represents a gRPC stream to a pageserver.
+///
+/// To send a request:
+/// 1. insert the request's ID, along with a channel to receive the response
+/// 2. send the request to 'sender'
+#[derive(Clone)]
+pub struct StreamReturner {
+    sender: tokio::sync::mpsc::Sender<proto::GetPageRequest>,
+    #[allow(clippy::type_complexity)]
+    sender_hashmap: Arc<
+        tokio::sync::Mutex<
+            Option<
+                std::collections::HashMap<
+                    u64,
+                    tokio::sync::mpsc::Sender<Result<proto::GetPageResponse, Status>>,
+                >,
+            >,
+        >,
+    >,
+}
+
+pub struct StreamFactory {
+    connection_pool: Arc<client_cache::ConnectionPool<Channel>>,
+    auth_interceptor: AuthInterceptor,
+    shard: ShardIndex,
+}
+
+impl StreamFactory {
+    pub fn new(
+        connection_pool: Arc<ConnectionPool<Channel>>,
+        auth_interceptor: AuthInterceptor,
+        shard: ShardIndex,
+    ) -> Self {
+        StreamFactory {
+            connection_pool,
+            auth_interceptor,
+            shard,
+        }
+    }
+}
+
+#[async_trait]
+impl PooledItemFactory<StreamReturner> for StreamFactory {
+    async fn create(
+        &self,
+        _connect_timeout: Duration,
+    ) -> Result<Result<StreamReturner, tonic::Status>, tokio::time::error::Elapsed> {
+        let pool_clone: Arc<ConnectionPool<Channel>> = Arc::clone(&self.connection_pool);
+        let pooled_client = pool_clone.get_client().await;
+        let channel = pooled_client.unwrap().channel();
+        let mut client = PageServiceClient::with_interceptor(
+            channel,
+            self.auth_interceptor.for_shard(self.shard),
+        );
+
+        let (sender, receiver) = tokio::sync::mpsc::channel::<proto::GetPageRequest>(1000);
+        let outbound = ReceiverStream::new(receiver);
+
+        let client_resp = client.get_pages(Request::new(outbound)).await;
+
+        match client_resp {
+            Err(status) => {
+                // TODO: Convert this error correctly
+                Ok(Err(tonic::Status::new(
+                    status.code(),
+                    format!("Failed to connect to pageserver: {}", status.message()),
+                )))
+            }
+            Ok(resp) => {
+                let stream_returner = StreamReturner {
+                    sender: sender.clone(),
+                    sender_hashmap: Arc::new(tokio::sync::Mutex::new(Some(
+                        std::collections::HashMap::new(),
+                    ))),
+                };
+                let map = Arc::clone(&stream_returner.sender_hashmap);
+
+                tokio::spawn(async move {
+                    let map_clone = Arc::clone(&map);
+                    let mut inner = resp.into_inner();
+                    loop {
+                        match inner.message().await {
+                            Err(e) => {
+                                tracing::info!("error received on getpage stream: {e}");
+                                break; // Exit the loop if no more messages
+                            }
+                            Ok(None) => {
+                                break; // Sender closed the stream
+                            }
+                            Ok(Some(response)) => {
+                                // look up stream in hash map
+                                let mut hashmap = map_clone.lock().await;
+                                let hashmap =
+                                    hashmap.as_mut().expect("no other task clears the hashmap");
+                                if let Some(sender) = hashmap.get(&response.request_id) {
+                                    // Send the response to the original request sender
+                                    if let Err(e) = sender.send(Ok(response.clone())).await {
+                                        eprintln!("Failed to send response: {e}");
+                                    }
+                                    hashmap.remove(&response.request_id);
+                                } else {
+                                    eprintln!(
+                                        "No sender found for request ID: {}",
+                                        response.request_id
+                                    );
+                                }
+                            }
+                        }
+                    }
+                    // Don't accept any more requests
+
+                    // Close every sender stream in the hashmap
+                    let mut hashmap_opt = map_clone.lock().await;
+                    let hashmap = hashmap_opt
+                        .as_mut()
+                        .expect("no other task clears the hashmap");
+                    for sender in hashmap.values() {
+                        let error = Status::new(Code::Unknown, "Stream closed");
+                        if let Err(e) = sender.send(Err(error)).await {
+                            eprintln!("Failed to send close response: {e}");
+                        }
+                    }
+                    *hashmap_opt = None;
+                });
+
+                Ok(Ok(stream_returner))
+            }
+        }
+    }
+}
+
+#[derive(Clone)]
+pub struct RequestTracker {
+    _cur_id: Arc<AtomicU64>,
+    stream_pool: Arc<ConnectionPool<StreamReturner>>,
+    unary_pool: Arc<ConnectionPool<Channel>>,
+    auth_interceptor: AuthInterceptor,
+    shard: ShardIndex,
+}
+
+impl RequestTracker {
+    pub fn new(
+        stream_pool: Arc<ConnectionPool<StreamReturner>>,
+        unary_pool: Arc<ConnectionPool<Channel>>,
+        auth_interceptor: AuthInterceptor,
+        shard: ShardIndex,
+    ) -> Self {
+        let cur_id = Arc::new(AtomicU64::new(0));
+
+        RequestTracker {
+            _cur_id: cur_id.clone(),
+            stream_pool,
+            unary_pool,
+            auth_interceptor,
+            shard,
+        }
+    }
+
+    pub async fn send_process_check_rel_exists_request(
+        &self,
+        req: CheckRelExistsRequest,
+    ) -> Result<bool, tonic::Status> {
+        loop {
+            let unary_pool = Arc::clone(&self.unary_pool);
+            let pooled_client = unary_pool.get_client().await.unwrap();
+            let channel = pooled_client.channel();
+            let mut ps_client = PageServiceClient::with_interceptor(
+                channel,
+                self.auth_interceptor.for_shard(self.shard),
+            );
+            let request = proto::CheckRelExistsRequest::from(req);
+            let response = ps_client
+                .check_rel_exists(tonic::Request::new(request))
+                .await;
+
+            match response {
+                Err(status) => {
+                    pooled_client.finish(Err(status.clone())).await; // Pass error to finish
+                    continue;
+                }
+                Ok(resp) => {
+                    pooled_client.finish(Ok(())).await; // Pass success to finish
+                    return Ok(resp.get_ref().exists);
+                }
+            }
+        }
+    }
+
+    pub async fn send_process_get_rel_size_request(
+        &self,
+        req: GetRelSizeRequest,
+    ) -> Result<u32, tonic::Status> {
+        loop {
+            // Current sharding model assumes that all metadata is present only at shard 0.
+            let unary_pool = Arc::clone(&self.unary_pool);
+            let pooled_client = unary_pool.get_client().await.unwrap();
+            let channel = pooled_client.channel();
+            let mut ps_client = PageServiceClient::with_interceptor(
+                channel,
+                self.auth_interceptor.for_shard(self.shard),
+            );
+
+            let request = proto::GetRelSizeRequest::from(req);
+            let response = ps_client.get_rel_size(tonic::Request::new(request)).await;
+
+            match response {
+                Err(status) => {
+                    pooled_client.finish(Err(status.clone())).await; // Pass error to finish
+                    continue;
+                }
+                Ok(resp) => {
+                    pooled_client.finish(Ok(())).await; // Pass success to finish
+                    return Ok(resp.get_ref().num_blocks);
+                }
+            }
+        }
+    }
+
+    pub async fn send_process_get_dbsize_request(
+        &self,
+        req: GetDbSizeRequest,
+    ) -> Result<u64, tonic::Status> {
+        loop {
+            // Current sharding model assumes that all metadata is present only at shard 0.
+            let unary_pool = Arc::clone(&self.unary_pool);
+            let pooled_client = unary_pool.get_client().await.unwrap();
+            let channel = pooled_client.channel();
+            let mut ps_client = PageServiceClient::with_interceptor(
+                channel,
+                self.auth_interceptor.for_shard(self.shard),
+            );
+
+            let request = proto::GetDbSizeRequest::from(req);
+            let response = ps_client.get_db_size(tonic::Request::new(request)).await;
+
+            match response {
+                Err(status) => {
+                    pooled_client.finish(Err(status.clone())).await; // Pass error to finish
+                    continue;
+                }
+                Ok(resp) => {
+                    pooled_client.finish(Ok(())).await; // Pass success to finish
+                    return Ok(resp.get_ref().num_bytes);
+                }
+            }
+        }
+    }
+
+    pub async fn send_getpage_request(
+        &mut self,
+        req: GetPageRequest,
+    ) -> Result<GetPageResponse, tonic::Status> {
+        loop {
+            let request = req.clone();
+            // Increment cur_id
+            //let request_id = self.cur_id.fetch_add(1, Ordering::SeqCst) + 1;
+            let request_id = request.request_id;
+            let response_sender: tokio::sync::mpsc::Sender<Result<proto::GetPageResponse, Status>>;
+            let mut response_receiver: tokio::sync::mpsc::Receiver<
+                Result<proto::GetPageResponse, Status>,
+            >;
+
+            (response_sender, response_receiver) = tokio::sync::mpsc::channel(1);
+            //request.request_id = request_id;
+
+            // Get a stream from the stream pool
+            let pool_clone = Arc::clone(&self.stream_pool);
+            let sender_stream_pool = pool_clone.get_client().await;
+            let stream_returner = match sender_stream_pool {
+                Ok(stream_ret) => stream_ret,
+                Err(_e) => {
+                    // retry
+                    continue;
+                }
+            };
+            let returner = stream_returner.channel();
+            let map = returner.sender_hashmap.clone();
+            // Insert the response sender into the hashmap
+            {
+                if let Some(map_inner) = map.lock().await.as_mut() {
+                    let old = map_inner.insert(request_id, response_sender);
+
+                    // request IDs must be unique
+                    if old.is_some() {
+                        panic!("request with ID {request_id} is already in-flight");
+                    }
+                } else {
+                    // The stream was closed. Try a different one.
+                    tracing::info!("stream was concurrently closed");
+                    continue;
+                }
+            }
+            let sent = returner
+                .sender
+                .send(proto::GetPageRequest::from(request))
+                .await;
+
+            if let Err(_e) = sent {
+                // Remove the request from the map if sending failed
+                {
+                    if let Some(map_inner) = map.lock().await.as_mut() {
+                        // remove from hashmap
+                        map_inner.remove(&request_id);
+                    }
+                }
+                stream_returner
+                    .finish(Err(Status::new(Code::Unknown, "Failed to send request")))
+                    .await;
+                continue;
+            }
+
+            let response = response_receiver.recv().await;
+            match response {
+                Some(resp) => {
+                    match resp {
+                        Err(_status) => {
+                            // Handle the case where the response was not received
+                            stream_returner
+                                .finish(Err(Status::new(
+                                    Code::Unknown,
+                                    "Failed to receive response",
+                                )))
+                                .await;
+                            continue;
+                        }
+                        Ok(resp) => {
+                            stream_returner.finish(Result::Ok(())).await;
+                            return Ok(resp.clone().into());
+                        }
+                    }
+                }
+                None => {
+                    // Handle the case where the response channel was closed
+                    stream_returner
+                        .finish(Err(Status::new(Code::Unknown, "Response channel closed")))
+                        .await;
+                    continue;
+                }
+            }
+        }
+    }
+}
+
+struct ShardedRequestTrackerInner {
+    // Hashmap of shard index to RequestTracker
+    trackers: std::collections::HashMap<ShardIndex, RequestTracker>,
+}
+pub struct ShardedRequestTracker {
+    inner: Arc<std::sync::Mutex<ShardedRequestTrackerInner>>,
+    tcp_client_cache_options: ClientCacheOptions,
+    stream_client_cache_options: ClientCacheOptions,
+}
+
+//
+// TODO: Functions in the ShardedRequestTracker should be able to timeout and
+// cancel a reqeust. The request should return an error if it is cancelled.
+//
+
+impl Default for ShardedRequestTracker {
+    fn default() -> Self {
+        ShardedRequestTracker::new()
+    }
+}
+
+impl ShardedRequestTracker {
+    pub fn new() -> Self {
+        //
+        // Default configuration for the client. These could be added to a config file
+        //
+        let tcp_client_cache_options = ClientCacheOptions {
+            max_delay_ms: 0,
+            drop_rate: 0.0,
+            hang_rate: 0.0,
+            connect_timeout: Duration::from_secs(1),
+            connect_backoff: Duration::from_millis(100),
+            max_consumers: 8, // Streams per connection
+            error_threshold: 10,
+            max_idle_duration: Duration::from_secs(5),
+            max_total_connections: 8,
+        };
+        let stream_client_cache_options = ClientCacheOptions {
+            max_delay_ms: 0,
+            drop_rate: 0.0,
+            hang_rate: 0.0,
+            connect_timeout: Duration::from_secs(1),
+            connect_backoff: Duration::from_millis(100),
+            max_consumers: 64, // Requests per stream
+            error_threshold: 10,
+            max_idle_duration: Duration::from_secs(5),
+            max_total_connections: 64, // Total allowable number of streams
+        };
+        ShardedRequestTracker {
+            inner: Arc::new(std::sync::Mutex::new(ShardedRequestTrackerInner {
+                trackers: std::collections::HashMap::new(),
+            })),
+            tcp_client_cache_options,
+            stream_client_cache_options,
+        }
+    }
+
+    pub async fn update_shard_map(
+        &self,
+        shard_urls: std::collections::HashMap<ShardIndex, String>,
+        metrics: Option<Arc<PageserverClientAggregateMetrics>>,
+        tenant_id: String,
+        timeline_id: String,
+        auth_str: Option<&str>,
+    ) {
+        let mut trackers = std::collections::HashMap::new();
+        for (shard, endpoint_url) in shard_urls {
+            //
+            // Create a pool of streams for streaming get_page requests
+            //
+            let channel_fact: Arc<dyn PooledItemFactory<Channel> + Send + Sync> =
+                Arc::new(ChannelFactory::new(
+                    endpoint_url.clone(),
+                    self.tcp_client_cache_options.max_delay_ms,
+                    self.tcp_client_cache_options.drop_rate,
+                    self.tcp_client_cache_options.hang_rate,
+                ));
+            let new_pool = ConnectionPool::new(
+                Arc::clone(&channel_fact),
+                self.tcp_client_cache_options.connect_timeout,
+                self.tcp_client_cache_options.connect_backoff,
+                self.tcp_client_cache_options.max_consumers,
+                self.tcp_client_cache_options.error_threshold,
+                self.tcp_client_cache_options.max_idle_duration,
+                self.tcp_client_cache_options.max_total_connections,
+                metrics.clone(),
+            );
+
+            let auth_interceptor =
+                AuthInterceptor::new(tenant_id.as_str(), timeline_id.as_str(), auth_str);
+
+            let stream_pool = ConnectionPool::<StreamReturner>::new(
+                Arc::new(StreamFactory::new(
+                    new_pool.clone(),
+                    auth_interceptor.clone(),
+                    ShardIndex::unsharded(),
+                )),
+                self.stream_client_cache_options.connect_timeout,
+                self.stream_client_cache_options.connect_backoff,
+                self.stream_client_cache_options.max_consumers,
+                self.stream_client_cache_options.error_threshold,
+                self.stream_client_cache_options.max_idle_duration,
+                self.stream_client_cache_options.max_total_connections,
+                metrics.clone(),
+            );
+
+            //
+            // Create a client pool for unary requests
+            //
+
+            let unary_pool = ConnectionPool::new(
+                Arc::clone(&channel_fact),
+                self.tcp_client_cache_options.connect_timeout,
+                self.tcp_client_cache_options.connect_backoff,
+                self.tcp_client_cache_options.max_consumers,
+                self.tcp_client_cache_options.error_threshold,
+                self.tcp_client_cache_options.max_idle_duration,
+                self.tcp_client_cache_options.max_total_connections,
+                metrics.clone(),
+            );
+            //
+            // Create a new RequestTracker for this shard
+            //
+            let new_tracker = RequestTracker::new(stream_pool, unary_pool, auth_interceptor, shard);
+            trackers.insert(shard, new_tracker);
+        }
+        let mut inner = self.inner.lock().unwrap();
+        inner.trackers = trackers;
+    }
+
+    pub async fn get_page(&self, req: GetPageRequest) -> Result<GetPageResponse, tonic::Status> {
+        // Get shard index from the request and look up the RequestTracker instance for that shard
+        let shard_index = ShardIndex::unsharded(); // TODO!
+        let mut tracker = self.lookup_tracker_for_shard(shard_index)?;
+
+        let response = tracker.send_getpage_request(req).await;
+        match response {
+            Ok(resp) => Ok(resp),
+            Err(e) => Err(tonic::Status::unknown(format!("Failed to get page: {e}"))),
+        }
+    }
+
+    pub async fn process_get_dbsize_request(
+        &self,
+        request: GetDbSizeRequest,
+    ) -> Result<u64, tonic::Status> {
+        // Current sharding model assumes that all metadata is present only at shard 0.
+        let tracker = self.lookup_tracker_for_shard(ShardIndex::unsharded())?;
+
+        let response = tracker.send_process_get_dbsize_request(request).await;
+        match response {
+            Ok(resp) => Ok(resp),
+            Err(e) => Err(e),
+        }
+    }
+
+    pub async fn process_get_rel_size_request(
+        &self,
+        request: GetRelSizeRequest,
+    ) -> Result<u32, tonic::Status> {
+        // Current sharding model assumes that all metadata is present only at shard 0.
+        let tracker = self.lookup_tracker_for_shard(ShardIndex::unsharded())?;
+
+        let response = tracker.send_process_get_rel_size_request(request).await;
+        match response {
+            Ok(resp) => Ok(resp),
+            Err(e) => Err(e),
+        }
+    }
+
+    pub async fn process_check_rel_exists_request(
+        &self,
+        request: CheckRelExistsRequest,
+    ) -> Result<bool, tonic::Status> {
+        // Current sharding model assumes that all metadata is present only at shard 0.
+        let tracker = self.lookup_tracker_for_shard(ShardIndex::unsharded())?;
+
+        let response = tracker.send_process_check_rel_exists_request(request).await;
+        match response {
+            Ok(resp) => Ok(resp),
+            Err(e) => Err(e),
+        }
+    }
+
+    #[allow(clippy::result_large_err)]
+    fn lookup_tracker_for_shard(
+        &self,
+        shard_index: ShardIndex,
+    ) -> Result<RequestTracker, tonic::Status> {
+        let inner = self.inner.lock().unwrap();
+        if let Some(t) = inner.trackers.get(&shard_index) {
+            Ok(t.clone())
+        } else {
+            Err(tonic::Status::not_found(format!(
+                "Shard {shard_index} not found",
+            )))
+        }
+    }
+}
--- a/pageserver/page_api/Cargo.toml
+++ b/pageserver/page_api/Cargo.toml
@@ -9,12 +9,14 @@ anyhow.workspace = true
 bytes.workspace = true
 futures.workspace = true
 pageserver_api.workspace = true
-postgres_ffi.workspace = true
+postgres_ffi_types.workspace = true
 prost.workspace = true
+prost-types.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
+tokio-util.workspace = true
 tonic.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
--- a/pageserver/page_api/proto/page_service.proto
+++ b/pageserver/page_api/proto/page_service.proto
@@ -35,6 +35,8 @@
 syntax = "proto3";
 package page_api;

+import "google/protobuf/timestamp.proto";
+
 service PageService {
  // Returns whether a relation exists.
  rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse);
@@ -64,6 +66,10 @@ service PageService {

  // Fetches an SLRU segment.
  rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse);
+
+  // Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage
+  // collect the LSN until the lease expires. Must be acquired on all relevant shards.
+  rpc LeaseLsn (LeaseLsnRequest) returns (LeaseLsnResponse);
 }

 // The LSN a request should read at.
@@ -110,6 +116,19 @@ message GetBaseBackupRequest {
  bool replica = 2;
  // If true, include relation files in the base backup. Mainly for debugging and tests.
  bool full = 3;
+  // Compression algorithm to use. Base backups send a compressed payload instead of using gRPC
+  // compression, so that we can cache compressed backups on the server.
+  BaseBackupCompression compression = 4;
+}
+
+// Base backup compression algorithms.
+enum BaseBackupCompression {
+  // Unknown algorithm. Used when clients send an unsupported algorithm.
+  BASE_BACKUP_COMPRESSION_UNKNOWN = 0;
+  // No compression.
+  BASE_BACKUP_COMPRESSION_NONE = 1;
+  // GZIP compression.
+  BASE_BACKUP_COMPRESSION_GZIP = 2;
 }

 // Base backup response chunk, returned as an ordered stream.
@@ -239,3 +258,17 @@ message GetSlruSegmentRequest {
 message GetSlruSegmentResponse {
  bytes segment = 1;
 }
+
+// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage
+// collect the LSN until the lease expires. Must be acquired on all relevant shards.
+message LeaseLsnRequest {
+  // The LSN to lease. Can't be 0 or below the current GC cutoff.
+  uint64 lsn = 1;
+}
+
+// Lease acquisition response. If the lease could not be granted because the LSN has already been
+// garbage collected, a FailedPrecondition status will be returned instead.
+message LeaseLsnResponse {
+  // The lease expiration time.
+  google.protobuf.Timestamp expires = 1;
+}
--- a/pageserver/page_api/src/client.rs
+++ b/pageserver/page_api/src/client.rs
@@ -1,26 +1,153 @@
-use std::convert::TryInto;
-
-use bytes::Bytes;
-use futures::TryStreamExt;
-use futures::{Stream, StreamExt};
+use anyhow::anyhow;
+use futures::{Stream, StreamExt as _, TryStreamExt as _};
+use tokio::io::AsyncRead;
+use tokio_util::io::StreamReader;
+use tonic::codec::CompressionEncoding;
 use tonic::metadata::AsciiMetadataValue;
-use tonic::metadata::errors::InvalidMetadataValue;
-use tonic::transport::Channel;
-use tonic::{Request, Streaming};
+use tonic::service::Interceptor;
+use tonic::service::interceptor::InterceptedService;
+use tonic::transport::{Channel, Endpoint};

-use utils::id::TenantId;
-use utils::id::TimelineId;
+use utils::id::{TenantId, TimelineId};
 use utils::shard::ShardIndex;

-use anyhow::Result;
-
-use crate::model;
+use crate::model::*;
 use crate::proto;

-///
-/// AuthInterceptor adds tenant, timeline, and auth header to the channel. These
-/// headers are required at the pageserver.
-///
+/// A basic Pageserver gRPC client, for a single tenant shard. This API uses native Rust domain
+/// types from `model` rather than generated Protobuf types.
+pub struct Client {
+    inner: proto::PageServiceClient<InterceptedService<Channel, AuthInterceptor>>,
+}
+
+impl Client {
+    /// Connects to the given gRPC endpoint.
+    pub async fn connect<E>(
+        endpoint: E,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_id: ShardIndex,
+        auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
+    ) -> anyhow::Result<Self>
+    where
+        E: TryInto<Endpoint> + Send + Sync + 'static,
+        <E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
+    {
+        let endpoint: Endpoint = endpoint
+            .try_into()
+            .map_err(|err| anyhow!("invalid endpoint: {err}"))?;
+        let channel = endpoint.connect().await?;
+        Self::new(
+            channel,
+            tenant_id,
+            timeline_id,
+            shard_id,
+            auth_token,
+            compression,
+        )
+    }
+
+    /// Creates a new client using the given gRPC channel.
+    pub fn new(
+        channel: Channel,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_id: ShardIndex,
+        auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
+    ) -> anyhow::Result<Self> {
+        let auth = AuthInterceptor::new(tenant_id, timeline_id, shard_id, auth_token)?;
+        let mut inner = proto::PageServiceClient::with_interceptor(channel, auth);
+
+        if let Some(compression) = compression {
+            // TODO: benchmark this (including network latency).
+            inner = inner
+                .accept_compressed(compression)
+                .send_compressed(compression);
+        }
+
+        Ok(Self { inner })
+    }
+
+    /// Returns whether a relation exists.
+    pub async fn check_rel_exists(
+        &mut self,
+        req: CheckRelExistsRequest,
+    ) -> tonic::Result<CheckRelExistsResponse> {
+        let req = proto::CheckRelExistsRequest::from(req);
+        let resp = self.inner.check_rel_exists(req).await?.into_inner();
+        Ok(resp.into())
+    }
+
+    /// Fetches a base backup.
+    pub async fn get_base_backup(
+        &mut self,
+        req: GetBaseBackupRequest,
+    ) -> tonic::Result<impl AsyncRead + use<>> {
+        let req = proto::GetBaseBackupRequest::from(req);
+        let chunks = self.inner.get_base_backup(req).await?.into_inner();
+        Ok(StreamReader::new(
+            chunks
+                .map_ok(|resp| resp.chunk)
+                .map_err(std::io::Error::other),
+        ))
+    }
+
+    /// Returns the total size of a database, as # of bytes.
+    pub async fn get_db_size(&mut self, req: GetDbSizeRequest) -> tonic::Result<GetDbSizeResponse> {
+        let req = proto::GetDbSizeRequest::from(req);
+        let resp = self.inner.get_db_size(req).await?.into_inner();
+        Ok(resp.into())
+    }
+
+    /// Fetches pages.
+    ///
+    /// This is implemented as a bidirectional streaming RPC for performance. Per-request errors are
+    /// typically returned as status_code instead of errors, to avoid tearing down the entire stream
+    /// via a tonic::Status error.
+    pub async fn get_pages(
+        &mut self,
+        reqs: impl Stream<Item = GetPageRequest> + Send + 'static,
+    ) -> tonic::Result<impl Stream<Item = tonic::Result<GetPageResponse>> + Send + 'static> {
+        let reqs = reqs.map(proto::GetPageRequest::from);
+        let resps = self.inner.get_pages(reqs).await?.into_inner();
+        Ok(resps.map_ok(GetPageResponse::from))
+    }
+
+    /// Returns the size of a relation, as # of blocks.
+    pub async fn get_rel_size(
+        &mut self,
+        req: GetRelSizeRequest,
+    ) -> tonic::Result<GetRelSizeResponse> {
+        let req = proto::GetRelSizeRequest::from(req);
+        let resp = self.inner.get_rel_size(req).await?.into_inner();
+        Ok(resp.into())
+    }
+
+    /// Fetches an SLRU segment.
+    pub async fn get_slru_segment(
+        &mut self,
+        req: GetSlruSegmentRequest,
+    ) -> tonic::Result<GetSlruSegmentResponse> {
+        let req = proto::GetSlruSegmentRequest::from(req);
+        let resp = self.inner.get_slru_segment(req).await?.into_inner();
+        Ok(resp.try_into()?)
+    }
+
+    /// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't
+    /// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards.
+    ///
+    /// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be
+    /// acquired because the LSN has already been garbage collected.
+    pub async fn lease_lsn(&mut self, req: LeaseLsnRequest) -> tonic::Result<LeaseLsnResponse> {
+        let req = proto::LeaseLsnRequest::from(req);
+        let resp = self.inner.lease_lsn(req).await?.into_inner();
+        Ok(resp.try_into()?)
+    }
+}
+
+/// Adds authentication metadata to gRPC requests.
 #[derive(Clone)]
 struct AuthInterceptor {
    tenant_id: AsciiMetadataValue,
@@ -33,168 +160,29 @@ impl AuthInterceptor {
    fn new(
        tenant_id: TenantId,
        timeline_id: TimelineId,
-        auth_token: Option<String>,
        shard_id: ShardIndex,
-    ) -> Result<Self, InvalidMetadataValue> {
-        let tenant_ascii: AsciiMetadataValue = tenant_id.to_string().try_into()?;
-        let timeline_ascii: AsciiMetadataValue = timeline_id.to_string().try_into()?;
-        let shard_ascii: AsciiMetadataValue = shard_id.to_string().try_into()?;
-
-        let auth_header: Option<AsciiMetadataValue> = match auth_token {
-            Some(token) => Some(format!("Bearer {token}").try_into()?),
-            None => None,
-        };
-
+        auth_token: Option<String>,
+    ) -> anyhow::Result<Self> {
        Ok(Self {
-            tenant_id: tenant_ascii,
-            shard_id: shard_ascii,
-            timeline_id: timeline_ascii,
-            auth_header,
+            tenant_id: tenant_id.to_string().try_into()?,
+            timeline_id: timeline_id.to_string().try_into()?,
+            shard_id: shard_id.to_string().try_into()?,
+            auth_header: auth_token
+                .map(|token| format!("Bearer {token}").try_into())
+                .transpose()?,
        })
    }
 }

-impl tonic::service::Interceptor for AuthInterceptor {
-    fn call(&mut self, mut req: tonic::Request<()>) -> Result<tonic::Request<()>, tonic::Status> {
-        req.metadata_mut()
-            .insert("neon-tenant-id", self.tenant_id.clone());
-        req.metadata_mut()
-            .insert("neon-shard-id", self.shard_id.clone());
-        req.metadata_mut()
-            .insert("neon-timeline-id", self.timeline_id.clone());
-        if let Some(auth_header) = &self.auth_header {
-            req.metadata_mut()
-                .insert("authorization", auth_header.clone());
+impl Interceptor for AuthInterceptor {
+    fn call(&mut self, mut req: tonic::Request<()>) -> tonic::Result<tonic::Request<()>> {
+        let metadata = req.metadata_mut();
+        metadata.insert("neon-tenant-id", self.tenant_id.clone());
+        metadata.insert("neon-timeline-id", self.timeline_id.clone());
+        metadata.insert("neon-shard-id", self.shard_id.clone());
+        if let Some(ref auth_header) = self.auth_header {
+            metadata.insert("authorization", auth_header.clone());
        }
        Ok(req)
    }
 }
-#[derive(Clone)]
-pub struct Client {
-    client: proto::PageServiceClient<
-        tonic::service::interceptor::InterceptedService<Channel, AuthInterceptor>,
-    >,
-}
-
-impl Client {
-    pub async fn new<T: TryInto<tonic::transport::Endpoint> + Send + Sync + 'static>(
-        into_endpoint: T,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        shard_id: ShardIndex,
-        auth_header: Option<String>,
-        compression: Option<tonic::codec::CompressionEncoding>,
-    ) -> anyhow::Result<Self> {
-        let endpoint: tonic::transport::Endpoint = into_endpoint
-            .try_into()
-            .map_err(|_e| anyhow::anyhow!("failed to convert endpoint"))?;
-        let channel = endpoint.connect().await?;
-        let auth = AuthInterceptor::new(tenant_id, timeline_id, auth_header, shard_id)
-            .map_err(|e| anyhow::anyhow!(e.to_string()))?;
-        let mut client = proto::PageServiceClient::with_interceptor(channel, auth);
-
-        if let Some(compression) = compression {
-            // TODO: benchmark this (including network latency).
-            // TODO: consider enabling compression by default.
-            client = client
-                .accept_compressed(compression)
-                .send_compressed(compression);
-        }
-
-        Ok(Self { client })
-    }
-
-    /// Returns whether a relation exists.
-    pub async fn check_rel_exists(
-        &mut self,
-        req: model::CheckRelExistsRequest,
-    ) -> Result<model::CheckRelExistsResponse, tonic::Status> {
-        let proto_req = proto::CheckRelExistsRequest::from(req);
-
-        let response = self.client.check_rel_exists(proto_req).await?;
-
-        let proto_resp = response.into_inner();
-        Ok(proto_resp.into())
-    }
-
-    /// Fetches a base backup.
-    pub async fn get_base_backup(
-        &mut self,
-        req: model::GetBaseBackupRequest,
-    ) -> Result<impl Stream<Item = Result<Bytes, tonic::Status>> + 'static, tonic::Status> {
-        let proto_req = proto::GetBaseBackupRequest::from(req);
-
-        let response_stream: Streaming<proto::GetBaseBackupResponseChunk> =
-            self.client.get_base_backup(proto_req).await?.into_inner();
-
-        // TODO: Consider dechunking internally
-        let domain_stream = response_stream.map(|chunk_res| {
-            chunk_res.and_then(|proto_chunk| {
-                proto_chunk.try_into().map_err(|e| {
-                    tonic::Status::internal(format!("Failed to convert response chunk: {e}"))
-                })
-            })
-        });
-
-        Ok(domain_stream)
-    }
-
-    /// Returns the total size of a database, as # of bytes.
-    pub async fn get_db_size(
-        &mut self,
-        req: model::GetDbSizeRequest,
-    ) -> Result<u64, tonic::Status> {
-        let proto_req = proto::GetDbSizeRequest::from(req);
-
-        let response = self.client.get_db_size(proto_req).await?;
-        Ok(response.into_inner().into())
-    }
-
-    /// Fetches pages.
-    ///
-    /// This is implemented as a bidirectional streaming RPC for performance.
-    /// Per-request errors are often returned as status_code instead of errors,
-    /// to avoid tearing down the entire stream via tonic::Status.
-    pub async fn get_pages<ReqSt>(
-        &mut self,
-        inbound: ReqSt,
-    ) -> Result<
-        impl Stream<Item = Result<model::GetPageResponse, tonic::Status>> + Send + 'static,
-        tonic::Status,
-    >
-    where
-        ReqSt: Stream<Item = model::GetPageRequest> + Send + 'static,
-    {
-        let outbound_proto = inbound.map(|domain_req| domain_req.into());
-
-        let req_new = Request::new(outbound_proto);
-
-        let response_stream: Streaming<proto::GetPageResponse> =
-            self.client.get_pages(req_new).await?.into_inner();
-
-        let domain_stream = response_stream.map_ok(model::GetPageResponse::from);
-
-        Ok(domain_stream)
-    }
-
-    /// Returns the size of a relation, as # of blocks.
-    pub async fn get_rel_size(
-        &mut self,
-        req: model::GetRelSizeRequest,
-    ) -> Result<model::GetRelSizeResponse, tonic::Status> {
-        let proto_req = proto::GetRelSizeRequest::from(req);
-        let response = self.client.get_rel_size(proto_req).await?;
-        let proto_resp = response.into_inner();
-        Ok(proto_resp.into())
-    }
-
-    /// Fetches an SLRU segment.
-    pub async fn get_slru_segment(
-        &mut self,
-        req: model::GetSlruSegmentRequest,
-    ) -> Result<model::GetSlruSegmentResponse, tonic::Status> {
-        let proto_req = proto::GetSlruSegmentRequest::from(req);
-        let response = self.client.get_slru_segment(proto_req).await?;
-        Ok(response.into_inner().try_into()?)
-    }
-}
--- a/pageserver/page_api/src/model.rs
+++ b/pageserver/page_api/src/model.rs
@@ -16,10 +16,11 @@
 //! stream combinators without dealing with errors, and avoids validating the same message twice.

 use std::fmt::Display;
+use std::time::{Duration, SystemTime, UNIX_EPOCH};

 use bytes::Bytes;
-use postgres_ffi::Oid;
-// TODO: split out Lsn, RelTag, SlruKind, Oid and other basic types to a separate crate, to avoid
+use postgres_ffi_types::Oid;
+// TODO: split out Lsn, RelTag, SlruKind and other basic types to a separate crate, to avoid
 // pulling in all of their other crate dependencies when building the client.
 use utils::lsn::Lsn;

@@ -191,15 +192,21 @@ pub struct GetBaseBackupRequest {
    pub replica: bool,
    /// If true, include relation files in the base backup. Mainly for debugging and tests.
    pub full: bool,
+    /// Compression algorithm to use. Base backups send a compressed payload instead of using gRPC
+    /// compression, so that we can cache compressed backups on the server.
+    pub compression: BaseBackupCompression,
 }

-impl From<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
-    fn from(pb: proto::GetBaseBackupRequest) -> Self {
-        Self {
+impl TryFrom<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
+    type Error = ProtocolError;
+
+    fn try_from(pb: proto::GetBaseBackupRequest) -> Result<Self, Self::Error> {
+        Ok(Self {
            lsn: (pb.lsn != 0).then_some(Lsn(pb.lsn)),
            replica: pb.replica,
            full: pb.full,
-        }
+            compression: pb.compression.try_into()?,
+        })
    }
 }

@@ -209,10 +216,55 @@ impl From<GetBaseBackupRequest> for proto::GetBaseBackupRequest {
            lsn: request.lsn.unwrap_or_default().0,
            replica: request.replica,
            full: request.full,
+            compression: request.compression.into(),
        }
    }
 }

+/// Base backup compression algorithm.
+#[derive(Clone, Copy, Debug)]
+pub enum BaseBackupCompression {
+    None,
+    Gzip,
+}
+
+impl TryFrom<proto::BaseBackupCompression> for BaseBackupCompression {
+    type Error = ProtocolError;
+
+    fn try_from(pb: proto::BaseBackupCompression) -> Result<Self, Self::Error> {
+        match pb {
+            proto::BaseBackupCompression::Unknown => Err(ProtocolError::invalid("compression", pb)),
+            proto::BaseBackupCompression::None => Ok(Self::None),
+            proto::BaseBackupCompression::Gzip => Ok(Self::Gzip),
+        }
+    }
+}
+
+impl TryFrom<i32> for BaseBackupCompression {
+    type Error = ProtocolError;
+
+    fn try_from(compression: i32) -> Result<Self, Self::Error> {
+        proto::BaseBackupCompression::try_from(compression)
+            .map_err(|_| ProtocolError::invalid("compression", compression))
+            .and_then(Self::try_from)
+    }
+}
+
+impl From<BaseBackupCompression> for proto::BaseBackupCompression {
+    fn from(compression: BaseBackupCompression) -> Self {
+        match compression {
+            BaseBackupCompression::None => Self::None,
+            BaseBackupCompression::Gzip => Self::Gzip,
+        }
+    }
+}
+
+impl From<BaseBackupCompression> for i32 {
+    fn from(compression: BaseBackupCompression) -> Self {
+        proto::BaseBackupCompression::from(compression).into()
+    }
+}
+
 pub type GetBaseBackupResponseChunk = Bytes;

 impl TryFrom<proto::GetBaseBackupResponseChunk> for GetBaseBackupResponseChunk {
@@ -652,3 +704,54 @@ impl From<GetSlruSegmentResponse> for proto::GetSlruSegmentResponse {

 // SlruKind is defined in pageserver_api::reltag.
 pub type SlruKind = pageserver_api::reltag::SlruKind;
+
+/// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage
+/// collect the LSN until the lease expires.
+pub struct LeaseLsnRequest {
+    /// The LSN to lease.
+    pub lsn: Lsn,
+}
+
+impl TryFrom<proto::LeaseLsnRequest> for LeaseLsnRequest {
+    type Error = ProtocolError;
+
+    fn try_from(pb: proto::LeaseLsnRequest) -> Result<Self, Self::Error> {
+        if pb.lsn == 0 {
+            return Err(ProtocolError::Missing("lsn"));
+        }
+        Ok(Self { lsn: Lsn(pb.lsn) })
+    }
+}
+
+impl From<LeaseLsnRequest> for proto::LeaseLsnRequest {
+    fn from(request: LeaseLsnRequest) -> Self {
+        Self { lsn: request.lsn.0 }
+    }
+}
+
+/// Lease expiration time. If the lease could not be granted because the LSN has already been
+/// garbage collected, a FailedPrecondition status will be returned instead.
+pub type LeaseLsnResponse = SystemTime;
+
+impl TryFrom<proto::LeaseLsnResponse> for LeaseLsnResponse {
+    type Error = ProtocolError;
+
+    fn try_from(pb: proto::LeaseLsnResponse) -> Result<Self, Self::Error> {
+        let expires = pb.expires.ok_or(ProtocolError::Missing("expires"))?;
+        UNIX_EPOCH
+            .checked_add(Duration::new(expires.seconds as u64, expires.nanos as u32))
+            .ok_or_else(|| ProtocolError::invalid("expires", expires))
+    }
+}
+
+impl From<LeaseLsnResponse> for proto::LeaseLsnResponse {
+    fn from(response: LeaseLsnResponse) -> Self {
+        let expires = response.duration_since(UNIX_EPOCH).unwrap_or_default();
+        Self {
+            expires: Some(prost_types::Timestamp {
+                seconds: expires.as_secs() as i64,
+                nanos: expires.subsec_nanos() as i32,
+            }),
+        }
+    }
+}
--- a/pageserver/pagebench/Cargo.toml
+++ b/pageserver/pagebench/Cargo.toml
@@ -24,10 +24,14 @@ tracing.workspace = true
 tokio.workspace = true
 tokio-stream.workspace = true
 tokio-util.workspace = true
+axum.workspace = true
+http.workspace = true
+metrics.workspace = true
 tonic.workspace = true
 url.workspace = true

 pageserver_client.workspace = true
+pageserver_client_grpc.workspace = true
 pageserver_api.workspace = true
 pageserver_page_api.workspace = true
 utils = { path = "../../libs/utils/" }
--- a/pageserver/pagebench/src/cmd/basebackup.rs
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -317,6 +317,7 @@ impl Client for LibpqClient {
 /// A gRPC Pageserver client.
 struct GrpcClient {
    inner: page_api::Client,
+    compression: page_api::BaseBackupCompression,
 }

 impl GrpcClient {
@@ -325,16 +326,20 @@ impl GrpcClient {
        ttid: TenantTimelineId,
        compression: bool,
    ) -> anyhow::Result<Self> {
-        let inner = page_api::Client::new(
+        let inner = page_api::Client::connect(
            connstring.to_string(),
            ttid.tenant_id,
            ttid.timeline_id,
            ShardIndex::unsharded(),
            None,
-            compression.then_some(tonic::codec::CompressionEncoding::Zstd),
+            None, // NB: uses payload compression
        )
        .await?;
-        Ok(Self { inner })
+        let compression = match compression {
+            true => page_api::BaseBackupCompression::Gzip,
+            false => page_api::BaseBackupCompression::None,
+        };
+        Ok(Self { inner, compression })
    }
 }

@@ -348,10 +353,8 @@ impl Client for GrpcClient {
            lsn,
            replica: false,
            full: false,
+            compression: self.compression,
        };
-        let stream = self.inner.get_base_backup(req).await?;
-        Ok(Box::pin(StreamReader::new(
-            stream.map_err(std::io::Error::other),
-        )))
+        Ok(Box::pin(self.inner.get_base_backup(req).await?))
    }
 }
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -26,12 +26,27 @@ use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;
 use utils::shard::ShardIndex;

+use axum::Router;
+use axum::body::Body;
+use axum::extract::State;
+use axum::response::Response;
+
+use http::StatusCode;
+use http::header::CONTENT_TYPE;
+
+use metrics::proto::MetricFamily;
+use metrics::{Encoder, TextEncoder};
+
 use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
 use crate::util::{request_stats, tokio_thread_local_stats};

 /// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
 #[derive(clap::Parser)]
 pub(crate) struct Args {
+    #[clap(long, default_value = "false")]
+    grpc: bool,
+    #[clap(long, default_value = "false")]
+    grpc_stream: bool,
    #[clap(long, default_value = "http://localhost:9898")]
    mgmt_api_endpoint: String,
    /// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
@@ -72,6 +87,9 @@ pub(crate) struct Args {
    #[clap(long)]
    set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,

+    #[clap(long)]
+    only_relnode: Option<u32>,
+
    /// Queue depth generated in each client.
    #[clap(long, default_value = "1")]
    queue_depth: NonZeroUsize,
@@ -86,10 +104,31 @@ pub(crate) struct Args {
    #[clap(long, default_value = "1")]
    batch_size: NonZeroUsize,

-    #[clap(long)]
-    only_relnode: Option<u32>,
-
    targets: Option<Vec<TenantTimelineId>>,
+
+    #[clap(long, default_value = "100")]
+    pool_max_consumers: NonZeroUsize,
+
+    #[clap(long, default_value = "5")]
+    pool_error_threshold: NonZeroUsize,
+
+    #[clap(long, default_value = "5000")]
+    pool_connect_timeout: NonZeroUsize,
+
+    #[clap(long, default_value = "1000")]
+    pool_connect_backoff: NonZeroUsize,
+
+    #[clap(long, default_value = "60000")]
+    pool_max_idle_duration: NonZeroUsize,
+
+    #[clap(long, default_value = "0")]
+    max_delay_ms: usize,
+
+    #[clap(long, default_value = "0")]
+    percent_drops: usize,
+
+    #[clap(long, default_value = "0")]
+    percent_hangs: usize,
 }

 /// State shared by all clients
@@ -146,6 +185,37 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
        main_impl(args, thread_local_stats)
    })
 }
+async fn get_metrics(
+    State(state): State<Arc<pageserver_client_grpc::PageserverClientAggregateMetrics>>,
+) -> Response {
+    let metrics = state.collect();
+
+    info!("metrics: {metrics:?}");
+    // When we call TextEncoder::encode() below, it will immediately return an
+    // error if a metric family has no metrics, so we need to preemptively
+    // filter out metric families with no metrics.
+    let metrics = metrics
+        .into_iter()
+        .filter(|m| !m.get_metric().is_empty())
+        .collect::<Vec<MetricFamily>>();
+
+    let encoder = TextEncoder::new();
+    let mut buffer = vec![];
+
+    if let Err(e) = encoder.encode(&metrics, &mut buffer) {
+        Response::builder()
+            .status(StatusCode::INTERNAL_SERVER_ERROR)
+            .header(CONTENT_TYPE, "application/text")
+            .body(Body::from(e.to_string()))
+            .unwrap()
+    } else {
+        Response::builder()
+            .status(StatusCode::OK)
+            .header(CONTENT_TYPE, encoder.format_type())
+            .body(Body::from(buffer))
+            .unwrap()
+    }
+}

 async fn main_impl(
    args: Args,
@@ -153,6 +223,24 @@ async fn main_impl(
 ) -> anyhow::Result<()> {
    let args: &'static Args = Box::leak(Box::new(args));

+    // Vector of pageserver clients
+    let client_metrics = Arc::new(pageserver_client_grpc::PageserverClientAggregateMetrics::new());
+
+    use axum::routing::get;
+    let app = Router::new()
+        .route("/metrics", get(get_metrics))
+        .with_state(client_metrics.clone());
+
+    // TODO: make configurable. Or listen on unix domain socket?
+    let listener = tokio::net::TcpListener::bind("127.0.0.1:9090")
+        .await
+        .unwrap();
+
+    tokio::spawn(async {
+        tracing::info!("metrics listener spawned");
+        axum::serve(listener, app).await.unwrap()
+    });
+
    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
        reqwest::Client::new(), // TODO: support ssl_ca_file for https APIs in pagebench.
        args.mgmt_api_endpoint.clone(),
@@ -311,6 +399,7 @@ async fn main_impl(
    let rps_period = args
        .per_client_rate
        .map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
+
    let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
        let ss = shared_state.clone();
        let cancel = cancel.clone();
@@ -625,7 +714,7 @@ impl GrpcClient {
        ttid: TenantTimelineId,
        compression: bool,
    ) -> anyhow::Result<Self> {
-        let mut client = page_api::Client::new(
+        let mut client = page_api::Client::connect(
            connstring.to_string(),
            ttid.tenant_id,
            ttid.timeline_id,
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -14,6 +14,7 @@ use std::fmt::Write as FmtWrite;
 use std::time::{Instant, SystemTime};

 use anyhow::{Context, anyhow};
+use async_compression::tokio::write::GzipEncoder;
 use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
 use pageserver_api::key::{Key, rel_block_to_key};
@@ -25,8 +26,7 @@ use postgres_ffi::{
 };
 use postgres_ffi_types::constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
 use postgres_ffi_types::forknum::{INIT_FORKNUM, MAIN_FORKNUM};
-use tokio::io;
-use tokio::io::AsyncWrite;
+use tokio::io::{self, AsyncWrite, AsyncWriteExt as _};
 use tokio_tar::{Builder, EntryType, Header};
 use tracing::*;
 use utils::lsn::Lsn;
@@ -97,6 +97,7 @@ impl From<BasebackupError> for tonic::Status {
 ///  * When working without safekeepers. In this situation it is important to match the lsn
 ///    we are taking basebackup on with the lsn that is used in pageserver's walreceiver
 ///    to start the replication.
+#[allow(clippy::too_many_arguments)]
 pub async fn send_basebackup_tarball<'a, W>(
    write: &'a mut W,
    timeline: &'a Timeline,
@@ -104,6 +105,7 @@ pub async fn send_basebackup_tarball<'a, W>(
    prev_lsn: Option<Lsn>,
    full_backup: bool,
    replica: bool,
+    gzip_level: Option<async_compression::Level>,
    ctx: &'a RequestContext,
 ) -> Result<(), BasebackupError>
 where
@@ -122,7 +124,7 @@ where
    // prev_lsn value; that happens if the timeline was just branched from
    // an old LSN and it doesn't have any WAL of its own yet. We will set
    // prev_lsn to Lsn(0) if we cannot provide the correct value.
-    let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
+    let (backup_prev, lsn) = if let Some(req_lsn) = req_lsn {
        // Backup was requested at a particular LSN. The caller should've
        // already checked that it's a valid LSN.

@@ -143,7 +145,7 @@ where
    };

    // Consolidate the derived and the provided prev_lsn values
-    let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
+    let prev_record_lsn = if let Some(provided_prev_lsn) = prev_lsn {
        if backup_prev != Lsn(0) && backup_prev != provided_prev_lsn {
            return Err(BasebackupError::Server(anyhow!(
                "backup_prev {backup_prev} != provided_prev_lsn {provided_prev_lsn}"
@@ -155,30 +157,55 @@ where
    };

    info!(
-        "taking basebackup lsn={}, prev_lsn={} (full_backup={}, replica={})",
-        backup_lsn, prev_lsn, full_backup, replica
+        "taking basebackup lsn={lsn}, prev_lsn={prev_record_lsn} \
+        (full_backup={full_backup}, replica={replica}, gzip={gzip_level:?})",
+    );
+    let span = info_span!("send_tarball", backup_lsn=%lsn);
+
+    let io_concurrency = IoConcurrency::spawn_from_conf(
+        timeline.conf.get_vectored_concurrent_io,
+        timeline
+            .gate
+            .enter()
+            .map_err(|_| BasebackupError::Shutdown)?,
    );

-    let basebackup = Basebackup {
-        ar: Builder::new_non_terminated(write),
-        timeline,
-        lsn: backup_lsn,
-        prev_record_lsn: prev_lsn,
-        full_backup,
-        replica,
-        ctx,
-        io_concurrency: IoConcurrency::spawn_from_conf(
-            timeline.conf.get_vectored_concurrent_io,
-            timeline
-                .gate
-                .enter()
-                .map_err(|_| BasebackupError::Shutdown)?,
-        ),
-    };
-    basebackup
+    if let Some(gzip_level) = gzip_level {
+        let mut encoder = GzipEncoder::with_quality(write, gzip_level);
+        Basebackup {
+            ar: Builder::new_non_terminated(&mut encoder),
+            timeline,
+            lsn,
+            prev_record_lsn,
+            full_backup,
+            replica,
+            ctx,
+            io_concurrency,
+        }
        .send_tarball()
-        .instrument(info_span!("send_tarball", backup_lsn=%backup_lsn))
-        .await
+        .instrument(span)
+        .await?;
+        encoder
+            .shutdown()
+            .await
+            .map_err(|err| BasebackupError::Client(err, "gzip"))?;
+    } else {
+        Basebackup {
+            ar: Builder::new_non_terminated(write),
+            timeline,
+            lsn,
+            prev_record_lsn,
+            full_backup,
+            replica,
+            ctx,
+            io_concurrency,
+        }
+        .send_tarball()
+        .instrument(span)
+        .await?;
+    }
+
+    Ok(())
 }

 /// This is short-living object only for the time of tarball creation,
--- a/pageserver/src/basebackup_cache.rs
+++ b/pageserver/src/basebackup_cache.rs
@@ -1,13 +1,12 @@
 use std::{collections::HashMap, sync::Arc};

 use anyhow::Context;
-use async_compression::tokio::write::GzipEncoder;
 use camino::{Utf8Path, Utf8PathBuf};
 use metrics::core::{AtomicU64, GenericCounter};
 use pageserver_api::{config::BasebackupCacheConfig, models::TenantState};
 use tokio::{
    io::{AsyncWriteExt, BufWriter},
-    sync::mpsc::{UnboundedReceiver, UnboundedSender},
+    sync::mpsc::{Receiver, Sender, error::TrySendError},
 };
 use tokio_util::sync::CancellationToken;
 use utils::{
@@ -20,8 +19,8 @@ use crate::{
    basebackup::send_basebackup_tarball,
    context::{DownloadBehavior, RequestContext},
    metrics::{
-        BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ,
-        BASEBACKUP_CACHE_SIZE,
+        BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE,
+        BASEBACKUP_CACHE_READ, BASEBACKUP_CACHE_SIZE,
    },
    task_mgr::TaskKind,
    tenant::{
@@ -36,8 +35,8 @@ pub struct BasebackupPrepareRequest {
    pub lsn: Lsn,
 }

-pub type BasebackupPrepareSender = UnboundedSender<BasebackupPrepareRequest>;
-pub type BasebackupPrepareReceiver = UnboundedReceiver<BasebackupPrepareRequest>;
+pub type BasebackupPrepareSender = Sender<BasebackupPrepareRequest>;
+pub type BasebackupPrepareReceiver = Receiver<BasebackupPrepareRequest>;

 #[derive(Clone)]
 struct CacheEntry {
@@ -61,40 +60,65 @@ struct CacheEntry {
 /// and ~1 RPS for get requests.
 pub struct BasebackupCache {
    data_dir: Utf8PathBuf,
+    config: Option<BasebackupCacheConfig>,

    entries: std::sync::Mutex<HashMap<TenantTimelineId, CacheEntry>>,

+    prepare_sender: BasebackupPrepareSender,
+
    read_hit_count: GenericCounter<AtomicU64>,
    read_miss_count: GenericCounter<AtomicU64>,
    read_err_count: GenericCounter<AtomicU64>,
+
+    prepare_skip_count: GenericCounter<AtomicU64>,
 }

 impl BasebackupCache {
-    /// Creates a BasebackupCache and spawns the background task.
-    /// The initialization of the cache is performed in the background and does not
-    /// block the caller. The cache will return `None` for any get requests until
-    /// initialization is complete.
-    pub fn spawn(
-        runtime_handle: &tokio::runtime::Handle,
+    /// Create a new BasebackupCache instance.
+    /// Also returns a BasebackupPrepareReceiver which is needed to start
+    /// the background task.
+    /// The cache is initialized from the data_dir in the background task.
+    /// The cache will return `None` for any get requests until the initialization is complete.
+    /// The background task is spawned separately using [`Self::spawn_background_task`]
+    /// to avoid a circular dependency between the cache and the tenant manager.
+    pub fn new(
        data_dir: Utf8PathBuf,
        config: Option<BasebackupCacheConfig>,
-        prepare_receiver: BasebackupPrepareReceiver,
-        tenant_manager: Arc<TenantManager>,
-        cancel: CancellationToken,
-    ) -> Arc<Self> {
+    ) -> (Arc<Self>, BasebackupPrepareReceiver) {
+        let chan_size = config.as_ref().map(|c| c.max_size_entries).unwrap_or(1);
+
+        let (prepare_sender, prepare_receiver) = tokio::sync::mpsc::channel(chan_size);
+
        let cache = Arc::new(BasebackupCache {
            data_dir,
-
+            config,
            entries: std::sync::Mutex::new(HashMap::new()),
+            prepare_sender,

            read_hit_count: BASEBACKUP_CACHE_READ.with_label_values(&["hit"]),
            read_miss_count: BASEBACKUP_CACHE_READ.with_label_values(&["miss"]),
            read_err_count: BASEBACKUP_CACHE_READ.with_label_values(&["error"]),
+
+            prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]),
        });

-        if let Some(config) = config {
+        (cache, prepare_receiver)
+    }
+
+    /// Spawns the background task.
+    /// The background task initializes the cache from the disk,
+    /// processes prepare requests, and cleans up outdated cache entries.
+    /// Noop if the cache is disabled (config is None).
+    pub fn spawn_background_task(
+        self: Arc<Self>,
+        runtime_handle: &tokio::runtime::Handle,
+        prepare_receiver: BasebackupPrepareReceiver,
+        tenant_manager: Arc<TenantManager>,
+        cancel: CancellationToken,
+    ) {
+        if let Some(config) = self.config.clone() {
            let background = BackgroundTask {
-                c: cache.clone(),
+                c: self,

                config,
                tenant_manager,
@@ -109,8 +133,45 @@ impl BasebackupCache {
            };
            runtime_handle.spawn(background.run(prepare_receiver));
        }
+    }

-        cache
+    /// Send a basebackup prepare request to the background task.
+    /// The basebackup will be prepared asynchronously, it does not block the caller.
+    /// The request will be skipped if any cache limits are exceeded.
+    pub fn send_prepare(&self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, lsn: Lsn) {
+        let req = BasebackupPrepareRequest {
+            tenant_shard_id,
+            timeline_id,
+            lsn,
+        };
+
+        BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.inc();
+        let res = self.prepare_sender.try_send(req);
+
+        if let Err(e) = res {
+            BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.dec();
+            self.prepare_skip_count.inc();
+            match e {
+                TrySendError::Full(_) => {
+                    // Basebackup prepares are pretty rare, normally we should not hit this.
+                    tracing::info!(
+                        tenant_id = %tenant_shard_id.tenant_id,
+                        %timeline_id,
+                        %lsn,
+                        "Basebackup prepare channel is full, skipping the request"
+                    );
+                }
+                TrySendError::Closed(_) => {
+                    // Normal during shutdown, not critical.
+                    tracing::info!(
+                        tenant_id = %tenant_shard_id.tenant_id,
+                        %timeline_id,
+                        %lsn,
+                        "Basebackup prepare channel is closed, skipping the request"
+                    );
+                }
+            }
+        }
    }

    /// Gets a basebackup entry from the cache.
@@ -123,6 +184,10 @@ impl BasebackupCache {
        timeline_id: TimelineId,
        lsn: Lsn,
    ) -> Option<tokio::fs::File> {
+        if !self.is_enabled() {
+            return None;
+        }
+
        // Fast path. Check if the entry exists using the in-memory state.
        let tti = TenantTimelineId::new(tenant_id, timeline_id);
        if self.entries.lock().unwrap().get(&tti).map(|e| e.lsn) != Some(lsn) {
@@ -150,6 +215,10 @@ impl BasebackupCache {
        }
    }

+    pub fn is_enabled(&self) -> bool {
+        self.config.is_some()
+    }
+
    // Private methods.

    fn entry_filename(tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> String {
@@ -367,6 +436,7 @@ impl BackgroundTask {
        loop {
            tokio::select! {
                Some(req) = prepare_receiver.recv() => {
+                    BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE.dec();
                    if let Err(err) = self.prepare_basebackup(
                        req.tenant_shard_id,
                        req.timeline_id,
@@ -594,13 +664,6 @@ impl BackgroundTask {
        let file = tokio::fs::File::create(entry_tmp_path).await?;
        let mut writer = BufWriter::new(file);

-        let mut encoder = GzipEncoder::with_quality(
-            &mut writer,
-            // Level::Best because compression is not on the hot path of basebackup requests.
-            // The decompression is almost not affected by the compression level.
-            async_compression::Level::Best,
-        );
-
        // We may receive a request before the WAL record is applied to the timeline.
        // Wait for the requested LSN to be applied.
        timeline
@@ -613,17 +676,19 @@ impl BackgroundTask {
            .await?;

        send_basebackup_tarball(
-            &mut encoder,
+            &mut writer,
            timeline,
            Some(req_lsn),
            None,
            false,
            false,
+            // Level::Best because compression is not on the hot path of basebackup requests.
+            // The decompression is almost not affected by the compression level.
+            Some(async_compression::Level::Best),
            &ctx,
        )
        .await?;

-        encoder.shutdown().await?;
        writer.flush().await?;
        writer.into_inner().sync_all().await?;

--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -569,8 +569,10 @@ fn start_pageserver(
        pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone());

    // Scan the local 'tenants/' directory and start loading the tenants
-    let (basebackup_prepare_sender, basebackup_prepare_receiver) =
-        tokio::sync::mpsc::unbounded_channel();
+    let (basebackup_cache, basebackup_prepare_receiver) = BasebackupCache::new(
+        conf.basebackup_cache_dir(),
+        conf.basebackup_cache_config.clone(),
+    );
    let deletion_queue_client = deletion_queue.new_client();
    let background_purges = mgr::BackgroundPurges::default();

@@ -582,7 +584,7 @@ fn start_pageserver(
            remote_storage: remote_storage.clone(),
            deletion_queue_client,
            l0_flush_global_state,
-            basebackup_prepare_sender,
+            basebackup_cache: Arc::clone(&basebackup_cache),
            feature_resolver: feature_resolver.clone(),
        },
        shutdown_pageserver.clone(),
@@ -590,10 +592,8 @@ fn start_pageserver(
    let tenant_manager = Arc::new(tenant_manager);
    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(tenant_manager.clone(), order))?;

-    let basebackup_cache = BasebackupCache::spawn(
+    basebackup_cache.spawn_background_task(
        BACKGROUND_RUNTIME.handle(),
-        conf.basebackup_cache_dir(),
-        conf.basebackup_cache_config.clone(),
        basebackup_prepare_receiver,
        Arc::clone(&tenant_manager),
        shutdown_pageserver.child_token(),
@@ -806,7 +806,6 @@ fn start_pageserver(
        } else {
            None
        },
-        basebackup_cache,
    );

    // Spawn a Pageserver gRPC server task. It will spawn separate tasks for
--- a/pageserver/src/bin/test_helper_slow_client_reads.rs
+++ b/pageserver/src/bin/test_helper_slow_client_reads.rs
@@ -37,7 +37,7 @@ async fn main() -> anyhow::Result<()> {
                not_modified_since: Lsn(23),
            },
            batch_key: 42,
-            message: format!("message {}", msg),
+            message: format!("message {msg}"),
        }));
        let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else {
            eprintln!("pipe seems full");
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -762,4 +762,40 @@ mod tests {
        let result = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir);
        assert_eq!(result.is_ok(), is_valid);
    }
+
+    #[test]
+    fn test_config_posthog_config_is_valid() {
+        let input = r#"
+            control_plane_api = "http://localhost:6666"
+
+            [posthog_config]
+            server_api_key = "phs_AAA"
+            client_api_key = "phc_BBB"
+            project_id = "000"
+            private_api_url = "https://us.posthog.com"
+            public_api_url = "https://us.i.posthog.com"
+        "#;
+        let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
+            .expect("posthogconfig is valid");
+        let workdir = Utf8PathBuf::from("/nonexistent");
+        PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
+            .expect("parse_and_validate");
+    }
+
+    #[test]
+    fn test_config_posthog_incomplete_config_is_valid() {
+        let input = r#"
+            control_plane_api = "http://localhost:6666"
+
+            [posthog_config]
+            server_api_key = "phs_AAA"
+            private_api_url = "https://us.posthog.com"
+            public_api_url = "https://us.i.posthog.com"
+        "#;
+        let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
+            .expect("posthogconfig is valid");
+        let workdir = Utf8PathBuf::from("/nonexistent");
+        PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
+            .expect("parse_and_validate");
+    }
 }
--- a/pageserver/src/feature_resolver.rs
+++ b/pageserver/src/feature_resolver.rs
@@ -3,7 +3,7 @@ use std::{collections::HashMap, sync::Arc, time::Duration};
 use arc_swap::ArcSwap;
 use pageserver_api::config::NodeMetadata;
 use posthog_client_lite::{
-    CaptureEvent, FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError,
+    CaptureEvent, FeatureResolverBackgroundLoop, PostHogEvaluationError,
    PostHogFlagFilterPropertyValue,
 };
 use remote_storage::RemoteStorageKind;
@@ -31,6 +31,13 @@ impl FeatureResolver {
        }
    }

+    pub fn update(&self, spec: String) -> anyhow::Result<()> {
+        if let Some(inner) = &self.inner {
+            inner.update(spec)?;
+        }
+        Ok(())
+    }
+
    pub fn spawn(
        conf: &PageServerConf,
        shutdown_pageserver: CancellationToken,
@@ -38,16 +45,24 @@ impl FeatureResolver {
    ) -> anyhow::Result<Self> {
        // DO NOT block in this function: make it return as fast as possible to avoid startup delays.
        if let Some(posthog_config) = &conf.posthog_config {
-            let inner = FeatureResolverBackgroundLoop::new(
-                PostHogClientConfig {
-                    server_api_key: posthog_config.server_api_key.clone(),
-                    client_api_key: posthog_config.client_api_key.clone(),
-                    project_id: posthog_config.project_id.clone(),
-                    private_api_url: posthog_config.private_api_url.clone(),
-                    public_api_url: posthog_config.public_api_url.clone(),
-                },
-                shutdown_pageserver,
-            );
+            let posthog_client_config = match posthog_config.clone().try_into_posthog_config() {
+                Ok(config) => config,
+                Err(e) => {
+                    tracing::warn!(
+                        "invalid posthog config, skipping posthog integration: {}",
+                        e
+                    );
+                    return Ok(FeatureResolver {
+                        inner: None,
+                        internal_properties: None,
+                        force_overrides_for_testing: Arc::new(ArcSwap::new(Arc::new(
+                            HashMap::new(),
+                        ))),
+                    });
+                }
+            };
+            let inner =
+                FeatureResolverBackgroundLoop::new(posthog_client_config, shutdown_pageserver);
            let inner = Arc::new(inner);

            // The properties shared by all tenants on this pageserver.
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1893,9 +1893,13 @@ async fn update_tenant_config_handler(
    let location_conf = LocationConf::attached_single(
        new_tenant_conf.clone(),
        tenant.get_generation(),
-        &ShardParameters::default(),
+        ShardParameters::from(tenant.get_shard_identity()),
    );

+    tenant
+        .get_shard_identity()
+        .assert_equal(location_conf.shard); // not strictly necessary since we construct it above
+
    crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
        .await
        .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
@@ -1937,9 +1941,13 @@ async fn patch_tenant_config_handler(
    let location_conf = LocationConf::attached_single(
        updated,
        tenant.get_generation(),
-        &ShardParameters::default(),
+        ShardParameters::from(tenant.get_shard_identity()),
    );

+    tenant
+        .get_shard_identity()
+        .assert_equal(location_conf.shard); // not strictly necessary since we construct it above
+
    crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
        .await
        .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
@@ -3743,6 +3751,20 @@ async fn force_override_feature_flag_for_testing_delete(
    json_response(StatusCode::OK, ())
 }

+async fn update_feature_flag_spec(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+    let body = json_request(&mut request).await?;
+    let state = get_state(&request);
+    state
+        .feature_resolver
+        .update(body)
+        .map_err(ApiError::InternalServerError)?;
+    json_response(StatusCode::OK, ())
+}
+
 /// Common functionality of all the HTTP API handlers.
 ///
 /// - Adds a tracing span to each request (by `request_span`)
@@ -4128,5 +4150,8 @@ pub fn make_router(
        .delete("/v1/feature_flag/:flag_key", |r| {
            testing_api_handler("force override feature flag - delete", r, force_override_feature_flag_for_testing_delete)
        })
+        .post("/v1/feature_flag_spec", |r| {
+            api_handler(r, update_feature_flag_spec)
+        })
        .any(handler_404))
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -4439,6 +4439,14 @@ pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy<UIntGauge> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+pub(crate) static BASEBACKUP_CACHE_PREPARE_QUEUE_SIZE: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_basebackup_cache_prepare_queue_size",
+        "Number of requests in the basebackup prepare channel"
+    )
+    .expect("failed to define a metric")
+});
+
 static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_config_ignored_items",
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -12,9 +12,9 @@ use std::task::{Context, Poll};
 use std::time::{Duration, Instant, SystemTime};
 use std::{io, str};

-use anyhow::{Context as _, anyhow, bail};
-use async_compression::tokio::write::GzipEncoder;
+use anyhow::{Context as _, bail};
 use bytes::{Buf as _, BufMut as _, BytesMut};
+use chrono::Utc;
 use futures::future::BoxFuture;
 use futures::{FutureExt, Stream};
 use itertools::Itertools;
@@ -63,7 +63,6 @@ use utils::{failpoint_support, span_record};

 use crate::auth::check_permission;
 use crate::basebackup::{self, BasebackupError};
-use crate::basebackup_cache::BasebackupCache;
 use crate::config::PageServerConf;
 use crate::context::{
    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
@@ -138,7 +137,6 @@ pub fn spawn(
    perf_trace_dispatch: Option<Dispatch>,
    tcp_listener: tokio::net::TcpListener,
    tls_config: Option<Arc<rustls::ServerConfig>>,
-    basebackup_cache: Arc<BasebackupCache>,
 ) -> Listener {
    let cancel = CancellationToken::new();
    let libpq_ctx = RequestContext::todo_child(
@@ -160,7 +158,6 @@ pub fn spawn(
            conf.pg_auth_type,
            tls_config,
            conf.page_service_pipelining.clone(),
-            basebackup_cache,
            libpq_ctx,
            cancel.clone(),
        )
@@ -219,7 +216,6 @@ pub async fn libpq_listener_main(
    auth_type: AuthType,
    tls_config: Option<Arc<rustls::ServerConfig>>,
    pipelining_config: PageServicePipeliningConfig,
-    basebackup_cache: Arc<BasebackupCache>,
    listener_ctx: RequestContext,
    listener_cancel: CancellationToken,
 ) -> Connections {
@@ -263,7 +259,6 @@ pub async fn libpq_listener_main(
                    auth_type,
                    tls_config.clone(),
                    pipelining_config.clone(),
-                    Arc::clone(&basebackup_cache),
                    connection_ctx,
                    connections_cancel.child_token(),
                    gate_guard,
@@ -306,7 +301,6 @@ async fn page_service_conn_main(
    auth_type: AuthType,
    tls_config: Option<Arc<rustls::ServerConfig>>,
    pipelining_config: PageServicePipeliningConfig,
-    basebackup_cache: Arc<BasebackupCache>,
    connection_ctx: RequestContext,
    cancel: CancellationToken,
    gate_guard: GateGuard,
@@ -372,7 +366,6 @@ async fn page_service_conn_main(
        pipelining_config,
        conf.get_vectored_concurrent_io,
        perf_span_fields,
-        basebackup_cache,
        connection_ctx,
        cancel.clone(),
        gate_guard,
@@ -426,8 +419,6 @@ struct PageServerHandler {
    pipelining_config: PageServicePipeliningConfig,
    get_vectored_concurrent_io: GetVectoredConcurrentIo,

-    basebackup_cache: Arc<BasebackupCache>,
-
    gate_guard: GateGuard,
 }

@@ -913,7 +904,6 @@ impl PageServerHandler {
        pipelining_config: PageServicePipeliningConfig,
        get_vectored_concurrent_io: GetVectoredConcurrentIo,
        perf_span_fields: ConnectionPerfSpanFields,
-        basebackup_cache: Arc<BasebackupCache>,
        connection_ctx: RequestContext,
        cancel: CancellationToken,
        gate_guard: GateGuard,
@@ -927,7 +917,6 @@ impl PageServerHandler {
            cancel,
            pipelining_config,
            get_vectored_concurrent_io,
-            basebackup_cache,
            gate_guard,
        }
    }
@@ -2613,26 +2602,16 @@ impl PageServerHandler {
                prev_lsn,
                full_backup,
                replica,
+                None,
                &ctx,
            )
            .await?;
        } else {
            let mut writer = BufWriter::new(pgb.copyout_writer());

-            let cached = {
-                // Basebackup is cached only for this combination of parameters.
-                if timeline.is_basebackup_cache_enabled()
-                    && gzip
-                    && lsn.is_some()
-                    && prev_lsn.is_none()
-                {
-                    self.basebackup_cache
-                        .get(tenant_id, timeline_id, lsn.unwrap())
-                        .await
-                } else {
-                    None
-                }
-            };
+            let cached = timeline
+                .get_cached_basebackup_if_enabled(lsn, prev_lsn, full_backup, replica, gzip)
+                .await;

            if let Some(mut cached) = cached {
                from_cache = true;
@@ -2641,31 +2620,6 @@ impl PageServerHandler {
                    .map_err(|err| {
                        BasebackupError::Client(err, "handle_basebackup_request,cached,copy")
                    })?;
-            } else if gzip {
-                let mut encoder = GzipEncoder::with_quality(
-                    &mut writer,
-                    // NOTE using fast compression because it's on the critical path
-                    //      for compute startup. For an empty database, we get
-                    //      <100KB with this method. The Level::Best compression method
-                    //      gives us <20KB, but maybe we should add basebackup caching
-                    //      on compute shutdown first.
-                    async_compression::Level::Fastest,
-                );
-                basebackup::send_basebackup_tarball(
-                    &mut encoder,
-                    &timeline,
-                    lsn,
-                    prev_lsn,
-                    full_backup,
-                    replica,
-                    &ctx,
-                )
-                .await?;
-                // shutdown the encoder to ensure the gzip footer is written
-                encoder
-                    .shutdown()
-                    .await
-                    .map_err(|e| QueryError::Disconnected(ConnectionError::Io(e)))?;
            } else {
                basebackup::send_basebackup_tarball(
                    &mut writer,
@@ -2674,6 +2628,11 @@ impl PageServerHandler {
                    prev_lsn,
                    full_backup,
                    replica,
+                    // NB: using fast compression because it's on the critical path for compute
+                    // startup. For an empty database, we get <100KB with this method. The
+                    // Level::Best compression method gives us <20KB, but maybe we should add
+                    // basebackup caching on compute shutdown first.
+                    gzip.then_some(async_compression::Level::Fastest),
                    &ctx,
                )
                .await?;
@@ -3553,7 +3512,7 @@ impl proto::PageService for GrpcPageServiceHandler {
        if timeline.is_archived() == Some(true) {
            return Err(tonic::Status::failed_precondition("timeline is archived"));
        }
-        let req: page_api::GetBaseBackupRequest = req.into_inner().into();
+        let req: page_api::GetBaseBackupRequest = req.into_inner().try_into()?;

        span_record!(lsn=?req.lsn);

@@ -3579,20 +3538,50 @@ impl proto::PageService for GrpcPageServiceHandler {
        let span = Span::current();
        let (mut simplex_read, mut simplex_write) = tokio::io::simplex(CHUNK_SIZE);
        let jh = tokio::spawn(async move {
-            let result = basebackup::send_basebackup_tarball(
-                &mut simplex_write,
-                &timeline,
-                req.lsn,
-                None,
-                req.full,
-                req.replica,
-                &ctx,
-            )
-            .instrument(span) // propagate request span
-            .await;
-            simplex_write.shutdown().await.map_err(|err| {
-                BasebackupError::Server(anyhow!("simplex shutdown failed: {err}"))
-            })?;
+            let gzip_level = match req.compression {
+                page_api::BaseBackupCompression::None => None,
+                // NB: using fast compression because it's on the critical path for compute
+                // startup. For an empty database, we get <100KB with this method. The
+                // Level::Best compression method gives us <20KB, but maybe we should add
+                // basebackup caching on compute shutdown first.
+                page_api::BaseBackupCompression::Gzip => Some(async_compression::Level::Fastest),
+            };
+
+            // Check for a cached basebackup.
+            let cached = timeline
+                .get_cached_basebackup_if_enabled(
+                    req.lsn,
+                    None,
+                    req.full,
+                    req.replica,
+                    gzip_level.is_some(),
+                )
+                .await;
+
+            let result = if let Some(mut cached) = cached {
+                // If we have a cached basebackup, send it.
+                tokio::io::copy(&mut cached, &mut simplex_write)
+                    .await
+                    .map(|_| ())
+                    .map_err(|err| BasebackupError::Client(err, "cached,copy"))
+            } else {
+                basebackup::send_basebackup_tarball(
+                    &mut simplex_write,
+                    &timeline,
+                    req.lsn,
+                    None,
+                    req.full,
+                    req.replica,
+                    gzip_level,
+                    &ctx,
+                )
+                .instrument(span) // propagate request span
+                .await
+            };
+            simplex_write
+                .shutdown()
+                .await
+                .map_err(|err| BasebackupError::Client(err, "simplex_write"))?;
            result
        });

@@ -3772,6 +3761,36 @@ impl proto::PageService for GrpcPageServiceHandler {
        let resp: page_api::GetSlruSegmentResponse = resp.segment;
        Ok(tonic::Response::new(resp.into()))
    }
+
+    #[instrument(skip_all, fields(lsn))]
+    async fn lease_lsn(
+        &self,
+        req: tonic::Request<proto::LeaseLsnRequest>,
+    ) -> Result<tonic::Response<proto::LeaseLsnResponse>, tonic::Status> {
+        let timeline = self.get_request_timeline(&req).await?;
+        let ctx = self.ctx.with_scope_timeline(&timeline);
+
+        // Validate and convert the request, and decorate the span.
+        let req: page_api::LeaseLsnRequest = req.into_inner().try_into()?;
+
+        span_record!(lsn=%req.lsn);
+
+        // Attempt to acquire a lease. Return FailedPrecondition if the lease could not be granted.
+        let lease_length = timeline.get_lsn_lease_length();
+        let expires = match timeline.renew_lsn_lease(req.lsn, lease_length, &ctx) {
+            Ok(lease) => lease.valid_until,
+            Err(err) => return Err(tonic::Status::failed_precondition(format!("{err}"))),
+        };
+
+        // TODO: is this spammy? Move it compute-side?
+        info!(
+            "acquired lease for {} until {}",
+            req.lsn,
+            chrono::DateTime::<Utc>::from(expires).to_rfc3339()
+        );
+
+        Ok(tonic::Response::new(expires.into()))
+    }
 }

 /// gRPC middleware layer that handles observability concerns:
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -3015,7 +3015,7 @@ mod tests {
        // This shard will get the even blocks
        let shard = ShardIdentity::from_params(
            ShardNumber(0),
-            &ShardParameters {
+            ShardParameters {
                count: ShardCount(2),
                stripe_size: ShardStripeSize(1),
            },
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -80,7 +80,7 @@ use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, Uninit
 use self::timeline::{
    EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError,
 };
-use crate::basebackup_cache::BasebackupPrepareSender;
+use crate::basebackup_cache::BasebackupCache;
 use crate::config::PageServerConf;
 use crate::context;
 use crate::context::RequestContextBuilder;
@@ -162,7 +162,7 @@ pub struct TenantSharedResources {
    pub remote_storage: GenericRemoteStorage,
    pub deletion_queue_client: DeletionQueueClient,
    pub l0_flush_global_state: L0FlushGlobalState,
-    pub basebackup_prepare_sender: BasebackupPrepareSender,
+    pub basebackup_cache: Arc<BasebackupCache>,
    pub feature_resolver: FeatureResolver,
 }

@@ -331,7 +331,7 @@ pub struct TenantShard {
    deletion_queue_client: DeletionQueueClient,

    /// A channel to send async requests to prepare a basebackup for the basebackup cache.
-    basebackup_prepare_sender: BasebackupPrepareSender,
+    basebackup_cache: Arc<BasebackupCache>,

    /// Cached logical sizes updated updated on each [`TenantShard::gather_size_inputs`].
    cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
@@ -1363,7 +1363,7 @@ impl TenantShard {
            remote_storage,
            deletion_queue_client,
            l0_flush_global_state,
-            basebackup_prepare_sender,
+            basebackup_cache,
            feature_resolver,
        } = resources;

@@ -1380,7 +1380,7 @@ impl TenantShard {
            remote_storage.clone(),
            deletion_queue_client,
            l0_flush_global_state,
-            basebackup_prepare_sender,
+            basebackup_cache,
            feature_resolver,
        ));

@@ -3872,6 +3872,10 @@ impl TenantShard {
        &self.tenant_shard_id
    }

+    pub(crate) fn get_shard_identity(&self) -> ShardIdentity {
+        self.shard_identity
+    }
+
    pub(crate) fn get_shard_stripe_size(&self) -> ShardStripeSize {
        self.shard_identity.stripe_size
    }
@@ -4380,7 +4384,7 @@ impl TenantShard {
        remote_storage: GenericRemoteStorage,
        deletion_queue_client: DeletionQueueClient,
        l0_flush_global_state: L0FlushGlobalState,
-        basebackup_prepare_sender: BasebackupPrepareSender,
+        basebackup_cache: Arc<BasebackupCache>,
        feature_resolver: FeatureResolver,
    ) -> TenantShard {
        assert!(!attached_conf.location.generation.is_none());
@@ -4485,7 +4489,7 @@ impl TenantShard {
            ongoing_timeline_detach: std::sync::Mutex::default(),
            gc_block: Default::default(),
            l0_flush_global_state,
-            basebackup_prepare_sender,
+            basebackup_cache,
            feature_resolver,
        }
    }
@@ -4525,6 +4529,10 @@ impl TenantShard {
        Ok(toml_edit::de::from_str::<LocationConf>(&config)?)
    }

+    /// Stores a tenant location config to disk.
+    ///
+    /// NB: make sure to call `ShardIdentity::assert_equal` before persisting a new config, to avoid
+    /// changes to shard parameters that may result in data corruption.
    #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
    pub(super) async fn persist_tenant_config(
        conf: &'static PageServerConf,
@@ -5414,7 +5422,7 @@ impl TenantShard {
            pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
            l0_compaction_trigger: self.l0_compaction_trigger.clone(),
            l0_flush_global_state: self.l0_flush_global_state.clone(),
-            basebackup_prepare_sender: self.basebackup_prepare_sender.clone(),
+            basebackup_cache: self.basebackup_cache.clone(),
            feature_resolver: self.feature_resolver.clone(),
        }
    }
@@ -6000,7 +6008,7 @@ pub(crate) mod harness {
        ) -> anyhow::Result<Arc<TenantShard>> {
            let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));

-            let (basebackup_requst_sender, _) = tokio::sync::mpsc::unbounded_channel();
+            let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None);

            let tenant = Arc::new(TenantShard::new(
                TenantState::Attaching,
@@ -6008,7 +6016,7 @@ pub(crate) mod harness {
                AttachedTenantConf::try_from(LocationConf::attached_single(
                    self.tenant_conf.clone(),
                    self.generation,
-                    &ShardParameters::default(),
+                    ShardParameters::default(),
                ))
                .unwrap(),
                self.shard_identity,
@@ -6018,7 +6026,7 @@ pub(crate) mod harness {
                self.deletion_queue.new_client(),
                // TODO: ideally we should run all unit tests with both configs
                L0FlushGlobalState::new(L0FlushConfig::default()),
-                basebackup_requst_sender,
+                basebackup_cache,
                FeatureResolver::new_disabled(),
            ));

@@ -11429,11 +11437,11 @@ mod tests {
        if left != right {
            eprintln!("---LEFT---");
            for left in left.iter() {
-                eprintln!("{}", left);
+                eprintln!("{left}");
            }
            eprintln!("---RIGHT---");
            for right in right.iter() {
-                eprintln!("{}", right);
+                eprintln!("{right}");
            }
            assert_eq!(left, right);
        }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -12,6 +12,7 @@
 use pageserver_api::models;
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
 use serde::{Deserialize, Serialize};
+use utils::critical;
 use utils::generation::Generation;

 #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
@@ -136,7 +137,7 @@ impl LocationConf {
    pub(crate) fn attached_single(
        tenant_conf: pageserver_api::models::TenantConfig,
        generation: Generation,
-        shard_params: &models::ShardParameters,
+        shard_params: models::ShardParameters,
    ) -> Self {
        Self {
            mode: LocationMode::Attached(AttachedLocationConfig {
@@ -171,6 +172,16 @@ impl LocationConf {
            }
        }

+        // This should never happen.
+        // TODO: turn this into a proper assertion.
+        if stripe_size != self.shard.stripe_size {
+            critical!(
+                "stripe size mismatch: {} != {}",
+                self.shard.stripe_size,
+                stripe_size,
+            );
+        }
+
        self.shard.stripe_size = stripe_size;
    }

--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -880,6 +880,9 @@ impl TenantManager {
        // phase of writing config and/or waiting for flush, before returning.
        match fast_path_taken {
            Some(FastPathModified::Attached(tenant)) => {
+                tenant
+                    .shard_identity
+                    .assert_equal(new_location_config.shard);
                TenantShard::persist_tenant_config(
                    self.conf,
                    &tenant_shard_id,
@@ -914,7 +917,10 @@ impl TenantManager {

                return Ok(Some(tenant));
            }
-            Some(FastPathModified::Secondary(_secondary_tenant)) => {
+            Some(FastPathModified::Secondary(secondary_tenant)) => {
+                secondary_tenant
+                    .shard_identity
+                    .assert_equal(new_location_config.shard);
                TenantShard::persist_tenant_config(
                    self.conf,
                    &tenant_shard_id,
@@ -948,6 +954,10 @@ impl TenantManager {

        match slot_guard.get_old_value() {
            Some(TenantSlot::Attached(tenant)) => {
+                tenant
+                    .shard_identity
+                    .assert_equal(new_location_config.shard);
+
                // The case where we keep a Tenant alive was covered above in the special case
                // for Attached->Attached transitions in the same generation.  By this point,
                // if we see an attached tenant we know it will be discarded and should be
@@ -981,9 +991,13 @@ impl TenantManager {
                // rather than assuming it to be empty.
                spawn_mode = SpawnMode::Eager;
            }
-            Some(TenantSlot::Secondary(state)) => {
+            Some(TenantSlot::Secondary(secondary_tenant)) => {
+                secondary_tenant
+                    .shard_identity
+                    .assert_equal(new_location_config.shard);
+
                info!("Shutting down secondary tenant");
-                state.shutdown().await;
+                secondary_tenant.shutdown().await;
            }
            Some(TenantSlot::InProgress(_)) => {
                // This should never happen: acquire_slot should error out
@@ -2200,7 +2214,7 @@ impl TenantManager {
        selector: ShardSelector,
    ) -> ShardResolveResult {
        let tenants = self.tenants.read().unwrap();
-        let mut want_shard = None;
+        let mut want_shard: Option<ShardIndex> = None;
        let mut any_in_progress = None;

        match &*tenants {
@@ -2225,14 +2239,23 @@ impl TenantManager {
                            return ShardResolveResult::Found(tenant.clone());
                        }
                        ShardSelector::Page(key) => {
-                            // First slot we see for this tenant, calculate the expected shard number
-                            // for the key: we will use this for checking if this and subsequent
-                            // slots contain the key, rather than recalculating the hash each time.
-                            if want_shard.is_none() {
-                                want_shard = Some(tenant.shard_identity.get_shard_number(&key));
+                            // Each time we find an attached slot with a different shard count,
+                            // recompute the expected shard number: during shard splits we might
+                            // have multiple shards with the old shard count.
+                            if want_shard.is_none()
+                                || want_shard.unwrap().shard_count != tenant.shard_identity.count
+                            {
+                                want_shard = Some(ShardIndex {
+                                    shard_number: tenant.shard_identity.get_shard_number(&key),
+                                    shard_count: tenant.shard_identity.count,
+                                });
                            }

-                            if Some(tenant.shard_identity.number) == want_shard {
+                            if Some(ShardIndex {
+                                shard_number: tenant.shard_identity.number,
+                                shard_count: tenant.shard_identity.count,
+                            }) == want_shard
+                            {
                                return ShardResolveResult::Found(tenant.clone());
                            }
                        }
@@ -2891,14 +2914,18 @@ mod tests {
    use std::collections::BTreeMap;
    use std::sync::Arc;

+    use camino::Utf8PathBuf;
    use storage_broker::BrokerClientChannel;
    use tracing::Instrument;

    use super::super::harness::TenantHarness;
    use super::TenantsMap;
-    use crate::tenant::{
-        TenantSharedResources,
-        mgr::{BackgroundPurges, TenantManager, TenantSlot},
+    use crate::{
+        basebackup_cache::BasebackupCache,
+        tenant::{
+            TenantSharedResources,
+            mgr::{BackgroundPurges, TenantManager, TenantSlot},
+        },
    };

    #[tokio::test(start_paused = true)]
@@ -2924,9 +2951,7 @@ mod tests {
        // Invoke remove_tenant_from_memory with a cleanup hook that blocks until we manually
        // permit it to proceed: that will stick the tenant in InProgress

-        let (basebackup_prepare_sender, _) = tokio::sync::mpsc::unbounded_channel::<
-            crate::basebackup_cache::BasebackupPrepareRequest,
-        >();
+        let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None);

        let tenant_manager = TenantManager {
            tenants: std::sync::RwLock::new(TenantsMap::Open(tenants)),
@@ -2940,7 +2965,7 @@ mod tests {
                l0_flush_global_state: crate::l0_flush::L0FlushGlobalState::new(
                    h.conf.l0_flush.clone(),
                ),
-                basebackup_prepare_sender,
+                basebackup_cache,
                feature_resolver: crate::feature_resolver::FeatureResolver::new_disabled(),
            },
            cancel: tokio_util::sync::CancellationToken::new(),
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -101,7 +101,7 @@ pub(crate) struct SecondaryTenant {
    // Secondary mode does not need the full shard identity or the pageserver_api::models::TenantConfig.  However,
    // storing these enables us to report our full LocationConf, enabling convenient reconciliation
    // by the control plane (see [`Self::get_location_conf`])
-    shard_identity: ShardIdentity,
+    pub(crate) shard_identity: ShardIdentity,
    tenant_conf: std::sync::Mutex<pageserver_api::models::TenantConfig>,

    // Internal state used by the Downloader.
--- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs
@@ -55,11 +55,11 @@ pub struct BatchLayerWriter {
 }

 impl BatchLayerWriter {
-    pub async fn new(conf: &'static PageServerConf) -> anyhow::Result<Self> {
-        Ok(Self {
+    pub fn new(conf: &'static PageServerConf) -> Self {
+        Self {
            generated_layer_writers: Vec::new(),
            conf,
-        })
+        }
    }

    pub fn add_unfinished_image_writer(
@@ -209,6 +209,7 @@ impl<'a> SplitImageLayerWriter<'a> {
    ) -> anyhow::Result<Self> {
        Ok(Self {
            target_layer_size,
+            // XXX make this lazy like in SplitDeltaLayerWriter?
            inner: ImageLayerWriter::new(
                conf,
                timeline_id,
@@ -223,7 +224,7 @@ impl<'a> SplitImageLayerWriter<'a> {
            conf,
            timeline_id,
            tenant_shard_id,
-            batches: BatchLayerWriter::new(conf).await?,
+            batches: BatchLayerWriter::new(conf),
            lsn,
            start_key,
            gate,
@@ -319,7 +320,7 @@ pub struct SplitDeltaLayerWriter<'a> {
 }

 impl<'a> SplitDeltaLayerWriter<'a> {
-    pub async fn new(
+    pub fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
@@ -327,8 +328,8 @@ impl<'a> SplitDeltaLayerWriter<'a> {
        target_layer_size: u64,
        gate: &'a utils::sync::gate::Gate,
        cancel: CancellationToken,
-    ) -> anyhow::Result<Self> {
-        Ok(Self {
+    ) -> Self {
+        Self {
            target_layer_size,
            inner: None,
            conf,
@@ -336,10 +337,10 @@ impl<'a> SplitDeltaLayerWriter<'a> {
            tenant_shard_id,
            lsn_range,
            last_key_written: Key::MIN,
-            batches: BatchLayerWriter::new(conf).await?,
+            batches: BatchLayerWriter::new(conf),
            gate,
            cancel,
-        })
+        }
    }

    pub async fn put_value(
@@ -510,9 +511,7 @@ mod tests {
            4 * 1024 * 1024,
            &tline.gate,
            tline.cancel.clone(),
-        )
-        .await
-        .unwrap();
+        );

        image_writer
            .put_image(get_key(0), get_img(0), &ctx)
@@ -590,9 +589,7 @@ mod tests {
            4 * 1024 * 1024,
            &tline.gate,
            tline.cancel.clone(),
-        )
-        .await
-        .unwrap();
+        );
        const N: usize = 2000;
        for i in 0..N {
            let i = i as u32;
@@ -692,9 +689,7 @@ mod tests {
            4 * 1024,
            &tline.gate,
            tline.cancel.clone(),
-        )
-        .await
-        .unwrap();
+        );

        image_writer
            .put_image(get_key(0), get_img(0), &ctx)
@@ -770,9 +765,7 @@ mod tests {
            4 * 1024 * 1024,
            &tline.gate,
            tline.cancel.clone(),
-        )
-        .await
-        .unwrap();
+        );

        for i in 0..N {
            let i = i as u32;
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -17,14 +17,17 @@ use tracing::*;
 use utils::backoff::exponential_backoff_duration;
 use utils::completion::Barrier;
 use utils::pausable_failpoint;
+use utils::sync::gate::GateError;

 use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
 use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind};
+use crate::tenant::blob_io::WriteBlobError;
 use crate::tenant::throttle::Stats;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::compaction::CompactionOutcome;
 use crate::tenant::{TenantShard, TenantState};
+use crate::virtual_file::owned_buffers_io::write::FlushTaskError;

 /// Semaphore limiting concurrent background tasks (across all tenants).
 ///
@@ -313,7 +316,20 @@ pub(crate) fn log_compaction_error(
            let timeline = root_cause
                .downcast_ref::<PageReconstructError>()
                .is_some_and(|e| e.is_stopping());
-            let is_stopping = upload_queue || timeline;
+            let buffered_writer_flush_task_canelled = root_cause
+                .downcast_ref::<FlushTaskError>()
+                .is_some_and(|e| e.is_cancel());
+            let write_blob_cancelled = root_cause
+                .downcast_ref::<WriteBlobError>()
+                .is_some_and(|e| e.is_cancel());
+            let gate_closed = root_cause
+                .downcast_ref::<GateError>()
+                .is_some_and(|e| e.is_cancel());
+            let is_stopping = upload_queue
+                || timeline
+                || buffered_writer_flush_task_canelled
+                || write_blob_cancelled
+                || gate_closed;

            if is_stopping {
                Level::INFO
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -95,12 +95,12 @@ use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer};
 use super::tasks::log_compaction_error;
 use super::upload_queue::NotInitialized;
 use super::{
-    AttachedTenantConf, BasebackupPrepareSender, GcError, HeatMapTimeline, MaybeOffloaded,
+    AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded,
    debug_assert_current_span_has_tenant_and_timeline_id,
 };
 use crate::PERF_TRACE_TARGET;
 use crate::aux_file::AuxFileSizeEstimator;
-use crate::basebackup_cache::BasebackupPrepareRequest;
+use crate::basebackup_cache::BasebackupCache;
 use crate::config::PageServerConf;
 use crate::context::{
    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
@@ -201,7 +201,7 @@ pub struct TimelineResources {
    pub pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
    pub l0_compaction_trigger: Arc<Notify>,
    pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
-    pub basebackup_prepare_sender: BasebackupPrepareSender,
+    pub basebackup_cache: Arc<BasebackupCache>,
    pub feature_resolver: FeatureResolver,
 }

@@ -448,7 +448,7 @@ pub struct Timeline {
    wait_lsn_log_slow: tokio::sync::Semaphore,

    /// A channel to send async requests to prepare a basebackup for the basebackup cache.
-    basebackup_prepare_sender: BasebackupPrepareSender,
+    basebackup_cache: Arc<BasebackupCache>,

    feature_resolver: FeatureResolver,
 }
@@ -763,7 +763,7 @@ pub(crate) enum CreateImageLayersError {
    PageReconstructError(#[source] PageReconstructError),

    #[error(transparent)]
-    Other(#[from] anyhow::Error),
+    Other(anyhow::Error),
 }

 impl From<layer_manager::Shutdown> for CreateImageLayersError {
@@ -2500,6 +2500,37 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.basebackup_cache_enabled)
    }

+    /// Try to get a basebackup from the on-disk cache.
+    pub(crate) async fn get_cached_basebackup(&self, lsn: Lsn) -> Option<tokio::fs::File> {
+        self.basebackup_cache
+            .get(self.tenant_shard_id.tenant_id, self.timeline_id, lsn)
+            .await
+    }
+
+    /// Convenience method to attempt fetching a basebackup for the timeline if enabled and safe for
+    /// the given request parameters.
+    ///
+    /// TODO: consider moving this onto GrpcPageServiceHandler once the libpq handler is gone.
+    pub async fn get_cached_basebackup_if_enabled(
+        &self,
+        lsn: Option<Lsn>,
+        prev_lsn: Option<Lsn>,
+        full: bool,
+        replica: bool,
+        gzip: bool,
+    ) -> Option<tokio::fs::File> {
+        if !self.is_basebackup_cache_enabled() || !self.basebackup_cache.is_enabled() {
+            return None;
+        }
+        // We have to know which LSN to fetch the basebackup for.
+        let lsn = lsn?;
+        // We only cache gzipped, non-full basebackups for primary computes with automatic prev_lsn.
+        if prev_lsn.is_some() || full || replica || !gzip {
+            return None;
+        }
+        self.get_cached_basebackup(lsn).await
+    }
+
    /// Prepare basebackup for the given LSN and store it in the basebackup cache.
    /// The method is asynchronous and returns immediately.
    /// The actual basebackup preparation is performed in the background
@@ -2521,17 +2552,8 @@ impl Timeline {
            return;
        }

-        let res = self
-            .basebackup_prepare_sender
-            .send(BasebackupPrepareRequest {
-                tenant_shard_id: self.tenant_shard_id,
-                timeline_id: self.timeline_id,
-                lsn,
-            });
-        if let Err(e) = res {
-            // May happen during shutdown, it's not critical.
-            info!("Failed to send shutdown checkpoint: {e:#}");
-        }
+        self.basebackup_cache
+            .send_prepare(self.tenant_shard_id, self.timeline_id, lsn);
    }
 }

@@ -3088,7 +3110,7 @@ impl Timeline {

                wait_lsn_log_slow: tokio::sync::Semaphore::new(1),

-                basebackup_prepare_sender: resources.basebackup_prepare_sender,
+                basebackup_cache: resources.basebackup_cache,

                feature_resolver: resources.feature_resolver,
            };
@@ -4658,6 +4680,16 @@ impl Timeline {
        mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>,
        ctx: &RequestContext,
    ) {
+        // Always notify waiters about the flush loop exiting since the loop might stop
+        // when the timeline hasn't been cancelled.
+        let scopeguard_rx = layer_flush_start_rx.clone();
+        scopeguard::defer! {
+            let (flush_counter, _) = *scopeguard_rx.borrow();
+            let _ = self
+                .layer_flush_done_tx
+                .send_replace((flush_counter, Err(FlushLayerError::Cancelled)));
+        }
+
        // Subscribe to L0 delta layer updates, for compaction backpressure.
        let mut watch_l0 = match self
            .layers
@@ -4687,9 +4719,6 @@ impl Timeline {
            let result = loop {
                if self.cancel.is_cancelled() {
                    info!("dropping out of flush loop for timeline shutdown");
-                    // Note: we do not bother transmitting into [`layer_flush_done_tx`], because
-                    // anyone waiting on that will respect self.cancel as well: they will stop
-                    // waiting at the same time we as drop out of this loop.
                    return;
                }

@@ -5561,7 +5590,7 @@ impl Timeline {
                self.should_check_if_image_layers_required(lsn)
            };

-        let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?;
+        let mut batch_image_writer = BatchLayerWriter::new(self.conf);

        let mut all_generated = true;

@@ -5665,7 +5694,8 @@ impl Timeline {
                self.cancel.clone(),
                ctx,
            )
-            .await?;
+            .await
+            .map_err(CreateImageLayersError::Other)?;

            fail_point!("image-layer-writer-fail-before-finish", |_| {
                Err(CreateImageLayersError::Other(anyhow::anyhow!(
@@ -5760,7 +5790,10 @@ impl Timeline {
            }
        }

-        let image_layers = batch_image_writer.finish(self, ctx).await?;
+        let image_layers = batch_image_writer
+            .finish(self, ctx)
+            .await
+            .map_err(CreateImageLayersError::Other)?;

        let mut guard = self.layers.write(LayerManagerLockHolder::Compaction).await;

--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -3531,10 +3531,7 @@ impl Timeline {
            self.get_compaction_target_size(),
            &self.gate,
            self.cancel.clone(),
-        )
-        .await
-        .context("failed to create delta layer writer")
-        .map_err(CompactionError::Other)?;
+        );

        #[derive(Default)]
        struct RewritingLayers {
@@ -4330,7 +4327,8 @@ impl TimelineAdaptor {
            self.timeline.cancel.clone(),
            ctx,
        )
-        .await?;
+        .await
+        .map_err(CreateImageLayersError::Other)?;

        fail_point!("image-layer-writer-fail-before-finish", |_| {
            Err(CreateImageLayersError::Other(anyhow::anyhow!(
@@ -4339,7 +4337,10 @@ impl TimelineAdaptor {
        });

        let keyspace = KeySpace {
-            ranges: self.get_keyspace(key_range, lsn, ctx).await?,
+            ranges: self
+                .get_keyspace(key_range, lsn, ctx)
+                .await
+                .map_err(CreateImageLayersError::Other)?,
        };
        // TODO set proper (stateful) start. The create_image_layer_for_rel_blocks function mostly
        let outcome = self
@@ -4358,9 +4359,13 @@ impl TimelineAdaptor {
            unfinished_image_layer,
        } = outcome
        {
-            let (desc, path) = unfinished_image_layer.finish(ctx).await?;
+            let (desc, path) = unfinished_image_layer
+                .finish(ctx)
+                .await
+                .map_err(CreateImageLayersError::Other)?;
            let image_layer =
-                Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?;
+                Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)
+                    .map_err(CreateImageLayersError::Other)?;
            self.new_images.push(image_layer);
        }

--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -241,8 +241,17 @@ impl DeleteTimelineFlow {
                {
                    Ok(r) => r,
                    Err(DownloadError::NotFound) => {
-                        // Deletion is already complete
+                        // Deletion is already complete.
+                        // As we came here, we will need to remove the timeline from the tenant though.
                        tracing::info!("Timeline already deleted in remote storage");
+                        if let TimelineOrOffloaded::Offloaded(_) = &timeline {
+                            // We only supoprt this for offloaded timelines, as we don't know which state non-offloaded timelines are in.
+                            tracing::info!(
+                                "Timeline with gone index part is offloaded timeline. Removing from tenant."
+                            );
+                            remove_maybe_offloaded_timeline_from_tenant(tenant, &timeline, &guard)
+                                .await?;
+                        }
                        return Ok(());
                    }
                    Err(e) => {
--- a/pageserver/src/tenant/timeline/handle.rs
+++ b/pageserver/src/tenant/timeline/handle.rs
@@ -887,7 +887,7 @@ mod tests {
            .expect("we still have it");
    }

-    fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key {
+    fn make_relation_key_for_shard(shard: ShardNumber, params: ShardParameters) -> Key {
        rel_block_to_key(
            RelTag {
                spcnode: 1663,
@@ -917,14 +917,14 @@ mod tests {
        let child0 = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
-            shard: ShardIdentity::from_params(ShardNumber(0), &child_params),
+            shard: ShardIdentity::from_params(ShardNumber(0), child_params),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
        let child1 = Arc::new_cyclic(|myself| StubTimeline {
            gate: Default::default(),
            id: timeline_id,
-            shard: ShardIdentity::from_params(ShardNumber(1), &child_params),
+            shard: ShardIdentity::from_params(ShardNumber(1), child_params),
            per_timeline_state: PerTimelineState::default(),
            myself: myself.clone(),
        });
@@ -937,7 +937,7 @@ mod tests {
            let handle = cache
                .get(
                    timeline_id,
-                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
+                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)),
                    &StubManager {
                        shards: vec![parent.clone()],
                    },
@@ -961,7 +961,7 @@ mod tests {
            let handle = cache
                .get(
                    timeline_id,
-                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
+                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)),
                    &StubManager {
                        shards: vec![], // doesn't matter what's in here, the cache is fully loaded
                    },
@@ -978,7 +978,7 @@ mod tests {
        let parent_handle = cache
            .get(
                timeline_id,
-                ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)),
+                ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), child_params)),
                &StubManager {
                    shards: vec![parent.clone()],
                },
@@ -995,7 +995,7 @@ mod tests {
            let handle = cache
                .get(
                    timeline_id,
-                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)),
+                    ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)),
                    &StubManager {
                        shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop
                    },
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -275,12 +275,20 @@ pub(super) async fn handle_walreceiver_connection(
    let copy_stream = replication_client.copy_both_simple(&query).await?;
    let mut physical_stream = pin!(ReplicationStream::new(copy_stream));

-    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
-        .await
-        .map_err(|e| match e.kind {
-            crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
-            _ => WalReceiverError::Other(e.into()),
-        })?;
+    let walingest_future = WalIngest::new(timeline.as_ref(), startpoint, &ctx);
+    let walingest_res = select! {
+        walingest_res = walingest_future => walingest_res,
+        _ = cancellation.cancelled() => {
+            // We are doing reads in WalIngest::new, and those can hang as they come from the network.
+            // Timeline cancellation hits the walreceiver cancellation token before it hits the timeline global one.
+            debug!("Connection cancelled");
+            return Err(WalReceiverError::Cancelled);
+        },
+    };
+    let mut walingest = walingest_res.map_err(|e| match e.kind {
+        crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
+        _ => WalReceiverError::Other(e.into()),
+    })?;

    let (format, compression) = match protocol {
        PostgresClientProtocol::Interpreted {
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -5,6 +5,7 @@ MODULE_big = neon
 OBJS = \
 	$(WIN32RES) \
 	communicator.o \
+	communicator_new.o \
 	extension_server.o \
 	file_cache.o \
 	hll.o \
@@ -22,12 +23,18 @@ OBJS = \
 	walproposer.o \
 	walproposer_pg.o \
 	neon_ddl_handler.o \
-	walsender_hooks.o
+	walsender_hooks.o \
+	$(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a

 PG_CPPFLAGS = -I$(libpq_srcdir)
 SHLIB_LINK_INTERNAL = $(libpq)
 SHLIB_LINK = -lcurl

+UNAME_S := $(shell uname -s)
+ifeq ($(UNAME_S), Darwin)
+    SHLIB_LINK += -framework Security -framework CoreFoundation -framework SystemConfiguration
+endif
+
 EXTENSION = neon
 DATA = \
 	neon--1.0.sql \
@@ -54,6 +61,17 @@ WALPROP_OBJS = \
 	neon_utils.o \
 	walproposer_compat.o

+# libcommunicator.a is built by cargo from the Rust sources under communicator/
+# subdirectory. `cargo build` also generates communicator_bindings.h.
+neon.o: communicator/communicator_bindings.h
+
+$(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h &:
+	(cd $(srcdir)/communicator && cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE))
+
+# Force `cargo build` every time. Some of the Rust sources might have
+# changed.
+.PHONY: $(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h
+
 .PHONY: walproposer-lib
 walproposer-lib: CPPFLAGS += -DWALPROPOSER_LIB
 walproposer-lib: libwalproposer.a;
--- a/pgxn/neon/communicator/Cargo.lock
+++ b/pgxn/neon/communicator/Cargo.lock
@@ -0,0 +1,372 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "addr2line"
+version = "0.24.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1"
+dependencies = [
+ "gimli",
+]
+
+[[package]]
+name = "adler2"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
+
+[[package]]
+name = "backtrace"
+version = "0.3.74"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a"
+dependencies = [
+ "addr2line",
+ "cfg-if",
+ "libc",
+ "miniz_oxide",
+ "object",
+ "rustc-demangle",
+ "windows-targets",
+]
+
+[[package]]
+name = "base64"
+version = "0.22.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
+
+[[package]]
+name = "bytes"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "communicator"
+version = "0.1.0"
+dependencies = [
+ "tonic",
+]
+
+[[package]]
+name = "fnv"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+
+[[package]]
+name = "futures-core"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
+
+[[package]]
+name = "gimli"
+version = "0.31.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
+
+[[package]]
+name = "http"
+version = "1.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565"
+dependencies = [
+ "bytes",
+ "fnv",
+ "itoa",
+]
+
+[[package]]
+name = "http-body"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
+dependencies = [
+ "bytes",
+ "http",
+]
+
+[[package]]
+name = "http-body-util"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "http",
+ "http-body",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
+
+[[package]]
+name = "libc"
+version = "0.2.171"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6"
+
+[[package]]
+name = "memchr"
+version = "2.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
+
+[[package]]
+name = "miniz_oxide"
+version = "0.8.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff70ce3e48ae43fa075863cef62e8b43b71a4f2382229920e0df362592919430"
+dependencies = [
+ "adler2",
+]
+
+[[package]]
+name = "object"
+version = "0.36.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.21.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
+
+[[package]]
+name = "percent-encoding"
+version = "2.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
+
+[[package]]
+name = "pin-project"
+version = "1.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a"
+dependencies = [
+ "pin-project-internal",
+]
+
+[[package]]
+name = "pin-project-internal"
+version = "1.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "pin-project-lite"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.94"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "rustc-demangle"
+version = "0.1.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
+
+[[package]]
+name = "syn"
+version = "2.0.100"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "tokio"
+version = "1.44.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6b88822cbe49de4185e3a4cbf8321dd487cf5fe0c5c65695fef6346371e9c48"
+dependencies = [
+ "backtrace",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "tokio-stream"
+version = "0.1.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047"
+dependencies = [
+ "futures-core",
+ "pin-project-lite",
+ "tokio",
+]
+
+[[package]]
+name = "tonic"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85839f0b32fd242bb3209262371d07feda6d780d16ee9d2bc88581b89da1549b"
+dependencies = [
+ "base64",
+ "bytes",
+ "http",
+ "http-body",
+ "http-body-util",
+ "percent-encoding",
+ "pin-project",
+ "tokio-stream",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "tower-layer"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
+
+[[package]]
+name = "tower-service"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
+
+[[package]]
+name = "tracing"
+version = "0.1.41"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
+dependencies = [
+ "pin-project-lite",
+ "tracing-attributes",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-attributes"
+version = "0.1.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tracing-core"
+version = "0.1.33"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
+dependencies = [
+ "once_cell",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
--- a/pgxn/neon/communicator/Cargo.toml
+++ b/pgxn/neon/communicator/Cargo.toml
@@ -0,0 +1,39 @@
+[package]
+name = "communicator"
+version = "0.1.0"
+edition = "2024"
+
+[features]
+testing = []
+
+[lib]
+crate-type = ["staticlib"]
+
+[dependencies]
+axum.workspace = true
+bytes.workspace = true
+clashmap.workspace = true
+http.workspace = true
+libc.workspace = true
+nix.workspace = true
+atomic_enum = "0.3.0"
+prometheus.workspace = true
+prost.workspace = true
+tonic = { version = "0.12.0", default-features = false, features=["codegen", "prost", "transport"] }
+tokio = { version = "1.43.1", features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] }
+tokio-pipe = { version = "0.2.12" }
+thiserror.workspace = true
+tracing.workspace = true
+tracing-subscriber.workspace = true
+
+metrics.workspace = true
+uring-common = { workspace = true, features = ["bytes"] }
+
+pageserver_client_grpc.workspace = true
+pageserver_page_api.workspace = true
+
+neon-shmem.workspace = true
+utils.workspace = true
+
+[build-dependencies]
+cbindgen.workspace = true
--- a/pgxn/neon/communicator/README.md
+++ b/pgxn/neon/communicator/README.md
@@ -0,0 +1,123 @@
+# Communicator
+
+This package provides the so-called "compute-pageserver communicator",
+or just "communicator" in short. It runs in a PostgreSQL server, as
+part of the neon extension, and handles the communication with the
+pageservers. On the PostgreSQL side, the glue code in pgxn/neon/ uses
+the communicator to implement the PostgreSQL Storage Manager (SMGR)
+interface.
+
+## Design criteria
+
+- Low latency
+- Saturate a 10 Gbit / s network interface without becoming a bottleneck
+
+## Source code view
+
+pgxn/neon/communicator_new.c
+	Contains the glue that interact with PostgreSQL code and the Rust
+	communicator code.
+
+pgxn/neon/communicator/src/backend_interface.rs
+	The entry point for calls from each backend.
+
+pgxn/neon/communicator/src/init.rs
+	Initialization at server startup
+
+pgxn/neon/communicator/src/worker_process/
+    Worker process main loop and glue code
+
+At compilation time, pgxn/neon/communicator/ produces a static
+library, libcommunicator.a. It is linked to the neon.so extension
+library.
+
+The real networking code, which is independent of PostgreSQL, is in
+the pageserver/client_grpc crate.
+
+## Process view
+
+The communicator runs in a dedicated background worker process, the
+"communicator process". The communicator uses a multi-threaded Tokio
+runtime to execute the IO requests. So the communicator process has
+multiple threads running. That's unusual for Postgres processes and
+care must be taken to make that work.
+
+### Backend <-> worker communication
+
+Each backend has a number of I/O request slots in shared memory. The
+slots are statically allocated for each backend, and must not be
+accessed by other backends. The worker process reads requests from the
+shared memory slots, and writes responses back to the slots.
+
+To submit an IO request, first pick one of your backend's free slots,
+and write the details of the IO request in the slot. Finally, update
+the 'state' field of the slot to Submitted. That informs the worker
+process that it can start processing the request. Once the state has
+been set to Submitted, the backend *must not* access the slot anymore,
+until the worker process sets its state to 'Completed'. In other
+words, each slot is owned by either the backend or the worker process
+at all times, and the 'state' field indicates who has ownership at the
+moment.
+
+To inform the worker process that a request slot has a pending IO
+request, there's a pipe shared by the worker process and all backend
+processes. After you have changed the slot's state to Submitted, write
+the index of the request slot to the pipe. This wakes up the worker
+process.
+
+(Note that the pipe is just used for wakeups, but the worker process
+is free to pick up Submitted IO requests even without receiving the
+wakeup. As of this writing, it doesn't do that, but it might be useful
+in the future to reduce latency even further, for example.)
+
+When the worker process has completed processing the request, it
+writes the result back in the request slot. A GetPage request can also
+contain a pointer to buffer in the shared buffer cache. In that case,
+the worker process writes the resulting page contents directly to the
+buffer, and just a result code in the request slot. It then updates
+the 'state' field to Completed, which passes the owner ship back to
+the originating backend. Finally, it signals the process Latch of the
+originating backend, waking it up.
+
+### Differences between PostgreSQL v16, v17 and v18
+
+PostgreSQL v18 introduced the new AIO mechanism. The PostgreSQL AIO
+mechanism uses a very similar mechanism as described in the previous
+section, for the communication between AIO worker processes and
+backends. With our communicator, the AIO worker processes are not
+used, but we use the same PgAioHandle request slots as in upstream.
+For Neon-specific IO requests like GetDbSize, a neon request slot is
+used. But for the actual IO requests, the request slot merely contains
+a pointer to the PgAioHandle slot. The worker process updates the
+status of that, calls the IO callbacks upon completionetc, just like
+the upstream AIO worker processes do.
+
+## Sequence diagram
+
+                      neon
+    PostgreSQL     extension       backend_interface.rs  worker_process.rs    processor    tonic
+       |               .                    .                   .                 .
+	   | smgr_read()   .                    .                   .                 .
+	   +-------------> +                    .                   .                 .
+	   .               |                    .                   .                 .
+	   .               |  rcommunicator_    .                   .                 .
+	   .               | get_page_at_lsn    .                   .                 .
+	   .               +------------------> +                   .                 .
+                                            |                   .                 .
+                                            | write request to  .                 .                 .
+                                            | slot              .                 .
+                                            |                   .                 .
+                                            |                   .                 .
+											| submit_request()  .                 .
+											+-----------------> +                 .
+											|                   |                 .
+											|					| db_size_request .               .
+																+---------------->.
+																                  . TODO
+
+
+
+### Compute <-> pageserver protocol
+
+The protocol between Compute and the pageserver is based on gRPC. See `protos/`.
+
--- a/pgxn/neon/communicator/build.rs
+++ b/pgxn/neon/communicator/build.rs
@@ -0,0 +1,22 @@
+use std::env;
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
+
+    cbindgen::generate(crate_dir).map_or_else(
+        |error| match error {
+            cbindgen::Error::ParseSyntaxError { .. } => {
+                // This means there was a syntax error in the Rust sources. Don't panic, because
+                // we want the build to continue and the Rust compiler to hit the error. The
+                // Rust compiler produces a better error message than cbindgen.
+                eprintln!("Generating C bindings failed because of a Rust syntax error");
+            }
+            e => panic!("Unable to generate C bindings: {e:?}"),
+        },
+        |bindings| {
+            bindings.write_to_file("communicator_bindings.h");
+        },
+    );
+
+    Ok(())
+}
--- a/pgxn/neon/communicator/cbindgen.toml
+++ b/pgxn/neon/communicator/cbindgen.toml
@@ -0,0 +1,4 @@
+language = "C"
+
+[enum]
+prefix_with_name = true
--- a/pgxn/neon/communicator/src/backend_comms.rs
+++ b/pgxn/neon/communicator/src/backend_comms.rs
@@ -0,0 +1,204 @@
+//! This module implements a request/response "slot" for submitting requests from backends
+//! to the communicator process.
+//!
+//! NB: The "backend" side of this code runs in Postgres backend processes,
+//! which means that it is not safe to use the 'tracing' crate for logging, nor
+//! to launch threads or use tokio tasks.
+use std::cell::UnsafeCell;
+use std::sync::atomic::fence;
+use std::sync::atomic::{AtomicI32, Ordering};
+
+use crate::neon_request::{NeonIORequest, NeonIOResult};
+
+use atomic_enum::atomic_enum;
+
+/// One request/response slot. Each backend has its own set of slots that it uses.
+///
+/// This is the moral equivalent of PgAioHandle for Postgres AIO requests
+/// Like PgAioHandle, try to keep this small.
+///
+/// There is an array of these in shared memory. Therefore, this must be Sized.
+///
+/// ## Lifecycle of a request
+///
+/// The slot is always owned by either the backend process or the communicator
+/// process, depending on the 'state'. Only the owning process is allowed to
+/// read or modify the slot, except for reading the 'state' itself to check who
+/// owns it.
+///
+/// A slot begins in the Idle state, where it is owned by the backend process.
+/// To submit a request, the backend process fills the slot with the request
+/// data, and changes it to the Submitted state. After changing the state, the
+/// slot is owned by the communicator process, and the backend is not allowed
+/// to access it until the communicator process marks it as Completed.
+///
+/// When the communicator process sees that the slot is in Submitted state, it
+/// starts to process the request. After processing the request, it stores the
+/// result in the slot, and changes the state to Completed. It is now owned by
+/// the backend process again, which may now read the result, and reuse the
+/// slot for a new request.
+///
+/// For correctness of the above protocol, we really only need two states:
+/// "owned by backend" and "owned by communicator process. But to help with
+/// debugging, there are a few more states. When the backend starts to fill in
+/// the request details in the slot, it first sets the state from Idle to
+/// Filling, and when it's done with that, from Filling to Submitted. In the
+/// Filling state, the slot is still owned by the backend. Similarly, when the
+/// communicator process starts to process a request, it sets it to Processing
+/// state first, but the slot is still owned by the communicator process.
+///
+/// This struct doesn't handle waking up the communicator process when a request
+/// has been submitted or when a response is ready. We only store the 'owner_procno'
+/// which can be used for waking up the backend on completion, but the wakeups are
+/// performed elsewhere.
+pub struct NeonIOHandle {
+    /// similar to PgAioHandleState
+    state: AtomicNeonIOHandleState,
+
+    /// The owning process's ProcNumber. The worker process uses this to set the process's
+    /// latch on completion.
+    ///
+    /// (This could be calculated from num_neon_request_slots_per_backend and the index of
+    /// this slot in the overall 'neon_requst_slots array')
+    owner_procno: AtomicI32,
+
+    /// SAFETY: This is modified by fill_request(), after it has established ownership
+    /// of the slot by setting state from Idle to Filling
+    request: UnsafeCell<NeonIORequest>,
+
+    /// valid when state is Completed
+    ///
+    /// SAFETY: This is modified by RequestProcessingGuard::complete(). There can be
+    /// only one RequestProcessingGuard outstanding for a slot at a time, because
+    /// it is returned by start_processing_request() which checks the state, so
+    /// RequestProcessingGuard has exclusive access to the slot.
+    result: UnsafeCell<NeonIOResult>,
+}
+
+// The protocol described in the "Lifecycle of a request" section above ensures
+// the safe access to the fields
+unsafe impl Send for NeonIOHandle {}
+unsafe impl Sync for NeonIOHandle {}
+
+impl Default for NeonIOHandle {
+    fn default() -> NeonIOHandle {
+        NeonIOHandle {
+            owner_procno: AtomicI32::new(-1),
+            request: UnsafeCell::new(NeonIORequest::Empty),
+            result: UnsafeCell::new(NeonIOResult::Empty),
+            state: AtomicNeonIOHandleState::new(NeonIOHandleState::Idle),
+        }
+    }
+}
+
+#[atomic_enum]
+#[derive(Eq, PartialEq)]
+pub enum NeonIOHandleState {
+    Idle,
+
+    /// backend is filling in the request
+    Filling,
+
+    /// Backend has submitted the request to the communicator, but the
+    /// communicator process has not yet started processing it.
+    Submitted,
+
+    /// Communicator is processing the request
+    Processing,
+
+    /// Communicator has completed the request, and the 'result' field is now
+    /// valid, but the backend has not read the result yet.
+    Completed,
+}
+
+pub struct RequestProcessingGuard<'a>(&'a NeonIOHandle);
+
+unsafe impl<'a> Send for RequestProcessingGuard<'a> {}
+unsafe impl<'a> Sync for RequestProcessingGuard<'a> {}
+
+impl<'a> RequestProcessingGuard<'a> {
+    pub fn get_request(&self) -> &NeonIORequest {
+        unsafe { &*self.0.request.get() }
+    }
+
+    pub fn get_owner_procno(&self) -> i32 {
+        self.0.owner_procno.load(Ordering::Relaxed)
+    }
+
+    pub fn completed(self, result: NeonIOResult) {
+        unsafe {
+            *self.0.result.get() = result;
+        };
+
+        // Ok, we have completed the IO. Mark the request as completed. After that,
+        // we no longer have ownership of the slot, and must not modify it.
+        let old_state = self
+            .0
+            .state
+            .swap(NeonIOHandleState::Completed, Ordering::Release);
+        assert!(old_state == NeonIOHandleState::Processing);
+    }
+}
+
+impl NeonIOHandle {
+    pub fn fill_request(&self, request: &NeonIORequest, proc_number: i32) {
+        // Verify that the slot is in Idle state previously, and start filling it.
+        //
+        // XXX: This step isn't strictly necessary. Assuming the caller didn't screw up
+        // and try to use a slot that's already in use, we could fill the slot and
+        // switch it directly from Idle to Submitted state.
+        if let Err(s) = self.state.compare_exchange(
+            NeonIOHandleState::Idle,
+            NeonIOHandleState::Filling,
+            Ordering::Relaxed,
+            Ordering::Relaxed,
+        ) {
+            panic!("unexpected state in request slot: {s:?}");
+        }
+
+        // This fence synchronizes-with store/swap in `communicator_process_main_loop`.
+        fence(Ordering::Acquire);
+
+        self.owner_procno.store(proc_number, Ordering::Relaxed);
+        unsafe { *self.request.get() = *request }
+        self.state
+            .store(NeonIOHandleState::Submitted, Ordering::Release);
+    }
+
+    pub fn try_get_result(&self) -> Option<NeonIOResult> {
+        // FIXME: ordering?
+        let state = self.state.load(Ordering::Relaxed);
+        if state == NeonIOHandleState::Completed {
+            // This fence synchronizes-with store/swap in `communicator_process_main_loop`.
+            fence(Ordering::Acquire);
+            let result = unsafe { *self.result.get() };
+            self.state.store(NeonIOHandleState::Idle, Ordering::Relaxed);
+            Some(result)
+        } else {
+            None
+        }
+    }
+
+    pub fn start_processing_request<'a>(&'a self) -> Option<RequestProcessingGuard<'a>> {
+        // Read the IO request from the slot indicated in the wakeup
+        //
+        // XXX: using compare_exchange for this is not strictly necessary, as long as
+        // the communicator process has _some_ means of tracking which requests it's
+        // already processing. That could be a flag somewhere in communicator's private
+        // memory, for example.
+        if let Err(s) = self.state.compare_exchange(
+            NeonIOHandleState::Submitted,
+            NeonIOHandleState::Processing,
+            Ordering::Relaxed,
+            Ordering::Relaxed,
+        ) {
+            // FIXME surprising state. This is unexpected at the moment, but if we
+            // started to process requests more aggressively, without waiting for the
+            // read from the pipe, then this could happen
+            panic!("unexpected state in request slot: {s:?}");
+        }
+        fence(Ordering::Acquire);
+
+        Some(RequestProcessingGuard(self))
+    }
+}
--- a/pgxn/neon/communicator/src/backend_interface.rs
+++ b/pgxn/neon/communicator/src/backend_interface.rs
@@ -0,0 +1,222 @@
+//! This code runs in each backend process. That means that launching Rust threads, panicking
+//! etc. is forbidden!
+
+use std::os::fd::OwnedFd;
+
+use crate::backend_comms::NeonIOHandle;
+use crate::init::CommunicatorInitStruct;
+use crate::integrated_cache::{BackendCacheReadOp, IntegratedCacheReadAccess};
+use crate::neon_request::{CCachedGetPageVResult, COid};
+use crate::neon_request::{NeonIORequest, NeonIOResult};
+
+pub struct CommunicatorBackendStruct<'t> {
+    my_proc_number: i32,
+
+    next_neon_request_idx: u32,
+
+    my_start_idx: u32, // First request slot that belongs to this backend
+    my_end_idx: u32,   // end + 1 request slot that belongs to this backend
+
+    neon_request_slots: &'t [NeonIOHandle],
+
+    submission_pipe_write_fd: OwnedFd,
+
+    pending_cache_read_op: Option<BackendCacheReadOp<'t>>,
+
+    integrated_cache: &'t IntegratedCacheReadAccess<'t>,
+}
+
+#[unsafe(no_mangle)]
+pub extern "C" fn rcommunicator_backend_init(
+    cis: Box<CommunicatorInitStruct>,
+    my_proc_number: i32,
+) -> &'static mut CommunicatorBackendStruct<'static> {
+    let start_idx = my_proc_number as u32 * cis.num_neon_request_slots_per_backend;
+    let end_idx = start_idx + cis.num_neon_request_slots_per_backend;
+
+    let integrated_cache = Box::leak(Box::new(cis.integrated_cache_init_struct.backend_init()));
+
+    let bs: &'static mut CommunicatorBackendStruct =
+        Box::leak(Box::new(CommunicatorBackendStruct {
+            my_proc_number,
+            next_neon_request_idx: start_idx,
+            my_start_idx: start_idx,
+            my_end_idx: end_idx,
+            neon_request_slots: cis.neon_request_slots,
+
+            submission_pipe_write_fd: cis.submission_pipe_write_fd,
+            pending_cache_read_op: None,
+
+            integrated_cache,
+        }));
+    bs
+}
+
+/// Start a request. You can poll for its completion and get the result by
+/// calling bcomm_poll_dbsize_request_completion(). The communicator will wake
+/// us up by setting our process latch, so to wait for the completion, wait on
+/// the latch and call bcomm_poll_dbsize_request_completion() every time the
+/// latch is set.
+///
+/// Safety: The C caller must ensure that the references are valid.
+#[unsafe(no_mangle)]
+pub extern "C" fn bcomm_start_io_request(
+    bs: &'_ mut CommunicatorBackendStruct,
+    request: &NeonIORequest,
+    immediate_result_ptr: &mut NeonIOResult,
+) -> i32 {
+    assert!(bs.pending_cache_read_op.is_none());
+
+    // Check if the request can be satisfied from the cache first
+    if let NeonIORequest::RelSize(req) = request {
+        if let Some(nblocks) = bs.integrated_cache.get_rel_size(&req.reltag()) {
+            *immediate_result_ptr = NeonIOResult::RelSize(nblocks);
+            return -1;
+        }
+    }
+
+    // Create neon request and submit it
+    let request_idx = bs.start_neon_request(request);
+
+    // Tell the communicator about it
+    bs.submit_request(request_idx);
+
+    request_idx
+}
+
+#[unsafe(no_mangle)]
+pub extern "C" fn bcomm_start_get_page_v_request(
+    bs: &mut CommunicatorBackendStruct,
+    request: &NeonIORequest,
+    immediate_result_ptr: &mut CCachedGetPageVResult,
+) -> i32 {
+    let NeonIORequest::GetPageV(get_pagev_request) = request else {
+        panic!("invalid request passed to bcomm_start_get_page_v_request()");
+    };
+    assert!(matches!(request, NeonIORequest::GetPageV(_)));
+    assert!(bs.pending_cache_read_op.is_none());
+
+    // Check if the request can be satisfied from the cache first
+    let mut all_cached = true;
+    let mut read_op = bs.integrated_cache.start_read_op();
+    for i in 0..get_pagev_request.nblocks {
+        if let Some(cache_block) = read_op.get_page(
+            &get_pagev_request.reltag(),
+            get_pagev_request.block_number + i as u32,
+        ) {
+            immediate_result_ptr.cache_block_numbers[i as usize] = cache_block;
+        } else {
+            // not found in cache
+            all_cached = false;
+            break;
+        }
+    }
+    if all_cached {
+        bs.pending_cache_read_op = Some(read_op);
+        return -1;
+    }
+
+    // Create neon request and submit it
+    let request_idx = bs.start_neon_request(request);
+
+    // Tell the communicator about it
+    bs.submit_request(request_idx);
+
+    request_idx
+}
+
+/// Check if a request has completed. Returns:
+///
+/// -1 if the request is still being processed
+/// 0 on success
+#[unsafe(no_mangle)]
+pub extern "C" fn bcomm_poll_request_completion(
+    bs: &mut CommunicatorBackendStruct,
+    request_idx: u32,
+    result_p: &mut NeonIOResult,
+) -> i32 {
+    match bs.neon_request_slots[request_idx as usize].try_get_result() {
+        None => -1, // still processing
+        Some(result) => {
+            *result_p = result;
+            0
+        }
+    }
+}
+
+// LFC functions
+
+/// Finish a local file cache read
+///
+//
+#[unsafe(no_mangle)]
+pub extern "C" fn bcomm_finish_cache_read(bs: &mut CommunicatorBackendStruct) -> bool {
+    if let Some(op) = bs.pending_cache_read_op.take() {
+        op.finish()
+    } else {
+        panic!("bcomm_finish_cache_read() called with no cached read pending");
+    }
+}
+
+
+/// Check if the local file cache contians the given block
+#[unsafe(no_mangle)]
+pub extern "C" fn bcomm_cache_contains(
+    bs: &mut CommunicatorBackendStruct,
+    spc_oid: COid,
+    db_oid: COid,
+    rel_number: u32,
+    fork_number: u8,
+    block_number: u32,
+) -> bool {
+    bs.integrated_cache.cache_contains_page(
+        &pageserver_page_api::RelTag {
+            spcnode: spc_oid,
+            dbnode: db_oid,
+            relnode: rel_number,
+            forknum: fork_number,
+        },
+        block_number
+    )
+}
+
+
+impl<'t> CommunicatorBackendStruct<'t> {
+    /// Send a wakeup to the communicator process
+    fn submit_request(self: &CommunicatorBackendStruct<'t>, request_idx: i32) {
+        // wake up communicator by writing the idx to the submission pipe
+        //
+        // This can block, if the pipe is full. That should be very rare,
+        // because the communicator tries hard to drain the pipe to prevent
+        // that. Also, there's a natural upper bound on how many wakeups can be
+        // queued up: there is only a limited number of request slots for each
+        // backend.
+        //
+        // If it does block very briefly, that's not too serious.
+        let idxbuf = request_idx.to_ne_bytes();
+
+        let _res = nix::unistd::write(&self.submission_pipe_write_fd, &idxbuf);
+        // FIXME: check result, return any errors
+    }
+
+    /// Note: there's no guarantee on when the communicator might pick it up. You should ring
+    /// the doorbell. But it might pick it up immediately.
+    pub(crate) fn start_neon_request(&mut self, request: &NeonIORequest) -> i32 {
+        let my_proc_number = self.my_proc_number;
+
+        // Grab next free slot
+        // FIXME: any guarantee that there will be any?
+        let idx = self.next_neon_request_idx;
+
+        let next_idx = idx + 1;
+        self.next_neon_request_idx = if next_idx == self.my_end_idx {
+            self.my_start_idx
+        } else {
+            next_idx
+        };
+
+        self.neon_request_slots[idx as usize].fill_request(request, my_proc_number);
+
+        idx as i32
+    }
+}
--- a/Show More
+++ b/Show More