Documentation and tweaks

Add stream pool
Add initial client pool
2026-01-29 08:10:38 +00:00 · 2025-07-01 17:54:41 +02:00 · 2025-07-01 17:54:41 +02:00 · 2025-07-01 17:54:41 +02:00 · 2025-07-01 17:54:41 +02:00 · 2025-07-01 17:54:41 +02:00
76 changed files with 14162 additions and 1007 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,7 @@ neon.iml
 /.neon
 /integration_tests/.neon
 compaction-suite-results.*
+pgxn/neon/communicator/communicator_bindings.h

 # Coverage
 *.profraw
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -253,6 +253,17 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"

+[[package]]
+name = "atomic_enum"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99e1aca718ea7b89985790c94aad72d77533063fe00bc497bb79a7c2dae6a661"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.100",
+]
+
 [[package]]
 name = "autocfg"
 version = "1.1.0"
@@ -687,13 +698,40 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "axum"
+version = "0.7.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
+dependencies = [
+ "async-trait",
+ "axum-core 0.4.5",
+ "bytes",
+ "futures-util",
+ "http 1.1.0",
+ "http-body 1.0.0",
+ "http-body-util",
+ "itoa",
+ "matchit 0.7.3",
+ "memchr",
+ "mime",
+ "percent-encoding",
+ "pin-project-lite",
+ "rustversion",
+ "serde",
+ "sync_wrapper 1.0.1",
+ "tower 0.5.2",
+ "tower-layer",
+ "tower-service",
+]
+
 [[package]]
 name = "axum"
 version = "0.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6d6fd624c75e18b3b4c6b9caf42b1afe24437daaee904069137d8bab077be8b8"
 dependencies = [
- "axum-core",
+ "axum-core 0.5.0",
 "base64 0.22.1",
 "bytes",
 "form_urlencoded",
@@ -701,10 +739,10 @@ dependencies = [
 "http 1.1.0",
 "http-body 1.0.0",
 "http-body-util",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "itoa",
- "matchit",
+ "matchit 0.8.4",
 "memchr",
 "mime",
 "percent-encoding",
@@ -724,6 +762,26 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "axum-core"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199"
+dependencies = [
+ "async-trait",
+ "bytes",
+ "futures-util",
+ "http 1.1.0",
+ "http-body 1.0.0",
+ "http-body-util",
+ "mime",
+ "pin-project-lite",
+ "rustversion",
+ "sync_wrapper 1.0.1",
+ "tower-layer",
+ "tower-service",
+]
+
 [[package]]
 name = "axum-core"
 version = "0.5.0"
@@ -750,8 +808,8 @@ version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b"
 dependencies = [
- "axum",
- "axum-core",
+ "axum 0.8.1",
+ "axum-core 0.5.0",
 "bytes",
 "form_urlencoded",
 "futures-util",
@@ -1083,6 +1141,25 @@ version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"

+[[package]]
+name = "cbindgen"
+version = "0.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eadd868a2ce9ca38de7eeafdcec9c7065ef89b42b32f0839278d55f35c54d1ff"
+dependencies = [
+ "clap",
+ "heck 0.4.1",
+ "indexmap 2.9.0",
+ "log",
+ "proc-macro2",
+ "quote",
+ "serde",
+ "serde_json",
+ "syn 2.0.100",
+ "tempfile",
+ "toml",
+]
+
 [[package]]
 name = "cc"
 version = "1.2.16"
@@ -1209,7 +1286,7 @@ version = "4.5.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
 dependencies = [
- "heck",
+ "heck 0.5.0",
 "proc-macro2",
 "quote",
 "syn 2.0.100",
@@ -1267,6 +1344,34 @@ dependencies = [
 "unicode-width",
 ]

+[[package]]
+name = "communicator"
+version = "0.1.0"
+dependencies = [
+ "atomic_enum",
+ "axum 0.8.1",
+ "bytes",
+ "cbindgen",
+ "clashmap",
+ "http 1.1.0",
+ "libc",
+ "metrics",
+ "neon-shmem",
+ "nix 0.30.1",
+ "pageserver_client_grpc",
+ "pageserver_page_api",
+ "prometheus",
+ "prost 0.13.5",
+ "thiserror 1.0.69",
+ "tokio",
+ "tokio-pipe",
+ "tonic 0.12.3",
+ "tracing",
+ "tracing-subscriber",
+ "uring-common",
+ "utils",
+]
+
 [[package]]
 name = "compute_api"
 version = "0.1.0"
@@ -1293,7 +1398,7 @@ dependencies = [
 "aws-sdk-kms",
 "aws-sdk-s3",
 "aws-smithy-types",
- "axum",
+ "axum 0.8.1",
 "axum-extra",
 "base64 0.22.1",
 "bytes",
@@ -1596,9 +1701,9 @@ dependencies = [

 [[package]]
 name = "crossbeam-utils"
-version = "0.8.19"
+version = "0.8.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"

 [[package]]
 name = "crossterm"
@@ -1938,7 +2043,7 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc"
 dependencies = [
 "darling",
 "either",
- "heck",
+ "heck 0.5.0",
 "proc-macro2",
 "quote",
 "syn 2.0.100",
@@ -2052,7 +2157,7 @@ name = "endpoint_storage"
 version = "0.0.1"
 dependencies = [
 "anyhow",
- "axum",
+ "axum 0.8.1",
 "axum-extra",
 "camino",
 "camino-tempfile",
@@ -2333,7 +2438,7 @@ dependencies = [
 "futures-core",
 "futures-sink",
 "http-body-util",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "pin-project",
 "rand 0.8.5",
@@ -2503,6 +2608,18 @@ dependencies = [
 "wasm-bindgen",
 ]

+[[package]]
+name = "getrandom"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi",
+ "wasi 0.14.2+wasi-0.2.4",
+]
+
 [[package]]
 name = "gettid"
 version = "0.1.3"
@@ -2715,6 +2832,12 @@ dependencies = [
 "http 1.1.0",
 ]

+[[package]]
+name = "heck"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
+
 [[package]]
 name = "heck"
 version = "0.5.0"
@@ -2886,9 +3009,9 @@ dependencies = [

 [[package]]
 name = "httparse"
-version = "1.8.0"
+version = "1.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904"
+checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"

 [[package]]
 name = "httpdate"
@@ -2938,9 +3061,9 @@ dependencies = [

 [[package]]
 name = "hyper"
-version = "1.4.1"
+version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05"
+checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80"
 dependencies = [
 "bytes",
 "futures-channel",
@@ -2980,7 +3103,7 @@ checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c"
 dependencies = [
 "futures-util",
 "http 1.1.0",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "rustls 0.22.4",
 "rustls-pki-types",
@@ -2995,7 +3118,7 @@ version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3203a961e5c83b6f5498933e78b6b263e208c197b63e9c6c53cc82ffd3f63793"
 dependencies = [
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "pin-project-lite",
 "tokio",
@@ -3004,20 +3127,20 @@ dependencies = [

 [[package]]
 name = "hyper-util"
-version = "0.1.7"
+version = "0.1.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9"
+checksum = "cf9f1e950e0d9d1d3c47184416723cf29c0d1f93bd8cccf37e4beb6b44f31710"
 dependencies = [
 "bytes",
 "futures-channel",
 "futures-util",
 "http 1.1.0",
 "http-body 1.0.0",
- "hyper 1.4.1",
+ "hyper 1.6.0",
+ "libc",
 "pin-project-lite",
 "socket2",
 "tokio",
- "tower 0.4.13",
 "tower-service",
 "tracing",
 ]
@@ -3606,6 +3729,12 @@ dependencies = [
 "regex-automata 0.1.10",
 ]

+[[package]]
+name = "matchit"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
+
 [[package]]
 name = "matchit"
 version = "0.8.4"
@@ -3651,7 +3780,7 @@ version = "0.0.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
 dependencies = [
- "heck",
+ "heck 0.5.0",
 "proc-macro2",
 "quote",
 "syn 2.0.100",
@@ -3713,7 +3842,7 @@ dependencies = [
 "procfs",
 "prometheus",
 "rand 0.8.5",
- "rand_distr",
+ "rand_distr 0.4.3",
 "twox-hash",
 ]

@@ -3802,11 +3931,25 @@ name = "neon-shmem"
 version = "0.1.0"
 dependencies = [
 "nix 0.30.1",
+ "rand 0.9.1",
+ "rand_distr 0.5.1",
+ "spin",
 "tempfile",
 "thiserror 1.0.69",
 "workspace_hack",
 ]

+[[package]]
+name = "neonart"
+version = "0.1.0"
+dependencies = [
+ "crossbeam-utils",
+ "rand 0.9.1",
+ "rand_distr 0.5.1",
+ "spin",
+ "tracing",
+]
+
 [[package]]
 name = "never-say-never"
 version = "6.6.666"
@@ -4240,15 +4383,19 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-trait",
+ "axum 0.8.1",
 "bytes",
 "camino",
 "clap",
 "futures",
 "hdrhistogram",
+ "http 1.1.0",
 "humantime",
 "humantime-serde",
+ "metrics",
 "pageserver_api",
 "pageserver_client",
+ "pageserver_client_grpc",
 "pageserver_page_api",
 "rand 0.8.5",
 "reqwest",
@@ -4332,6 +4479,7 @@ dependencies = [
 "pageserver_client",
 "pageserver_compaction",
 "pageserver_page_api",
+ "peekable",
 "pem",
 "pin-project-lite",
 "postgres-protocol",
@@ -4345,6 +4493,7 @@ dependencies = [
 "pprof",
 "pq_proto",
 "procfs",
+ "prost 0.13.5",
 "rand 0.8.5",
 "range-set-blaze",
 "regex",
@@ -4448,6 +4597,36 @@ dependencies = [
 "workspace_hack",
 ]

+[[package]]
+name = "pageserver_client_grpc"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "bytes",
+ "chrono",
+ "dashmap 5.5.0",
+ "futures",
+ "http 1.1.0",
+ "hyper 1.6.0",
+ "hyper-util",
+ "metrics",
+ "pageserver_api",
+ "pageserver_page_api",
+ "priority-queue",
+ "rand 0.8.5",
+ "scopeguard",
+ "thiserror 1.0.69",
+ "tokio",
+ "tokio-stream",
+ "tokio-util",
+ "tonic 0.13.1",
+ "tower 0.4.13",
+ "tracing",
+ "utils",
+ "uuid",
+]
+
 [[package]]
 name = "pageserver_compaction"
 version = "0.1.0"
@@ -4619,6 +4798,15 @@ dependencies = [
 "sha2",
 ]

+[[package]]
+name = "peekable"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "225f9651e475709164f871dc2f5724956be59cb9edb055372ffeeab01ec2d20b"
+dependencies = [
+ "smallvec",
+]
+
 [[package]]
 name = "pem"
 version = "3.0.3"
@@ -5052,6 +5240,17 @@ dependencies = [
 "elliptic-curve 0.13.8",
 ]

+[[package]]
+name = "priority-queue"
+version = "2.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef08705fa1589a1a59aa924ad77d14722cb0cd97b67dd5004ed5f4a4873fce8d"
+dependencies = [
+ "autocfg",
+ "equivalent",
+ "indexmap 2.9.0",
+]
+
 [[package]]
 name = "proc-macro2"
 version = "1.0.94"
@@ -5130,7 +5329,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
 dependencies = [
 "bytes",
- "heck",
+ "heck 0.5.0",
 "itertools 0.12.1",
 "log",
 "multimap",
@@ -5151,7 +5350,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
 dependencies = [
 "bytes",
- "heck",
+ "heck 0.5.0",
 "itertools 0.12.1",
 "log",
 "multimap",
@@ -5252,7 +5451,7 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "hyper 0.14.30",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "indexmap 2.9.0",
 "ipnet",
@@ -5276,7 +5475,7 @@ dependencies = [
 "postgres_backend",
 "pq_proto",
 "rand 0.8.5",
- "rand_distr",
+ "rand_distr 0.4.3",
 "rcgen",
 "redis",
 "regex",
@@ -5380,6 +5579,12 @@ dependencies = [
 "proc-macro2",
 ]

+[[package]]
+name = "r-efi"
+version = "5.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
+
 [[package]]
 name = "rand"
 version = "0.7.3"
@@ -5404,6 +5609,16 @@ dependencies = [
 "rand_core 0.6.4",
 ]

+[[package]]
+name = "rand"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
+dependencies = [
+ "rand_chacha 0.9.0",
+ "rand_core 0.9.3",
+]
+
 [[package]]
 name = "rand_chacha"
 version = "0.2.2"
@@ -5424,6 +5639,16 @@ dependencies = [
 "rand_core 0.6.4",
 ]

+[[package]]
+name = "rand_chacha"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.9.3",
+]
+
 [[package]]
 name = "rand_core"
 version = "0.5.1"
@@ -5442,6 +5667,15 @@ dependencies = [
 "getrandom 0.2.11",
 ]

+[[package]]
+name = "rand_core"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
+dependencies = [
+ "getrandom 0.3.2",
+]
+
 [[package]]
 name = "rand_distr"
 version = "0.4.3"
@@ -5452,6 +5686,16 @@ dependencies = [
 "rand 0.8.5",
 ]

+[[package]]
+name = "rand_distr"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
+dependencies = [
+ "num-traits",
+ "rand 0.9.1",
+]
+
 [[package]]
 name = "rand_hc"
 version = "0.2.0"
@@ -5648,7 +5892,7 @@ dependencies = [
 "http-body-util",
 "http-types",
 "humantime-serde",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "itertools 0.10.5",
 "metrics",
 "once_cell",
@@ -5688,7 +5932,7 @@ dependencies = [
 "http 1.1.0",
 "http-body 1.0.0",
 "http-body-util",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-rustls 0.26.0",
 "hyper-util",
 "ipnet",
@@ -5745,7 +5989,7 @@ dependencies = [
 "futures",
 "getrandom 0.2.11",
 "http 1.1.0",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "parking_lot 0.11.2",
 "reqwest",
 "reqwest-middleware",
@@ -5766,7 +6010,7 @@ dependencies = [
 "async-trait",
 "getrandom 0.2.11",
 "http 1.1.0",
- "matchit",
+ "matchit 0.8.4",
 "opentelemetry",
 "reqwest",
 "reqwest-middleware",
@@ -6714,12 +6958,12 @@ dependencies = [

 [[package]]
 name = "socket2"
-version = "0.5.5"
+version = "0.5.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9"
+checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef"
 dependencies = [
 "libc",
- "windows-sys 0.48.0",
+ "windows-sys 0.52.0",
 ]

 [[package]]
@@ -6727,6 +6971,9 @@ name = "spin"
 version = "0.9.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
+dependencies = [
+ "lock_api",
+]

 [[package]]
 name = "spinning_top"
@@ -6785,7 +7032,7 @@ dependencies = [
 "http-body-util",
 "http-utils",
 "humantime",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "metrics",
 "once_cell",
@@ -6969,7 +7216,7 @@ version = "0.26.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
 dependencies = [
- "heck",
+ "heck 0.5.0",
 "proc-macro2",
 "quote",
 "rustversion",
@@ -7394,6 +7641,16 @@ dependencies = [
 "syn 2.0.100",
 ]

+[[package]]
+name = "tokio-pipe"
+version = "0.2.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f213a84bffbd61b8fa0ba8a044b4bbe35d471d0b518867181e82bd5c15542784"
+dependencies = [
+ "libc",
+ "tokio",
+]
+
 [[package]]
 name = "tokio-postgres"
 version = "0.7.10"
@@ -7588,16 +7845,25 @@ version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52"
 dependencies = [
+ "async-stream",
 "async-trait",
+ "axum 0.7.9",
 "base64 0.22.1",
 "bytes",
+ "h2 0.4.4",
 "http 1.1.0",
 "http-body 1.0.0",
 "http-body-util",
+ "hyper 1.6.0",
+ "hyper-timeout",
+ "hyper-util",
 "percent-encoding",
 "pin-project",
 "prost 0.13.5",
+ "socket2",
+ "tokio",
 "tokio-stream",
+ "tower 0.4.13",
 "tower-layer",
 "tower-service",
 "tracing",
@@ -7610,7 +7876,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9"
 dependencies = [
 "async-trait",
- "axum",
+ "axum 0.8.1",
 "base64 0.22.1",
 "bytes",
 "flate2",
@@ -7618,7 +7884,7 @@ dependencies = [
 "http 1.1.0",
 "http-body 1.0.0",
 "http-body-util",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-timeout",
 "hyper-util",
 "percent-encoding",
@@ -7671,11 +7937,16 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
 dependencies = [
 "futures-core",
 "futures-util",
+ "indexmap 1.9.3",
 "pin-project",
 "pin-project-lite",
+ "rand 0.8.5",
+ "slab",
 "tokio",
+ "tokio-util",
 "tower-layer",
 "tower-service",
+ "tracing",
 ]

 [[package]]
@@ -8159,7 +8430,7 @@ name = "vm_monitor"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "axum",
+ "axum 0.8.1",
 "cgroups-rs",
 "clap",
 "futures",
@@ -8271,6 +8542,15 @@ version = "0.11.0+wasi-snapshot-preview1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

+[[package]]
+name = "wasi"
+version = "0.14.2+wasi-0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
+dependencies = [
+ "wit-bindgen-rt",
+]
+
 [[package]]
 name = "wasite"
 version = "0.1.0"
@@ -8628,6 +8908,15 @@ dependencies = [
 "windows-sys 0.48.0",
 ]

+[[package]]
+name = "wit-bindgen-rt"
+version = "0.39.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
+dependencies = [
+ "bitflags 2.8.0",
+]
+
 [[package]]
 name = "workspace_hack"
 version = "0.1.0"
@@ -8635,8 +8924,8 @@ dependencies = [
 "ahash",
 "anstream",
 "anyhow",
- "axum",
- "axum-core",
+ "axum 0.8.1",
+ "axum-core 0.5.0",
 "base64 0.21.7",
 "base64ct",
 "bytes",
@@ -8668,7 +8957,7 @@ dependencies = [
 "hex",
 "hmac",
 "hyper 0.14.30",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "indexmap 2.9.0",
 "itertools 0.12.1",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,6 +8,7 @@ members = [
    "pageserver/compaction",
    "pageserver/ctl",
    "pageserver/client",
+    "pageserver/client_grpc",
    "pageserver/pagebench",
    "pageserver/page_api",
    "proxy",
@@ -34,6 +35,7 @@ members = [
    "libs/pq_proto",
    "libs/tenant_size_model",
    "libs/metrics",
+    "libs/neonart",
    "libs/postgres_connection",
    "libs/remote_storage",
    "libs/tracing-utils",
@@ -46,6 +48,7 @@ members = [
    "libs/proxy/postgres-types2",
    "libs/proxy/tokio-postgres2",
    "endpoint_storage",
+    "pgxn/neon/communicator",
 ]

 [workspace.package]
@@ -89,6 +92,7 @@ clap = { version = "4.0", features = ["derive", "env"] }
 clashmap = { version = "1.0", features = ["raw-api"] }
 comfy-table = "7.1"
 const_format = "0.2"
+crossbeam-utils = "0.8.21"
 crc32c = "0.6"
 diatomic-waker = { version = "0.2.3" }
 either = "1.8"
@@ -147,6 +151,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
 parquet_derive = "53"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pem = "3.0.3"
+peekable = "0.3.0"
 pin-project-lite = "0.2"
 pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
 procfs = "0.16"
@@ -183,6 +188,7 @@ smallvec = "1.11"
 smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
 spki = "0.7.3"
+spin = "0.9.8"
 strum = "0.26"
 strum_macros = "0.26"
 "subtle"  = "2.5.0"
@@ -194,7 +200,6 @@ thiserror = "1.0"
 tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
 tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
 tokio = { version = "1.43.1", features = ["macros"] }
-tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.12.0"
 tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
@@ -236,6 +241,9 @@ x509-cert = { version = "0.2.5" }
 env_logger = "0.11"
 log = "0.4"

+tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
+uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
+
 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
 postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
 postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
@@ -255,9 +263,12 @@ desim = { version = "0.1", path = "./libs/desim" }
 endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" }
 http-utils = { version = "0.1", path = "./libs/http-utils/" }
 metrics = { version = "0.1", path = "./libs/metrics/" }
+neonart = { version = "0.1", path = "./libs/neonart/" }
+neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
 pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
+pageserver_client_grpc = { path = "./pageserver/client_grpc" }
 pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
 pageserver_page_api = { path = "./pageserver/page_api" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
@@ -284,6 +295,7 @@ walproposer = { version = "0.1", path = "./libs/walproposer/" }
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }

 ## Build dependencies
+cbindgen = "0.28.0"
 criterion = "0.5.1"
 rcgen = "0.13"
 rstest = "0.18"
--- a/Makefile
+++ b/Makefile
@@ -30,11 +30,18 @@ ifeq ($(BUILD_TYPE),release)
 	PG_CFLAGS += -O2 -g3 $(CFLAGS)
 	PG_LDFLAGS = $(LDFLAGS)
 	CARGO_PROFILE ?= --profile=release
+	# NEON_CARGO_ARTIFACT_TARGET_DIR is the directory where `cargo build` places
+	# the final build artifacts. There is unfortunately no easy way of changing
+	# it to a fully predictable path, nor to extract the path with a simple
+	# command. See https://github.com/rust-lang/cargo/issues/9661 and
+	# https://github.com/rust-lang/cargo/issues/6790.
+	NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release
 else ifeq ($(BUILD_TYPE),debug)
 	PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
 	PG_CFLAGS += -O0 -g3 $(CFLAGS)
 	PG_LDFLAGS = $(LDFLAGS)
 	CARGO_PROFILE ?= --profile=dev
+	NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug
 else
 	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
 endif
@@ -118,7 +125,10 @@ cargo-target-dir:
 neon-pg-ext-%: postgres-install-%
 	+@echo "Compiling neon-specific Postgres extensions for $*"
 	mkdir -p $(BUILD_DIR)/pgxn-$*
-	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
+	$(MAKE) PG_CONFIG="$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config" COPT='$(COPT)' \
+		NEON_CARGO_ARTIFACT_TARGET_DIR="$(NEON_CARGO_ARTIFACT_TARGET_DIR)" \
+		CARGO_BUILD_FLAGS="$(CARGO_BUILD_FLAGS)" \
+		CARGO_PROFILE="$(CARGO_PROFILE)" \
 		-C $(BUILD_DIR)/pgxn-$*\
 		-f $(ROOT_PROJECT_DIR)/pgxn/Makefile  install

--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,4 +1,4 @@
-use anyhow::{Context, Result};
+use anyhow::{Context, Result, anyhow};
 use chrono::{DateTime, Utc};
 use compute_api::privilege::Privilege;
 use compute_api::responses::{
@@ -6,7 +6,8 @@ use compute_api::responses::{
    LfcPrewarmState, TlsConfig,
 };
 use compute_api::spec::{
-    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,
+    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverConnectionInfo,
+    PageserverShardConnectionInfo, PgIdent,
 };
 use futures::StreamExt;
 use futures::future::join_all;
@@ -216,7 +217,7 @@ pub struct ParsedSpec {
    pub spec: ComputeSpec,
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
-    pub pageserver_connstr: String,
+    pub pageserver_conninfo: PageserverConnectionInfo,
    pub safekeeper_connstrings: Vec<String>,
    pub storage_auth_token: Option<String>,
    /// k8s dns name and port
@@ -263,6 +264,27 @@ impl ParsedSpec {
    }
 }

+/// Maps each comma-separated connstring in the GUC to a libpq-only shard, keyed by position.
+fn extract_pageserver_conninfo_from_guc(
+    pageserver_connstring_guc: &str,
+) -> PageserverConnectionInfo {
+    let shards = pageserver_connstring_guc
+        .split(',')
+        .enumerate()
+        .map(|(shard_no, libpq_connstr)| {
+            let shard = PageserverShardConnectionInfo {
+                libpq_url: Some(libpq_connstr.to_string()),
+                grpc_url: None,
+            };
+            (shard_no as u32, shard)
+        })
+        .collect();
+    PageserverConnectionInfo {
+        shards,
+        prefer_grpc: false,
+    }
+}
+
 impl TryFrom<ComputeSpec> for ParsedSpec {
    type Error = String;
    fn try_from(spec: ComputeSpec) -> Result<Self, String> {
@@ -272,11 +294,17 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
        // For backwards-compatibility, the top-level fields in the spec file
        // may be empty. In that case, we need to dig them from the GUCs in the
        // cluster.settings field.
-        let pageserver_connstr = spec
-            .pageserver_connstring
-            .clone()
-            .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
-            .ok_or("pageserver connstr should be provided")?;
+        let pageserver_conninfo = match &spec.pageserver_connection_info {
+            Some(x) => x.clone(),
+            None => {
+                if let Some(guc) = spec.cluster.settings.find("neon.pageserver_connstring") {
+                    extract_pageserver_conninfo_from_guc(&guc)
+                } else {
+                    return Err("pageserver connstr should be provided".to_string());
+                }
+            }
+        };
+
        let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
            if matches!(spec.mode, ComputeMode::Primary) {
                spec.cluster
@@ -326,7 +354,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {

        let res = ParsedSpec {
            spec,
-            pageserver_connstr,
+            pageserver_conninfo,
            safekeeper_connstrings,
            storage_auth_token,
            tenant_id,
@@ -416,7 +444,7 @@ impl ComputeNode {

        let mut new_state = ComputeState::new();
        if let Some(spec) = config.spec {
-            let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
+            let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow!(msg))?;
            new_state.pspec = Some(pspec);
        }

@@ -1003,12 +1031,11 @@ impl ComputeNode {
    fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let spec = compute_state.pspec.as_ref().expect("spec must be set");

-        let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
        let started = Instant::now();
-
-        let (connected, size) = match PageserverProtocol::from_connstring(shard0_connstr)? {
-            PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?,
-            PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
+        let (connected, size) = if spec.pageserver_conninfo.prefer_grpc {
+            self.try_get_basebackup_grpc(spec, lsn)?
+        } else {
+            self.try_get_basebackup_libpq(spec, lsn)?
        };

        let mut state = self.state.lock().unwrap();
@@ -1023,20 +1050,21 @@ impl ComputeNode {
    /// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
    /// the connection was established, and the (compressed) size of the basebackup.
    fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
-        let shard0_connstr = spec
-            .pageserver_connstr
-            .split(',')
-            .next()
-            .unwrap()
-            .to_string();
-        let shard_index = match spec.pageserver_connstr.split(',').count() as u8 {
+        let shard0 = spec
+            .pageserver_conninfo
+            .shards
+            .get(&0)
+            .expect("shard 0 connection info missing");
+        let shard0_url = shard0.grpc_url.clone().expect("no grpc_url for shard 0");
+
+        let shard_index = match spec.pageserver_conninfo.shards.len() as u8 {
            0 | 1 => ShardIndex::unsharded(),
            count => ShardIndex::new(ShardNumber(0), ShardCount(count)),
        };

        let (reader, connected) = tokio::runtime::Handle::current().block_on(async move {
-            let mut client = page_api::Client::new(
-                shard0_connstr,
+            let mut client = page_api::Client::connect(
+                shard0_url,
                spec.tenant_id,
                spec.timeline_id,
                shard_index,
@@ -1071,8 +1099,13 @@ impl ComputeNode {
    /// Fetches a basebackup via libpq. The connstring must use postgresql://. Returns the timestamp
    /// when the connection was established, and the (compressed) size of the basebackup.
    fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
-        let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
-        let mut config = postgres::Config::from_str(shard0_connstr)?;
+        let shard0 = spec
+            .pageserver_conninfo
+            .shards
+            .get(&0)
+            .expect("shard 0 connection info missing");
+        let shard0_connstr = shard0.libpq_url.clone().expect("no libpq_url for shard 0");
+        let mut config = postgres::Config::from_str(&shard0_connstr)?;

        // Use the storage auth token from the config file, if given.
        // Note: this overrides any password set in the connection string.
@@ -1158,10 +1191,7 @@ impl ComputeNode {
                    return result;
                }
                Err(ref e) if attempts < max_attempts => {
-                    warn!(
-                        "Failed to get basebackup: {} (attempt {}/{})",
-                        e, attempts, max_attempts
-                    );
+                    warn!("Failed to get basebackup: {e:?} (attempt {attempts}/{max_attempts})");
                    std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
                    retry_period_ms *= 1.5;
                }
@@ -1370,16 +1400,8 @@ impl ComputeNode {
            }
        };

-        info!(
-            "getting basebackup@{} from pageserver {}",
-            lsn, &pspec.pageserver_connstr
-        );
-        self.get_basebackup(compute_state, lsn).with_context(|| {
-            format!(
-                "failed to get basebackup@{} from pageserver {}",
-                lsn, &pspec.pageserver_connstr
-            )
-        })?;
+        self.get_basebackup(compute_state, lsn)
+            .with_context(|| format!("failed to get basebackup@{lsn}"))?;

        // Update pg_hba.conf received with basebackup.
        update_pg_hba(pgdata_path)?;
@@ -2039,7 +2061,7 @@ LIMIT 100",
            self.params
                .remote_ext_base_url
                .as_ref()
-                .ok_or(DownloadError::BadInput(anyhow::anyhow!(
+                .ok_or(DownloadError::BadInput(anyhow!(
                    "Remote extensions storage is not configured",
                )))?;

@@ -2235,7 +2257,7 @@ LIMIT 100",
        let remote_extensions = spec
            .remote_extensions
            .as_ref()
-            .ok_or(anyhow::anyhow!("Remote extensions are not configured"))?;
+            .ok_or(anyhow!("Remote extensions are not configured"))?;

        info!("parse shared_preload_libraries from spec.cluster.settings");
        let mut libs_vec = Vec::new();
@@ -2314,22 +2336,22 @@ LIMIT 100",
    /// The operation will time out after a specified duration.
    pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
        let state = self.state.lock().unwrap();
-        let old_pageserver_connstr = state
+        let old_pageserver_conninfo = state
            .pspec
            .as_ref()
            .expect("spec must be set")
-            .pageserver_connstr
+            .pageserver_conninfo
            .clone();
        let mut unchanged = true;
        let _ = self
            .state_changed
            .wait_timeout_while(state, duration, |s| {
-                let pageserver_connstr = &s
+                let pageserver_conninfo = &s
                    .pspec
                    .as_ref()
                    .expect("spec must be set")
-                    .pageserver_connstr;
-                unchanged = pageserver_connstr == &old_pageserver_connstr;
+                    .pageserver_conninfo;
+                unchanged = pageserver_conninfo == &old_pageserver_conninfo;
                unchanged
            })
            .unwrap();
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -56,9 +56,51 @@ pub fn write_postgres_conf(

    // Add options for connecting to storage
    writeln!(file, "# Neon storage settings")?;
-    if let Some(s) = &spec.pageserver_connstring {
-        writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
+
+    if let Some(conninfo) = &spec.pageserver_connection_info {
+        let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
+        let mut grpc_urls: Option<Vec<String>> = Some(Vec::new());
+
+        for shardno in 0..conninfo.shards.len() {
+            let info = conninfo.shards.get(&(shardno as u32)).ok_or_else(|| {
+                anyhow::anyhow!("shard {shardno} missing from pageserver_connection_info shard map")
+            })?;
+
+            if let Some(url) = &info.libpq_url {
+                if let Some(ref mut urls) = libpq_urls {
+                    urls.push(url.clone());
+                }
+            } else {
+                libpq_urls = None
+            }
+            if let Some(url) = &info.grpc_url {
+                if let Some(ref mut urls) = grpc_urls {
+                    urls.push(url.clone());
+                }
+            } else {
+                grpc_urls = None
+            }
+        }
+        if let Some(libpq_urls) = libpq_urls {
+            writeln!(
+                file,
+                "neon.pageserver_connstring={}",
+                escape_conf_value(&libpq_urls.join(","))
+            )?;
+        } else {
+            writeln!(file, "# no neon.pageserver_connstring")?;
+        }
+        if let Some(grpc_urls) = grpc_urls {
+            writeln!(
+                file,
+                "neon.pageserver_grpc_urls={}",
+                escape_conf_value(&grpc_urls.join(","))
+            )?;
+        } else {
+            writeln!(file, "# no neon.pageserver_grpc_urls")?;
+        }
    }
+
    if let Some(stripe_size) = spec.shard_stripe_size {
        writeln!(file, "neon.stripe_size={stripe_size}")?;
    }
--- a/compute_tools/src/lsn_lease.rs
+++ b/compute_tools/src/lsn_lease.rs
@@ -4,8 +4,7 @@ use std::thread;
 use std::time::{Duration, SystemTime};

 use anyhow::{Result, bail};
-use compute_api::spec::{ComputeMode, PageserverProtocol};
-use itertools::Itertools as _;
+use compute_api::spec::{ComputeMode, PageserverConnectionInfo};
 use pageserver_page_api as page_api;
 use postgres::{NoTls, SimpleQueryMessage};
 use tracing::{info, warn};
@@ -78,17 +77,16 @@ fn acquire_lsn_lease_with_retry(

    loop {
        // Note: List of pageservers is dynamic, need to re-read configs before each attempt.
-        let (connstrings, auth) = {
+        let (conninfo, auth) = {
            let state = compute.state.lock().unwrap();
            let spec = state.pspec.as_ref().expect("spec must be set");
            (
-                spec.pageserver_connstr.clone(),
+                spec.pageserver_conninfo.clone(),
                spec.storage_auth_token.clone(),
            )
        };

-        let result =
-            try_acquire_lsn_lease(&connstrings, auth.as_deref(), tenant_id, timeline_id, lsn);
+        let result = try_acquire_lsn_lease(conninfo, auth.as_deref(), tenant_id, timeline_id, lsn);
        match result {
            Ok(Some(res)) => {
                return Ok(res);
@@ -112,17 +110,16 @@ fn acquire_lsn_lease_with_retry(

 /// Tries to acquire LSN leases on all Pageserver shards.
 fn try_acquire_lsn_lease(
-    connstrings: &str,
+    conninfo: PageserverConnectionInfo,
    auth: Option<&str>,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    lsn: Lsn,
 ) -> Result<Option<SystemTime>> {
-    let connstrings = connstrings.split(',').collect_vec();
-    let shard_count = connstrings.len();
+    let shard_count = conninfo.shards.len();
    let mut leases = Vec::new();

-    for (shard_number, &connstring) in connstrings.iter().enumerate() {
+    for (shard_number, shard) in conninfo.shards.into_iter() {
        let tenant_shard_id = match shard_count {
            0 | 1 => TenantShardId::unsharded(tenant_id),
            shard_count => TenantShardId {
@@ -132,13 +129,22 @@ fn try_acquire_lsn_lease(
            },
        };

-        let lease = match PageserverProtocol::from_connstring(connstring)? {
-            PageserverProtocol::Libpq => {
-                acquire_lsn_lease_libpq(connstring, auth, tenant_shard_id, timeline_id, lsn)?
-            }
-            PageserverProtocol::Grpc => {
-                acquire_lsn_lease_grpc(connstring, auth, tenant_shard_id, timeline_id, lsn)?
-            }
+        let lease = if conninfo.prefer_grpc {
+            acquire_lsn_lease_grpc(
+                &shard.grpc_url.unwrap(),
+                auth,
+                tenant_shard_id,
+                timeline_id,
+                lsn,
+            )?
+        } else {
+            acquire_lsn_lease_libpq(
+                &shard.libpq_url.unwrap(),
+                auth,
+                tenant_shard_id,
+                timeline_id,
+                lsn,
+            )?
        };
        leases.push(lease);
    }
@@ -192,7 +198,7 @@ fn acquire_lsn_lease_grpc(
    lsn: Lsn,
 ) -> Result<Option<SystemTime>> {
    tokio::runtime::Handle::current().block_on(async move {
-        let mut client = page_api::Client::new(
+        let mut client = page_api::Client::connect(
            connstring.to_string(),
            tenant_shard_id.tenant_id,
            timeline_id,
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -16,7 +16,7 @@ use std::time::Duration;
 use anyhow::{Context, Result, anyhow, bail};
 use clap::Parser;
 use compute_api::requests::ComputeClaimsScope;
-use compute_api::spec::{ComputeMode, PageserverProtocol};
+use compute_api::spec::{ComputeMode, PageserverConnectionInfo, PageserverShardConnectionInfo};
 use control_plane::broker::StorageBroker;
 use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode};
 use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
@@ -1504,29 +1504,35 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                )?;
            }

-            let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
-                let conf = env.get_pageserver_conf(pageserver_id).unwrap();
-                // Use gRPC if requested.
-                let pageserver = if endpoint.grpc {
-                    let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
-                    let (host, port) = parse_host_port(grpc_addr)?;
-                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
-                    (PageserverProtocol::Grpc, host, port)
-                } else {
+            let (shards, stripe_size) = if let Some(ps_id) = pageserver_id {
+                let conf = env.get_pageserver_conf(ps_id).unwrap();
+                let libpq_url = Some({
                    let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
                    let port = port.unwrap_or(5432);
-                    (PageserverProtocol::Libpq, host, port)
+                    format!("postgres://no_user@{host}:{port}")
+                });
+                let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
+                    let (host, port) = parse_host_port(grpc_addr)?;
+                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
+                    Some(format!("grpc://no_user@{host}:{port}"))
+                } else {
+                    None
                };
+                let pageserver = PageserverShardConnectionInfo {
+                    libpq_url,
+                    grpc_url,
+                };
+
                // If caller is telling us what pageserver to use, this is not a tenant which is
                // fully managed by storage controller, therefore not sharded.
-                (vec![pageserver], DEFAULT_STRIPE_SIZE)
+                (vec![(0, pageserver)], DEFAULT_STRIPE_SIZE)
            } else {
                // Look up the currently attached location of the tenant, and its striping metadata,
                // to pass these on to postgres.
                let storage_controller = StorageController::from_env(env);
                let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
-                let pageservers = futures::future::try_join_all(
-                    locate_result.shards.into_iter().map(|shard| async move {
+                let shards = futures::future::try_join_all(locate_result.shards.into_iter().map(
+                    |shard| async move {
                        if let ComputeMode::Static(lsn) = endpoint.mode {
                            // Initialize LSN leases for static computes.
                            let conf = env.get_pageserver_conf(shard.node_id).unwrap();
@@ -1538,28 +1544,34 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                                .await?;
                        }

-                        let pageserver = if endpoint.grpc {
-                            (
-                                PageserverProtocol::Grpc,
-                                Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))?,
-                                shard.listen_grpc_port.expect("no gRPC port"),
-                            )
+                        let libpq_host = Host::parse(&shard.listen_pg_addr)?;
+                        let libpq_port = shard.listen_pg_port;
+                        let libpq_url =
+                            Some(format!("postgres://no_user@{libpq_host}:{libpq_port}"));
+
+                        let grpc_url = if let Some(grpc_host) = shard.listen_grpc_addr {
+                            let grpc_port = shard.listen_grpc_port.expect("no gRPC port");
+                            Some(format!("grpc://no_user@{grpc_host}:{grpc_port}"))
                        } else {
-                            (
-                                PageserverProtocol::Libpq,
-                                Host::parse(&shard.listen_pg_addr)?,
-                                shard.listen_pg_port,
-                            )
+                            None
                        };
-                        anyhow::Ok(pageserver)
-                    }),
-                )
+                        let pageserver = PageserverShardConnectionInfo {
+                            libpq_url,
+                            grpc_url,
+                        };
+                        anyhow::Ok((shard.shard_id.shard_number.0 as u32, pageserver))
+                    },
+                ))
                .await?;
                let stripe_size = locate_result.shard_params.stripe_size;

-                (pageservers, stripe_size)
+                (shards, stripe_size)
+            };
+            assert!(!shards.is_empty());
+            let pageserver_conninfo = PageserverConnectionInfo {
+                shards: shards.into_iter().collect(),
+                prefer_grpc: endpoint.grpc,
            };
-            assert!(!pageservers.is_empty());

            let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?;
            let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
@@ -1591,7 +1603,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                    endpoint_storage_addr,
                    safekeepers_generation,
                    safekeepers,
-                    pageservers,
+                    pageserver_conninfo,
                    remote_ext_base_url.as_ref(),
                    stripe_size.0 as usize,
                    args.create_test_user,
@@ -1606,20 +1618,27 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                .endpoints
                .get(endpoint_id.as_str())
                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
-            let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id {
+            let shards = if let Some(ps_id) = args.endpoint_pageserver_id {
                let conf = env.get_pageserver_conf(ps_id)?;
-                // Use gRPC if requested.
-                let pageserver = if endpoint.grpc {
-                    let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
-                    let (host, port) = parse_host_port(grpc_addr)?;
-                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
-                    (PageserverProtocol::Grpc, host, port)
-                } else {
+                let libpq_url = Some({
                    let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
                    let port = port.unwrap_or(5432);
-                    (PageserverProtocol::Libpq, host, port)
+                    format!("postgres://no_user@{host}:{port}")
+                });
+                let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
+                    let (host, port) = parse_host_port(grpc_addr)?;
+                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
+                    Some(format!("grpc://no_user@{host}:{port}"))
+                } else {
+                    None
                };
-                vec![pageserver]
+                let pageserver = PageserverShardConnectionInfo {
+                    libpq_url,
+                    grpc_url,
+                };
+                // If caller is telling us what pageserver to use, this is not a tenant which is
+                // fully managed by storage controller, therefore not sharded.
+                vec![(0, pageserver)]
            } else {
                let storage_controller = StorageController::from_env(env);
                storage_controller
@@ -1629,28 +1648,36 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                    .into_iter()
                    .map(|shard| {
                        // Use gRPC if requested.
-                        if endpoint.grpc {
-                            (
-                                PageserverProtocol::Grpc,
-                                Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))
-                                    .expect("bad hostname"),
-                                shard.listen_grpc_port.expect("no gRPC port"),
-                            )
+                        let libpq_host = Host::parse(&shard.listen_pg_addr).expect("bad hostname");
+                        let libpq_port = shard.listen_pg_port;
+                        let libpq_url =
+                            Some(format!("postgres://no_user@{libpq_host}:{libpq_port}"));
+
+                        let grpc_url = if let Some(grpc_host) = shard.listen_grpc_addr {
+                            let grpc_port = shard.listen_grpc_port.expect("no gRPC port");
+                            Some(format!("grpc://no_user@{grpc_host}:{grpc_port}"))
                        } else {
-                            (
-                                PageserverProtocol::Libpq,
-                                Host::parse(&shard.listen_pg_addr).expect("bad hostname"),
-                                shard.listen_pg_port,
-                            )
-                        }
+                            None
+                        };
+                        (
+                            shard.shard_id.shard_number.0 as u32,
+                            PageserverShardConnectionInfo {
+                                libpq_url,
+                                grpc_url,
+                            },
+                        )
                    })
                    .collect::<Vec<_>>()
            };
+            let pageserver_conninfo = PageserverConnectionInfo {
+                shards: shards.into_iter().collect(),
+                prefer_grpc: endpoint.grpc,
+            };
            // If --safekeepers argument is given, use only the listed
            // safekeeper nodes; otherwise all from the env.
            let safekeepers = parse_safekeepers(&args.safekeepers)?;
            endpoint
-                .reconfigure(Some(pageservers), None, safekeepers, None)
+                .reconfigure(Some(pageserver_conninfo), None, safekeepers, None)
                .await?;
        }
        EndpointCmd::Stop(args) => {
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -56,9 +56,13 @@ use compute_api::responses::{
    TlsConfig,
 };
 use compute_api::spec::{
-    Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol,
-    PgIdent, RemoteExtSpec, Role,
+    Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
+    RemoteExtSpec, Role,
 };
+
+// re-export these, because they're used in the reconfigure() function
+pub use compute_api::spec::{PageserverConnectionInfo, PageserverShardConnectionInfo};
+
 use jsonwebtoken::jwk::{
    AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
    OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse,
@@ -74,7 +78,6 @@ use sha2::{Digest, Sha256};
 use spki::der::Decode;
 use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef};
 use tracing::debug;
-use url::Host;
 use utils::id::{NodeId, TenantId, TimelineId};

 use crate::local_env::LocalEnv;
@@ -637,14 +640,6 @@ impl Endpoint {
        }
    }

-    fn build_pageserver_connstr(pageservers: &[(PageserverProtocol, Host, u16)]) -> String {
-        pageservers
-            .iter()
-            .map(|(scheme, host, port)| format!("{scheme}://no_user@{host}:{port}"))
-            .collect::<Vec<_>>()
-            .join(",")
-    }
-
    /// Map safekeepers ids to the actual connection strings.
    fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
        let mut safekeeper_connstrings = Vec::new();
@@ -685,7 +680,7 @@ impl Endpoint {
        endpoint_storage_addr: String,
        safekeepers_generation: Option<SafekeeperGeneration>,
        safekeepers: Vec<NodeId>,
-        pageservers: Vec<(PageserverProtocol, Host, u16)>,
+        pageserver_conninfo: PageserverConnectionInfo,
        remote_ext_base_url: Option<&String>,
        shard_stripe_size: usize,
        create_test_user: bool,
@@ -704,9 +699,6 @@ impl Endpoint {
            std::fs::remove_dir_all(self.pgdata())?;
        }

-        let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
-        assert!(!pageserver_connstring.is_empty());
-
        let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;

        // check for file remote_extensions_spec.json
@@ -765,7 +757,7 @@ impl Endpoint {
                branch_id: None,
                endpoint_id: Some(self.endpoint_id.clone()),
                mode: self.mode,
-                pageserver_connstring: Some(pageserver_connstring),
+                pageserver_connection_info: Some(pageserver_conninfo),
                safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
                safekeeper_connstrings,
                storage_auth_token: auth_token.clone(),
@@ -974,7 +966,7 @@ impl Endpoint {

    pub async fn reconfigure(
        &self,
-        pageservers: Option<Vec<(PageserverProtocol, Host, u16)>>,
+        pageserver_conninfo: Option<PageserverConnectionInfo>,
        stripe_size: Option<ShardStripeSize>,
        safekeepers: Option<Vec<NodeId>>,
        safekeeper_generation: Option<SafekeeperGeneration>,
@@ -990,15 +982,17 @@ impl Endpoint {
        let postgresql_conf = self.read_postgresql_conf()?;
        spec.cluster.postgresql_conf = Some(postgresql_conf);

-        // If pageservers are not specified, don't change them.
-        if let Some(pageservers) = pageservers {
-            anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
-
-            let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
-            spec.pageserver_connstring = Some(pageserver_connstr);
-            if stripe_size.is_some() {
-                spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
-            }
+        if let Some(pageserver_conninfo) = pageserver_conninfo {
+            // If pageservers are provided, we need to ensure that they are not empty.
+            // This is a requirement for the compute_ctl configuration.
+            anyhow::ensure!(
+                !pageserver_conninfo.shards.is_empty(),
+                "no pageservers provided"
+            );
+            spec.pageserver_connection_info = Some(pageserver_conninfo);
+        }
+        if stripe_size.is_some() {
+            spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
        }

        // If safekeepers are not specified, don't change them.
@@ -1047,7 +1041,7 @@ impl Endpoint {

    pub async fn reconfigure_pageservers(
        &self,
-        pageservers: Vec<(PageserverProtocol, Host, u16)>,
+        pageservers: PageserverConnectionInfo,
        stripe_size: Option<ShardStripeSize>,
    ) -> Result<()> {
        self.reconfigure(Some(pageservers), stripe_size, None, None)
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -105,7 +105,11 @@ pub struct ComputeSpec {
    // updated to fill these fields, we can make these non optional.
    pub tenant_id: Option<TenantId>,
    pub timeline_id: Option<TimelineId>,
-    pub pageserver_connstring: Option<String>,
+
+    // Pageserver information can be passed in two different ways:
+    // 1. Here, as structured per-shard connection info.
+    // 2. In cluster.settings. This is legacy; we are switching to method 1.
+    pub pageserver_connection_info: Option<PageserverConnectionInfo>,

    // More neon ids that we expose to the compute_ctl
    // and to postgres as neon extension GUCs.
@@ -205,6 +209,20 @@ pub enum ComputeFeature {
    UnknownFeature,
 }

+/// Pageserver connection info: per-shard URLs keyed by shard number, plus the protocol preference.
+#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
+pub struct PageserverConnectionInfo {
+    pub shards: HashMap<u32, PageserverShardConnectionInfo>,
+
+    pub prefer_grpc: bool,
+}
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
+pub struct PageserverShardConnectionInfo {
+    pub libpq_url: Option<String>,
+    pub grpc_url: Option<String>,
+}
+
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct RemoteExtSpec {
    pub public_extensions: Option<Vec<String>>,
@@ -322,6 +340,12 @@ impl ComputeMode {
    }
 }

+impl Display for ComputeMode {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.to_type_str())
+    }
+}
+
 /// Log level for audit logging
 #[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeAudit {
--- a/libs/neon-shmem/Cargo.toml
+++ b/libs/neon-shmem/Cargo.toml
@@ -6,8 +6,13 @@ license.workspace = true

 [dependencies]
 thiserror.workspace = true
-nix.workspace=true
+nix.workspace = true
+spin.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }

+[dev-dependencies]
+rand = "0.9.1"
+rand_distr = "0.5.1"
+
 [target.'cfg(target_os = "macos")'.dependencies]
 tempfile = "3.14.0"
--- a/libs/neon-shmem/src/hash.rs
+++ b/libs/neon-shmem/src/hash.rs
@@ -0,0 +1,367 @@
+//! Hash table implementation on top of 'shmem'
+//!
+//! Features required in the long run by the communicator project:
+//!
+//! [X] Accessible from both Postgres processes and rust threads in the communicator process
+//! [X] Low latency
+//! [ ] Scalable to lots of concurrent accesses (currently uses a single spinlock)
+//! [ ] Resizable
+
+use std::fmt::Debug;
+use std::hash::Hash;
+use std::mem::MaybeUninit;
+use std::ops::Deref;
+
+use crate::shmem::ShmemHandle;
+
+use spin;
+
+mod core;
+
+#[cfg(test)]
+mod tests;
+
+use core::CoreHashMap;
+
+pub enum UpdateAction<V> {
+    Nothing,
+    Insert(V),
+    Remove,
+}
+
+#[derive(Debug)]
+pub struct OutOfMemoryError();
+
+pub struct HashMapInit<'a, K, V> {
+    // Hash table can be allocated in a fixed memory area, or in a resizeable ShmemHandle.
+    shmem_handle: Option<ShmemHandle>,
+    shared_ptr: *mut HashMapShared<'a, K, V>,
+}
+
+pub struct HashMapAccess<'a, K, V> {
+    shmem_handle: Option<ShmemHandle>,
+    shared_ptr: *mut HashMapShared<'a, K, V>,
+}
+
+unsafe impl<'a, K: Sync, V: Sync> Sync for HashMapAccess<'a, K, V> {}
+unsafe impl<'a, K: Send, V: Send> Send for HashMapAccess<'a, K, V> {}
+
+impl<'a, K, V> HashMapInit<'a, K, V> {
+    pub fn attach_writer(self) -> HashMapAccess<'a, K, V> {
+        HashMapAccess {
+            shmem_handle: self.shmem_handle,
+            shared_ptr: self.shared_ptr,
+        }
+    }
+
+    pub fn attach_reader(self) -> HashMapAccess<'a, K, V> {
+        // no difference to attach_writer currently
+        self.attach_writer()
+    }
+}
+
+// This is stored in the shared memory area
+struct HashMapShared<'a, K, V> {
+    inner: spin::RwLock<CoreHashMap<'a, K, V>>,
+}
+
+impl<'a, K, V> HashMapInit<'a, K, V>
+where
+    K: Clone + Hash + Eq,
+{
+    pub fn estimate_size(num_buckets: u32) -> usize {
+        // add some margin to cover alignment etc.
+        CoreHashMap::<K, V>::estimate_size(num_buckets) + size_of::<HashMapShared<K, V>>() + 1000
+    }
+
+    pub fn init_in_fixed_area(
+        num_buckets: u32,
+        area: &'a mut [MaybeUninit<u8>],
+    ) -> HashMapInit<'a, K, V> {
+        Self::init_common(num_buckets, None, area.as_mut_ptr().cast(), area.len())
+    }
+
+    /// Initialize a new hash map in the given shared memory area
+    pub fn init_in_shmem(num_buckets: u32, mut shmem: ShmemHandle) -> HashMapInit<'a, K, V> {
+        let size = Self::estimate_size(num_buckets);
+        shmem
+            .set_size(size)
+            .expect("could not resize shared memory area");
+
+        let ptr = unsafe { shmem.data_ptr.as_mut() };
+        Self::init_common(num_buckets, Some(shmem), ptr, size)
+    }
+
+    fn init_common(
+        num_buckets: u32,
+        shmem_handle: Option<ShmemHandle>,
+        area_ptr: *mut u8,
+        area_len: usize,
+    ) -> HashMapInit<'a, K, V> {
+        // carve out HashMapShared from the area. This does not include the hashmap's dictionary
+        // and buckets.
+        let mut ptr: *mut u8 = area_ptr;
+        ptr = unsafe { ptr.add(ptr.align_offset(align_of::<HashMapShared<K, V>>())) };
+        let shared_ptr: *mut HashMapShared<K, V> = ptr.cast();
+        ptr = unsafe { ptr.add(size_of::<HashMapShared<K, V>>()) };
+
+        // the rest of the space is given to the hash map's dictionary and buckets
+        let remaining_area = unsafe {
+            std::slice::from_raw_parts_mut(ptr, area_len - ptr.offset_from(area_ptr) as usize)
+        };
+
+        let hashmap = CoreHashMap::new(num_buckets, remaining_area);
+        unsafe {
+            std::ptr::write(
+                shared_ptr,
+                HashMapShared {
+                    inner: spin::RwLock::new(hashmap),
+                },
+            );
+        }
+
+        HashMapInit {
+            shmem_handle,
+            shared_ptr,
+        }
+    }
+}
+
+impl<'a, K, V> HashMapAccess<'a, K, V>
+where
+    K: Clone + Hash + Eq,
+{
+    pub fn get<'e>(&'e self, key: &K) -> Option<ValueReadGuard<'e, K, V>> {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let lock_guard = map.inner.read();
+
+        match lock_guard.get(key) {
+            None => None,
+            Some(val_ref) => {
+                let val_ptr = std::ptr::from_ref(val_ref);
+                Some(ValueReadGuard {
+                    _lock_guard: lock_guard,
+                    value: val_ptr,
+                })
+            }
+        }
+    }
+
+    /// Insert a value. If the key already exists, the old value is kept and false is returned.
+    pub fn insert(&self, key: &K, value: V) -> Result<bool, OutOfMemoryError> {
+        let mut success = None;
+
+        self.update_with_fn(key, |existing| {
+            if existing.is_some() {
+                success = Some(false);
+                UpdateAction::Nothing
+            } else {
+                success = Some(true);
+                UpdateAction::Insert(value)
+            }
+        })?;
+        Ok(success.expect("value_fn not called"))
+    }
+
+    /// Remove value. Returns true if it existed
+    pub fn remove(&self, key: &K) -> bool {
+        let mut result = false;
+        self.update_with_fn(key, |existing| match existing {
+            Some(_) => {
+                result = true;
+                UpdateAction::Remove
+            }
+            None => UpdateAction::Nothing,
+        })
+        .expect("out of memory while removing");
+        result
+    }
+
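+    /// Update key using the given function. All the other modifying operations are based on this. NOTE(review): the core insert/remove results are discarded, so this currently always returns Ok(()).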
+    pub fn update_with_fn<F>(&self, key: &K, value_fn: F) -> Result<(), OutOfMemoryError>
+    where
+        F: FnOnce(Option<&V>) -> UpdateAction<V>,
+    {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let mut lock_guard = map.inner.write();
+
+        let old_val = lock_guard.get(key);
+        let action = value_fn(old_val);
+        match (old_val, action) {
+            (_, UpdateAction::Nothing) => {}
+            (_, UpdateAction::Insert(new_val)) => {
+                let _ = lock_guard.insert(key, new_val);
+            }
+            (None, UpdateAction::Remove) => panic!("Remove action with no old value"),
+            (Some(_), UpdateAction::Remove) => {
+                let _ = lock_guard.remove(key);
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Update the entry in bucket `pos` using the given function. Returning `Insert` panics.
+    pub fn update_with_fn_at_bucket<F>(
+        &self,
+        pos: usize,
+        value_fn: F,
+    ) -> Result<(), OutOfMemoryError>
+    where
+        F: FnOnce(Option<&V>) -> UpdateAction<V>,
+    {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let mut lock_guard = map.inner.write();
+
+        let old_val = lock_guard.get_bucket(pos);
+        let action = value_fn(old_val.map(|(_k, v)| v));
+        match (old_val, action) {
+            (_, UpdateAction::Nothing) => {}
+            (_, UpdateAction::Insert(_new_val)) => panic!("cannot insert without key"),
+            (None, UpdateAction::Remove) => panic!("Remove action with no old value"),
+            (Some((key, _value)), UpdateAction::Remove) => {
+                let key = key.clone();
+                let _ = lock_guard.remove(&key);
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Total number of buckets in the table, as reported by the underlying core map.
+    pub fn get_num_buckets(&self) -> usize {
+        let shared = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let core = shared.inner.read();
+        core.get_num_buckets()
+    }
+
+    /// Return a guard for the value stored in the bucket with the given index, or `None` if
+    /// the bucket is empty or `pos` is out of range. This can be used to iterate through the
+    /// hash map. (An Iterator might be nicer. The communicator's clock algorithm needs to
+    /// _slowly_ iterate through all buckets with its clock hand, without holding a lock. If we
+    /// switch to an Iterator, it must not hold the lock.)
+    ///
+    /// The returned guard keeps the map's read lock for as long as it is alive.
+    pub fn get_bucket<'e>(&'e self, pos: usize) -> Option<ValueReadGuard<'e, K, V>> {
+        let shared = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let read_guard = shared.inner.read();
+
+        // Turn the reference into a raw pointer, so that the lock guard can be moved into the
+        // returned struct alongside it.
+        let value: *const V = {
+            let (_key, val_ref) = read_guard.get_bucket(pos)?;
+            std::ptr::from_ref(val_ref)
+        };
+
+        Some(ValueReadGuard {
+            _lock_guard: read_guard,
+            value,
+        })
+    }
+
+    /// Number of buckets that currently hold an entry. Intended for metrics.
+    pub fn get_num_buckets_in_use(&self) -> usize {
+        let shared = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let core = shared.inner.read();
+        core.buckets_in_use as usize
+    }
+
+    /// Grow the hash table to `num_buckets` buckets.
+    ///
+    /// 1. Grow the underlying shared memory area.
+    /// 2. Initialize the new buckets and link them to the front of the free list. This
+    ///    overwrites the current dictionary, which is stored right after the old buckets.
+    /// 3. Recalculate the dictionary at its new location, from the hashes stored in the
+    ///    buckets.
+    ///
+    /// Calling this with the current number of buckets is a no-op. Existing entries and the
+    /// existing free buckets are preserved.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `num_buckets` is smaller than the current number of buckets, or if the table
+    /// has no shmem handle (i.e. it is a fixed-size table).
+    pub fn grow(&self, num_buckets: u32) -> Result<(), crate::shmem::Error> {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let mut lock_guard = map.inner.write();
+        let inner = &mut *lock_guard;
+        let old_num_buckets = inner.buckets.len() as u32;
+
+        if num_buckets < old_num_buckets {
+            panic!("grow called with a smaller number of buckets");
+        }
+        if num_buckets == old_num_buckets {
+            return Ok(());
+        }
+        let shmem_handle = self
+            .shmem_handle
+            .as_ref()
+            .expect("grow called on a fixed-size hash table");
+
+        let size_bytes = HashMapInit::<K, V>::estimate_size(num_buckets);
+        shmem_handle.set_size(size_bytes)?;
+        let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
+
+        // Initialize new buckets. The new buckets are chained together, and the last one is
+        // linked to the old free list head, so that the previously free buckets stay
+        // reachable. NB: This overwrites the dictionary!
+        //
+        // `num_buckets - 1` cannot underflow: we returned or panicked above unless
+        // `num_buckets > old_num_buckets`.
+        let last_new_bucket = num_buckets - 1;
+        let buckets_ptr = inner.buckets.as_mut_ptr();
+        unsafe {
+            for i in old_num_buckets..num_buckets {
+                let bucket_ptr = buckets_ptr.add(i as usize);
+                bucket_ptr.write(core::Bucket {
+                    hash: 0,
+                    next: if i < last_new_bucket {
+                        i + 1
+                    } else {
+                        inner.free_head
+                    },
+                    inner: None,
+                });
+            }
+        }
+
+        // Recalculate the dictionary
+        let buckets;
+        let dictionary;
+        unsafe {
+            let buckets_end_ptr = buckets_ptr.add(num_buckets as usize);
+            let dictionary_ptr: *mut u32 = buckets_end_ptr
+                .byte_add(buckets_end_ptr.align_offset(align_of::<u32>()))
+                .cast();
+            // Measure the remaining space from the aligned start of the dictionary, not from
+            // the end of the buckets, so that the slice can never extend past `end_ptr`.
+            let dictionary_size: usize =
+                end_ptr.byte_offset_from(dictionary_ptr) as usize / size_of::<u32>();
+
+            buckets = std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize);
+            dictionary = std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size);
+        }
+        for item in dictionary.iter_mut() {
+            *item = core::INVALID_POS;
+        }
+
+        // Re-link every bucket that is in use into its dictionary chain. Only the old buckets
+        // can be in use; free buckets keep their free-list `next` link untouched.
+        for (i, bucket) in buckets
+            .iter_mut()
+            .enumerate()
+            .take(old_num_buckets as usize)
+        {
+            if bucket.inner.is_none() {
+                continue;
+            }
+            // Use the same slot computation as the lookup code in core.rs
+            let pos: usize = bucket.hash as usize % dictionary.len();
+            bucket.next = dictionary[pos];
+            dictionary[pos] = i as u32;
+        }
+
+        // Finally, update the CoreHashMap struct. The free list now starts with the first
+        // new bucket.
+        inner.dictionary = dictionary;
+        inner.buckets = buckets;
+        inner.free_head = old_num_buckets;
+
+        Ok(())
+    }
+
+    // TODO: Shrinking is a multi-step process that requires co-operation from the caller
+    //
+    // 1. The caller must first call begin_shrink(). That forbids allocation of higher-numbered
+    // buckets.
+    //
+    // 2. Next, the caller must evict all entries in higher-numbered buckets.
+    //
+    // 3. Finally, call finish_shrink(). This recomputes the dictionary and shrinks the underlying
+    //    shmem area
+}
+
+/// Guard returned by `get_bucket`. It bundles a read lock on the map with a raw pointer to a
+/// value stored inside the map, and dereferences to that value.
+pub struct ValueReadGuard<'a, K, V> {
+    // Read lock on the map. Never read; it is held only so that it is released when the
+    // guard is dropped.
+    _lock_guard: spin::RwLockReadGuard<'a, CoreHashMap<'a, K, V>>,
+    // Raw pointer to the value inside the locked map.
+    value: *const V,
+}
+
+impl<'a, K, V> Deref for ValueReadGuard<'a, K, V> {
+    type Target = V;
+
+    /// Borrow the value this guard points to.
+    fn deref(&self) -> &V {
+        let value_ptr: *const V = self.value;
+        // SAFETY: `value` was obtained from a valid reference into the map, and the read lock
+        // held in `_lock_guard` stays locked for as long as this guard is alive, which is at
+        // least as long as the returned borrow.
+        unsafe { &*value_ptr }
+    }
+}
--- a/libs/neon-shmem/src/hash/core.rs
+++ b/libs/neon-shmem/src/hash/core.rs
@@ -0,0 +1,233 @@
+//! Simple hash table with chaining
+//!
+//! # Resizing
+//!
+
+use std::hash::{DefaultHasher, Hash, Hasher};
+use std::mem::MaybeUninit;
+
+/// Sentinel bucket index: marks an empty dictionary slot and the end of a bucket chain
+/// or of the free list.
+pub(crate) const INVALID_POS: u32 = u32::MAX;
+
+/// One slot of the bucket array. A bucket either holds an entry and is linked into a
+/// dictionary chain, or it is unused and linked into the free list.
+pub(crate) struct Bucket<K, V> {
+    /// Hash of the stored key. Set to 0 while the bucket is unused.
+    pub(crate) hash: u64,
+    /// Index of the next bucket in the same chain (dictionary chain or free list), or
+    /// `INVALID_POS` at the end of the chain.
+    pub(crate) next: u32,
+    /// The stored key and value, or `None` while the bucket is unused.
+    pub(crate) inner: Option<(K, V)>,
+}
+
+/// The hash table proper: a dictionary of chain heads plus an array of buckets, both living
+/// in a caller-provided memory area.
+pub(crate) struct CoreHashMap<'a, K, V> {
+    /// Maps `hash % dictionary.len()` to the index of the first bucket in that chain, or
+    /// `INVALID_POS` if the chain is empty.
+    pub(crate) dictionary: &'a mut [u32],
+    /// All buckets, both in-use and free.
+    pub(crate) buckets: &'a mut [Bucket<K, V>],
+    /// Index of the first bucket in the free list, or `INVALID_POS` if no bucket is free.
+    pub(crate) free_head: u32,
+
+    // metrics
+    /// Number of buckets that currently hold an entry.
+    pub(crate) buckets_in_use: u32,
+}
+
+/// Error returned when an entry cannot be stored because no free bucket is left.
+pub struct FullError();
+
+impl<'a, K, V> CoreHashMap<'a, K, V>
+where
+    K: Clone + Hash + Eq,
+{
+    /// Ratio of buckets to dictionary slots that `estimate_size` reserves space for.
+    const FILL_FACTOR: f32 = 0.60;
+
+    /// Estimate the size in bytes of the memory area needed for a table with `num_buckets`
+    /// buckets: the bucket array itself, plus a dictionary sized according to `FILL_FACTOR`.
+    pub fn estimate_size(num_buckets: u32) -> usize {
+        let mut size = 0;
+
+        // buckets
+        size += size_of::<Bucket<K, V>>() * num_buckets as usize;
+
+        // dictionary
+        size += (f32::ceil((size_of::<u32>() * num_buckets as usize) as f32 / Self::FILL_FACTOR))
+            as usize;
+
+        size
+    }
+
+    /// Create an empty hash table with `num_buckets` buckets in the given memory area.
+    ///
+    /// The buckets are carved out from the beginning of `area` (after alignment), and all the
+    /// remaining space is used for the dictionary. All buckets start out on the free list.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `area` is too small to hold the buckets and at least one dictionary slot.
+    pub fn new(num_buckets: u32, area: &'a mut [u8]) -> CoreHashMap<'a, K, V> {
+        let len = area.len();
+        let base: *mut u8 = area.as_mut_ptr();
+
+        // Work out the layout with integer offsets and check it against the length of the
+        // area first, so that we never form a pointer outside of `area`.
+        let buckets_start = base.align_offset(align_of::<Bucket<K, V>>());
+        let buckets_end = buckets_start + size_of::<Bucket<K, V>>() * num_buckets as usize;
+
+        // use remaining space for the dictionary
+        let dictionary_start =
+            buckets_end + base.wrapping_add(buckets_end).align_offset(align_of::<u32>());
+        assert!(dictionary_start < len);
+        let dictionary_size = (len - dictionary_start) / size_of::<u32>();
+        assert!(dictionary_size > 0);
+
+        // Initialize the buckets. Each bucket is linked to the next one, forming the initial
+        // free list.
+        let buckets = {
+            let buckets_ptr: *mut MaybeUninit<Bucket<K, V>> =
+                unsafe { base.add(buckets_start) }.cast();
+            let uninit =
+                unsafe { std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize) };
+            let last = uninit.len().saturating_sub(1);
+            for (i, slot) in uninit.iter_mut().enumerate() {
+                slot.write(Bucket {
+                    hash: 0,
+                    next: if i < last { i as u32 + 1 } else { INVALID_POS },
+                    inner: None,
+                });
+            }
+            // TODO: use std::slice::assume_init_mut() once it stabilizes
+            unsafe { std::slice::from_raw_parts_mut(buckets_ptr.cast(), num_buckets as usize) }
+        };
+
+        // Initialize the dictionary. All chains are empty.
+        let dictionary = {
+            let dictionary_ptr: *mut MaybeUninit<u32> =
+                unsafe { base.add(dictionary_start) }.cast();
+            let uninit = unsafe { std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size) };
+            for item in uninit.iter_mut() {
+                item.write(INVALID_POS);
+            }
+            // TODO: use std::slice::assume_init_mut() once it stabilizes
+            unsafe { std::slice::from_raw_parts_mut(dictionary_ptr.cast(), dictionary_size) }
+        };
+
+        CoreHashMap {
+            dictionary,
+            buckets,
+            // With zero buckets there is nothing to put on the free list
+            free_head: if num_buckets > 0 { 0 } else { INVALID_POS },
+            buckets_in_use: 0,
+        }
+    }
+
+    /// Hash `key`, and return the hash together with the index of the dictionary slot that
+    /// the key's chain hangs off.
+    fn hash_and_slot(&self, key: &K) -> (u64, usize) {
+        let mut hasher = DefaultHasher::new();
+        key.hash(&mut hasher);
+        let hash = hasher.finish();
+
+        (hash, hash as usize % self.dictionary.len())
+    }
+
+    /// Look up the value stored for `key`. Returns `None` if the key is not present.
+    pub fn get(&self, key: &K) -> Option<&V> {
+        let (_hash, slot) = self.hash_and_slot(key);
+
+        let mut next = self.dictionary[slot];
+        while next != INVALID_POS {
+            let bucket = &self.buckets[next as usize];
+            let (bucket_key, bucket_value) = bucket.inner.as_ref().expect("entry is in use");
+            if bucket_key == key {
+                return Some(bucket_value);
+            }
+            next = bucket.next;
+        }
+        None
+    }
+
+    /// Store `value` under `key`. If the key is already present, its value is replaced;
+    /// otherwise a free bucket is allocated and appended to the key's chain.
+    ///
+    /// Returns `FullError` if a new bucket is needed but none is free.
+    pub fn insert(&mut self, key: &K, value: V) -> Result<(), FullError> {
+        let (hash, slot) = self.hash_and_slot(key);
+
+        let first = self.dictionary[slot];
+        if first == INVALID_POS {
+            // The chain is empty, so there is no existing entry. Start a new chain.
+            let pos = self.alloc_bucket(key.clone(), value, hash)?;
+            self.dictionary[slot] = pos;
+            return Ok(());
+        }
+
+        let mut next = first;
+        loop {
+            let bucket = &mut self.buckets[next as usize];
+            let (bucket_key, bucket_value) = bucket.inner.as_mut().expect("entry is in use");
+            if bucket_key == key {
+                // found existing entry, update its value
+                *bucket_value = value;
+                return Ok(());
+            }
+
+            if bucket.next == INVALID_POS {
+                // No existing entry found. Append to the chain
+                let pos = self.alloc_bucket(key.clone(), value, hash)?;
+                self.buckets[next as usize].next = pos;
+                return Ok(());
+            }
+            next = bucket.next;
+        }
+    }
+
+    /// Remove the entry for `key`, returning its bucket to the free list. Does nothing if the
+    /// key is not present. The visible code paths always return `Ok(())`.
+    pub fn remove(&mut self, key: &K) -> Result<(), FullError> {
+        let (_hash, slot) = self.hash_and_slot(key);
+
+        let mut next = self.dictionary[slot];
+        let mut prev_pos: u32 = INVALID_POS;
+        loop {
+            if next == INVALID_POS {
+                // no existing entry
+                return Ok(());
+            }
+            let bucket = &self.buckets[next as usize];
+            let successor = bucket.next;
+            let (bucket_key, _) = bucket.inner.as_ref().expect("entry is in use");
+            if bucket_key == key {
+                // found existing entry, unlink it from the chain
+                if prev_pos == INVALID_POS {
+                    self.dictionary[slot] = successor;
+                } else {
+                    self.buckets[prev_pos as usize].next = successor;
+                }
+
+                // and add it to the freelist
+                let bucket = &mut self.buckets[next as usize];
+                bucket.hash = 0;
+                bucket.inner = None;
+                bucket.next = self.free_head;
+                self.free_head = next;
+                self.buckets_in_use -= 1;
+                return Ok(());
+            }
+            prev_pos = next;
+            next = successor;
+        }
+    }
+
+    /// Total number of buckets, both in-use and free.
+    pub fn get_num_buckets(&self) -> usize {
+        self.buckets.len()
+    }
+
+    /// Return the key and value stored in the bucket at index `pos`, or `None` if the bucket
+    /// is unused or `pos` is out of range.
+    pub fn get_bucket(&self, pos: usize) -> Option<&(K, V)> {
+        self.buckets.get(pos)?.inner.as_ref()
+    }
+
+    /// Take a bucket off the free list and store the given entry in it. The bucket is not
+    /// linked into any chain yet; that is up to the caller.
+    ///
+    /// Returns the index of the bucket, or `FullError` if the free list is empty.
+    fn alloc_bucket(&mut self, key: K, value: V, hash: u64) -> Result<u32, FullError> {
+        let pos = self.free_head;
+        if pos == INVALID_POS {
+            return Err(FullError());
+        }
+
+        let bucket = &mut self.buckets[pos as usize];
+        self.free_head = bucket.next;
+        self.buckets_in_use += 1;
+
+        bucket.hash = hash;
+        bucket.next = INVALID_POS;
+        bucket.inner = Some((key, value));
+
+        Ok(pos)
+    }
+}
--- a/libs/neon-shmem/src/hash/tests.rs
+++ b/libs/neon-shmem/src/hash/tests.rs
@@ -0,0 +1,220 @@
+use std::collections::BTreeMap;
+use std::collections::HashSet;
+use std::fmt::{Debug, Formatter};
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use crate::hash::HashMapAccess;
+use crate::hash::HashMapInit;
+use crate::hash::UpdateAction;
+use crate::shmem::ShmemHandle;
+
+use rand::seq::SliceRandom;
+use rand::{Rng, RngCore};
+use rand_distr::Zipf;
+
+/// Length of a `TestKey` in bytes; 16 bytes, the size of a `u128`.
+const TEST_KEY_LEN: usize = 16;
+
+/// Fixed-size byte-array key used by the tests. Convertible to and from `u128` in
+/// big-endian byte order.
+#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+struct TestKey([u8; TEST_KEY_LEN]);
+
+impl From<&TestKey> for u128 {
+    /// Interpret the key's bytes as a big-endian integer.
+    fn from(val: &TestKey) -> u128 {
+        let TestKey(bytes) = val;
+        u128::from_be_bytes(*bytes)
+    }
+}
+
+impl From<u128> for TestKey {
+    /// Build a key from the big-endian bytes of the integer.
+    fn from(val: u128) -> TestKey {
+        let bytes = val.to_be_bytes();
+        TestKey(bytes)
+    }
+}
+
+impl<'a> From<&'a [u8]> for TestKey {
+    /// Build a key from a byte slice. Panics if the slice is not exactly `TEST_KEY_LEN`
+    /// bytes long.
+    fn from(bytes: &'a [u8]) -> TestKey {
+        let array: [u8; TEST_KEY_LEN] = bytes.try_into().unwrap();
+        TestKey(array)
+    }
+}
+
+/// Insert all the given keys into a fresh hash map, using each key's position in `keys` as
+/// its value, and then check that every key can be read back with that value.
+fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
+    const MAX_MEM_SIZE: usize = 10000000;
+    let shmem = ShmemHandle::new("test_inserts", 0, MAX_MEM_SIZE).unwrap();
+
+    let map = HashMapInit::<TestKey, usize>::init_in_shmem(100000, shmem).attach_writer();
+
+    // Insert everything first ...
+    for (idx, &k) in keys.iter().enumerate() {
+        let key: TestKey = k.into();
+        assert!(map.insert(&key, idx).is_ok());
+    }
+
+    // ... and then read it all back
+    for (idx, &k) in keys.iter().enumerate() {
+        let key: TestKey = k.into();
+        let guard = map.get(&key);
+        assert_eq!(guard.as_deref().copied(), Some(idx));
+    }
+}
+
+#[test]
+fn dense() {
+    // A handful of hand-picked keys
+    let few_keys: &[u128] = &[0, 1, 2, 3, 256];
+    test_inserts(few_keys);
+
+    // Dense keys, in ascending order
+    let mut dense_keys: Vec<u128> = (0..10000).collect();
+    test_inserts(&dense_keys);
+
+    // The same keys again, shuffled into a different random order each round
+    let mut rng = rand::rng();
+    for _round in 1..10 {
+        dense_keys.shuffle(&mut rng);
+        test_inserts(&dense_keys);
+    }
+}
+
+#[test]
+fn sparse() {
+    // 10000 distinct keys, drawn at random from the whole u128 range
+    let mut seen = HashSet::new();
+    let mut keys: Vec<TestKey> = Vec::new();
+    while keys.len() < 10000 {
+        let candidate = rand::random::<u128>();
+        // `insert` returns false if the value was drawn before; just draw again then
+        if seen.insert(candidate) {
+            keys.push(candidate.into());
+        }
+    }
+    test_inserts(&keys);
+}
+
+/// Test value wrapping an atomic counter, so that it can be updated in place through a
+/// shared reference.
+struct TestValue(AtomicUsize);
+
+impl TestValue {
+    /// Wrap `val` in a new `TestValue`.
+    fn new(val: usize) -> TestValue {
+        let inner = AtomicUsize::new(val);
+        TestValue(inner)
+    }
+
+    /// Read the current value, with relaxed ordering.
+    fn load(&self) -> usize {
+        let TestValue(inner) = self;
+        inner.load(Ordering::Relaxed)
+    }
+}
+
+impl Clone for TestValue {
+    /// Create an independent `TestValue` holding a snapshot of the current value.
+    fn clone(&self) -> TestValue {
+        let snapshot = self.load();
+        TestValue::new(snapshot)
+    }
+}
+
+impl Debug for TestValue {
+    /// Print the current value as a plain number.
+    fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
+        let current = self.load();
+        write!(fmt, "{current:?}")
+    }
+}
+
+/// One operation on a key: `Some(v)` sets the key's value to `v`, `None` removes the key.
+#[derive(Clone, Debug)]
+struct TestOp(TestKey, Option<usize>);
+
+/// Apply `op` to both the hash map under test and the shadow `BTreeMap`, asserting along the
+/// way that both agree on the value the key had before the operation.
+fn apply_op(
+    op: &TestOp,
+    sut: &HashMapAccess<TestKey, TestValue>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    eprintln!("applying op: {op:?}");
+
+    let TestOp(key, new_value) = op;
+
+    // apply the change to the shadow map first, remembering what was there before
+    let shadow_existing = match *new_value {
+        Some(v) => shadow.insert(*key, v),
+        None => shadow.remove(key),
+    };
+
+    // then apply it to the hash map under test
+    let outcome = sut.update_with_fn(key, |existing| {
+        assert_eq!(existing.map(TestValue::load), shadow_existing);
+
+        match (existing, *new_value) {
+            (Some(old_val), Some(v)) => {
+                // the key exists already: overwrite the value in place
+                old_val.0.store(v, Ordering::Relaxed);
+                UpdateAction::Nothing
+            }
+            (Some(_old_val), None) => UpdateAction::Remove,
+            (None, Some(v)) => UpdateAction::Insert(TestValue::new(v)),
+            (None, None) => UpdateAction::Nothing,
+        }
+    });
+    outcome.expect("out of memory");
+}
+
+/// Apply a long sequence of random inserts, updates and removals, with keys drawn from a
+/// Zipf distribution, and check every step against a shadow `BTreeMap` (see `apply_op`).
+#[test]
+fn random_ops() {
+    const MAX_MEM_SIZE: usize = 10000000;
+    // Name the shmem segment after this test, like the other tests do
+    let shmem = ShmemHandle::new("random_ops", 0, MAX_MEM_SIZE).unwrap();
+
+    let init_struct = HashMapInit::<TestKey, TestValue>::init_in_shmem(100000, shmem);
+    let writer = init_struct.attach_writer();
+
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+
+    let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
+    let mut rng = rand::rng();
+    for i in 0..100000 {
+        let key: TestKey = (rng.sample(distribution) as u128).into();
+
+        // 75% of the operations set the key to a new value, the rest remove it
+        let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
+
+        apply_op(&op, &writer, &mut shadow);
+
+        if i % 1000 == 0 {
+            eprintln!("{i} ops processed");
+        }
+    }
+}
+
+/// Run random operations on a small hash map, grow it, and run some more random operations
+/// over a larger key range. Every step is checked against a shadow `BTreeMap` (see
+/// `apply_op`).
+#[test]
+fn test_grow() {
+    const MEM_SIZE: usize = 10000000;
+    let shmem = ShmemHandle::new("test_grow", 0, MEM_SIZE).unwrap();
+
+    let init_struct = HashMapInit::<TestKey, TestValue>::init_in_shmem(1000, shmem);
+    let writer = init_struct.attach_writer();
+
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+
+    let mut rng = rand::rng();
+
+    // Apply 10000 random operations on keys in the range 0..key_range
+    let mut run_random_ops = |key_range: u32| {
+        for i in 0..10000 {
+            let key: TestKey = ((rng.next_u32() % key_range) as u128).into();
+
+            let new_value = if rng.random_bool(0.75) { Some(i) } else { None };
+            let op = TestOp(key, new_value);
+
+            apply_op(&op, &writer, &mut shadow);
+
+            if i % 1000 == 0 {
+                eprintln!("{i} ops processed");
+            }
+        }
+    };
+
+    run_random_ops(1000);
+
+    writer.grow(1500).unwrap();
+
+    run_random_ops(1500);
+}
--- a/libs/neon-shmem/src/lib.rs
+++ b/libs/neon-shmem/src/lib.rs
@@ -1,418 +1,4 @@
 //! Shared memory utilities for neon communicator

-use std::num::NonZeroUsize;
-use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
-use std::ptr::NonNull;
-use std::sync::atomic::{AtomicUsize, Ordering};
-
-use nix::errno::Errno;
-use nix::sys::mman::MapFlags;
-use nix::sys::mman::ProtFlags;
-use nix::sys::mman::mmap as nix_mmap;
-use nix::sys::mman::munmap as nix_munmap;
-use nix::unistd::ftruncate as nix_ftruncate;
-
-/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
-/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
-/// specified at creation.
-///
-/// The area is backed by an anonymous file created with memfd_create(). The full address space for
-/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
-/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
-/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
-/// future.
-pub struct ShmemHandle {
-    /// memfd file descriptor
-    fd: OwnedFd,
-
-    max_size: usize,
-
-    // Pointer to the beginning of the shared memory area. The header is stored there.
-    shared_ptr: NonNull<SharedStruct>,
-
-    // Pointer to the beginning of the user data
-    pub data_ptr: NonNull<u8>,
-}
-
-/// This is stored at the beginning in the shared memory area.
-struct SharedStruct {
-    max_size: usize,
-
-    /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
-    current_size: AtomicUsize,
-}
-
-const RESIZE_IN_PROGRESS: usize = 1 << 63;
-
-const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
-
-/// Error type returned by the ShmemHandle functions.
-#[derive(thiserror::Error, Debug)]
-#[error("{msg}: {errno}")]
-pub struct Error {
-    pub msg: String,
-    pub errno: Errno,
-}
-
-impl Error {
-    fn new(msg: &str, errno: Errno) -> Error {
-        Error {
-            msg: msg.to_string(),
-            errno,
-        }
-    }
-}
-
-impl ShmemHandle {
-    /// Create a new shared memory area. To communicate between processes, the processes need to be
-    /// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
-    ///
-    /// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
-    /// processes can continue using it, however.
-    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
-        // create the backing anonymous file.
-        let fd = create_backing_file(name)?;
-
-        Self::new_with_fd(fd, initial_size, max_size)
-    }
-
-    fn new_with_fd(
-        fd: OwnedFd,
-        initial_size: usize,
-        max_size: usize,
-    ) -> Result<ShmemHandle, Error> {
-        // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
-        // is a little larger than this because of the SharedStruct header. Make the upper limit
-        // somewhat smaller than that, because with anything close to that, you'll run out of
-        // memory anyway.
-        if max_size >= 1 << 48 {
-            panic!("max size {max_size} too large");
-        }
-        if initial_size > max_size {
-            panic!("initial size {initial_size} larger than max size {max_size}");
-        }
-
-        // The actual initial / max size is the one given by the caller, plus the size of
-        // 'SharedStruct'.
-        let initial_size = HEADER_SIZE + initial_size;
-        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
-
-        // Reserve address space for it with mmap
-        //
-        // TODO: Use MAP_HUGETLB if possible
-        let start_ptr = unsafe {
-            nix_mmap(
-                None,
-                max_size,
-                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
-                MapFlags::MAP_SHARED,
-                &fd,
-                0,
-            )
-        }
-        .map_err(|e| Error::new("mmap failed: {e}", e))?;
-
-        // Reserve space for the initial size
-        enlarge_file(fd.as_fd(), initial_size as u64)?;
-
-        // Initialize the header
-        let shared: NonNull<SharedStruct> = start_ptr.cast();
-        unsafe {
-            shared.write(SharedStruct {
-                max_size: max_size.into(),
-                current_size: AtomicUsize::new(initial_size),
-            })
-        };
-
-        // The user data begins after the header
-        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
-
-        Ok(ShmemHandle {
-            fd,
-            max_size: max_size.into(),
-            shared_ptr: shared,
-            data_ptr,
-        })
-    }
-
-    // return reference to the header
-    fn shared(&self) -> &SharedStruct {
-        unsafe { self.shared_ptr.as_ref() }
-    }
-
-    /// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
-    /// when creating the area.
-    ///
-    /// This may only be called from one process/thread concurrently. We detect that case
-    /// and return an Error.
-    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
-        let new_size = new_size + HEADER_SIZE;
-        let shared = self.shared();
-
-        if new_size > self.max_size {
-            panic!(
-                "new size ({} is greater than max size ({})",
-                new_size, self.max_size
-            );
-        }
-        assert_eq!(self.max_size, shared.max_size);
-
-        // Lock the area by setting the bit in 'current_size'
-        //
-        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
-        // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
-        // since this is not performance-critical, better safe than sorry .
-        let mut old_size = shared.current_size.load(Ordering::Acquire);
-        loop {
-            if (old_size & RESIZE_IN_PROGRESS) != 0 {
-                return Err(Error::new(
-                    "concurrent resize detected",
-                    Errno::UnknownErrno,
-                ));
-            }
-            match shared.current_size.compare_exchange(
-                old_size,
-                new_size,
-                Ordering::Acquire,
-                Ordering::Relaxed,
-            ) {
-                Ok(_) => break,
-                Err(x) => old_size = x,
-            }
-        }
-
-        // Ok, we got the lock.
-        //
-        // NB: If anything goes wrong, we *must* clear the bit!
-        let result = {
-            use std::cmp::Ordering::{Equal, Greater, Less};
-            match new_size.cmp(&old_size) {
-                Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
-                    Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
-                }),
-                Equal => Ok(()),
-                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
-            }
-        };
-
-        // Unlock
-        shared.current_size.store(
-            if result.is_ok() { new_size } else { old_size },
-            Ordering::Release,
-        );
-
-        result
-    }
-
-    /// Returns the current user-visible size of the shared memory segment.
-    ///
-    /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
-    /// responsibility not to access the area beyond the current size.
-    pub fn current_size(&self) -> usize {
-        let total_current_size =
-            self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
-        total_current_size - HEADER_SIZE
-    }
-}
-
-impl Drop for ShmemHandle {
-    fn drop(&mut self) {
-        // SAFETY: The pointer was obtained from mmap() with the given size.
-        // We unmap the entire region.
-        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
-        // The fd is dropped automatically by OwnedFd.
-    }
-}
-
-/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
-/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
-/// development and testing, but in production we want the file to stay in memory.
-///
-/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
-#[allow(unused_variables)]
-fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
-    #[cfg(not(target_os = "macos"))]
-    {
-        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
-            .map_err(|e| Error::new("memfd_create failed: {e}", e))
-    }
-    #[cfg(target_os = "macos")]
-    {
-        let file = tempfile::tempfile().map_err(|e| {
-            Error::new(
-                "could not create temporary file to back shmem area: {e}",
-                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
-            )
-        })?;
-        Ok(OwnedFd::from(file))
-    }
-}
-
-fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
-    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
-    // we don't get a segfault later when trying to actually use it.
-    #[cfg(not(target_os = "macos"))]
-    {
-        nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
-            Error::new(
-                "could not grow shmem segment, posix_fallocate failed: {e}",
-                e,
-            )
-        })
-    }
-    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
-    #[cfg(target_os = "macos")]
-    {
-        nix::unistd::ftruncate(fd, size as i64)
-            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    use nix::unistd::ForkResult;
-    use std::ops::Range;
-
-    /// check that all bytes in given range have the expected value.
-    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
-        for i in range {
-            let b = unsafe { *(ptr.add(i)) };
-            assert_eq!(expected, b, "unexpected byte at offset {i}");
-        }
-    }
-
-    /// Write 'b' to all bytes in the given range
-    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
-        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
-    }
-
-    // simple single-process test of growing and shrinking
-    #[test]
-    fn test_shmem_resize() -> Result<(), Error> {
-        let max_size = 1024 * 1024;
-        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
-
-        assert_eq!(init_struct.current_size(), 0);
-
-        // Initial grow
-        let size1 = 10000;
-        init_struct.set_size(size1).unwrap();
-        assert_eq!(init_struct.current_size(), size1);
-
-        // Write some data
-        let data_ptr = init_struct.data_ptr.as_ptr();
-        write_range(data_ptr, 0xAA, 0..size1);
-        assert_range(data_ptr, 0xAA, 0..size1);
-
-        // Shrink
-        let size2 = 5000;
-        init_struct.set_size(size2).unwrap();
-        assert_eq!(init_struct.current_size(), size2);
-
-        // Grow again
-        let size3 = 20000;
-        init_struct.set_size(size3).unwrap();
-        assert_eq!(init_struct.current_size(), size3);
-
-        // Try to read it. The area that was shrunk and grown again should read as all zeros now
-        assert_range(data_ptr, 0xAA, 0..5000);
-        assert_range(data_ptr, 0, 5000..size1);
-
-        // Try to grow beyond max_size
-        //let size4 = max_size + 1;
-        //assert!(init_struct.set_size(size4).is_err());
-
-        // Dropping init_struct should unmap the memory
-        drop(init_struct);
-
-        Ok(())
-    }
-
-    /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
-    /// but is stored in the shared memory area and works across processes. It's implemented by
-    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
-    struct SimpleBarrier {
-        num_procs: usize,
-        count: AtomicUsize,
-    }
-
-    impl SimpleBarrier {
-        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
-            unsafe {
-                *ptr = SimpleBarrier {
-                    num_procs,
-                    count: AtomicUsize::new(0),
-                }
-            }
-        }
-
-        pub fn wait(&self) {
-            let old = self.count.fetch_add(1, Ordering::Relaxed);
-
-            let generation = old / self.num_procs;
-
-            let mut current = old + 1;
-            while current < (generation + 1) * self.num_procs {
-                std::thread::sleep(std::time::Duration::from_millis(10));
-                current = self.count.load(Ordering::Relaxed);
-            }
-        }
-    }
-
-    #[test]
-    fn test_multi_process() {
-        // Initialize
-        let max_size = 1_000_000_000_000;
-        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
-        let ptr = init_struct.data_ptr.as_ptr();
-
-        // Store the SimpleBarrier in the first 1k of the area.
-        init_struct.set_size(10000).unwrap();
-        let barrier_ptr: *mut SimpleBarrier = unsafe {
-            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
-                .cast()
-        };
-        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
-        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
-
-        // Fork another test process. The code after this runs in both processes concurrently.
-        let fork_result = unsafe { nix::unistd::fork().unwrap() };
-
-        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
-        if fork_result.is_parent() {
-            write_range(ptr, 0xAA, 1000..2000);
-        } else {
-            write_range(ptr, 0xBB, 2000..3000);
-        }
-        barrier.wait();
-        // Verify the contents. (in both processes)
-        assert_range(ptr, 0xAA, 1000..2000);
-        assert_range(ptr, 0xBB, 2000..3000);
-
-        // Grow, from the child this time
-        let size = 10_000_000;
-        if !fork_result.is_parent() {
-            init_struct.set_size(size).unwrap();
-        }
-        barrier.wait();
-
-        // make some writes at the end
-        if fork_result.is_parent() {
-            write_range(ptr, 0xAA, (size - 10)..size);
-        } else {
-            write_range(ptr, 0xBB, (size - 20)..(size - 10));
-        }
-        barrier.wait();
-
-        // Verify the contents. (This runs in both processes)
-        assert_range(ptr, 0, (size - 1000)..(size - 20));
-        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
-        assert_range(ptr, 0xAA, (size - 10)..size);
-
-        if let ForkResult::Parent { child } = fork_result {
-            nix::sys::wait::waitpid(child, None).unwrap();
-        }
-    }
-}
+pub mod hash;
+pub mod shmem;
--- a/libs/neon-shmem/src/shmem.rs
+++ b/libs/neon-shmem/src/shmem.rs
@@ -0,0 +1,418 @@
+//! Dynamically resizable contiguous chunk of shared memory
+
+use std::num::NonZeroUsize;
+use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use nix::errno::Errno;
+use nix::sys::mman::MapFlags;
+use nix::sys::mman::ProtFlags;
+use nix::sys::mman::mmap as nix_mmap;
+use nix::sys::mman::munmap as nix_munmap;
+use nix::unistd::ftruncate as nix_ftruncate;
+
+/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
+/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
+/// specified at creation.
+///
+/// The area is backed by an anonymous file created with memfd_create(). The full address space for
+/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
+/// the underlying file is resized. Do not access the area beyond the current size.
+///
+/// NOTE(review): an earlier version of this comment said that touching memory beyond the current
+/// size "will cause the file to be expanded". That looks doubtful: per mmap(2), accessing pages of
+/// a file mapping that lie beyond the end of the file normally raises SIGBUS. Confirm before
+/// relying on either behavior. We might use mprotect() etc. to enforce the limit in the future.
+pub struct ShmemHandle {
+    /// File descriptor of the backing file (a memfd where available)
+    fd: OwnedFd,
+
+    /// Size of the whole mapping in bytes: the 'max_size' given by the caller plus the size of
+    /// the [`SharedStruct`] header. This is also the length passed to munmap() on drop.
+    max_size: usize,
+
+    /// Pointer to the beginning of the shared memory area. The header is stored there.
+    shared_ptr: NonNull<SharedStruct>,
+
+    /// Pointer to the beginning of the user data, i.e. the first byte after the header.
+    pub data_ptr: NonNull<u8>,
+}
+
+/// Header stored at the very beginning of the shared memory area. Because it lives inside the
+/// mapping itself, every process that inherited the mapping sees the same instance.
+struct SharedStruct {
+    /// Size of the whole mapping in bytes, header included. Written once at creation.
+    max_size: usize,
+
+    /// Current size of the backing file in bytes, header included. The high-order bit is
+    /// reserved for the RESIZE_IN_PROGRESS flag and must be masked off to get the size.
+    current_size: AtomicUsize,
+}
+
+const RESIZE_IN_PROGRESS: usize = 1 << 63;
+
+const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
+
+/// Error type returned by the ShmemHandle functions.
+///
+/// The `Display` output is `"{msg}: {errno}"`, so the errno description is always appended
+/// automatically.
+#[derive(thiserror::Error, Debug)]
+#[error("{msg}: {errno}")]
+pub struct Error {
+    /// Human-readable description of the operation that failed.
+    pub msg: String,
+    /// The OS error associated with the failure.
+    pub errno: Errno,
+}
+
+impl Error {
+    /// Build an `Error` from a message and an errno.
+    ///
+    /// `msg` is stored verbatim: it is a plain string, not a format template, so placeholders
+    /// such as `{e}` inside it are NOT interpolated. There is no need to mention the errno in
+    /// `msg`, because `Display` appends it.
+    fn new(msg: &str, errno: Errno) -> Error {
+        Error {
+            msg: msg.to_string(),
+            errno,
+        }
+    }
+}
+
+impl ShmemHandle {
+    /// Create a new shared memory area named `name`, starting at `initial_size` bytes of user
+    /// data and growable up to `max_size` bytes.
+    ///
+    /// The handle is meant to be inherited over fork(): create it first, then fork the
+    /// processes that need to communicate through it.
+    ///
+    /// Dropping the ShmemHandle unmaps the memory from the current process only; other
+    /// processes that hold the mapping can keep using it.
+    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
+        // The anonymous backing file comes first; the mapping is then set up on top of its fd.
+        let backing_fd = create_backing_file(name)?;
+        Self::new_with_fd(backing_fd, initial_size, max_size)
+    }
+
+    /// Map the backing file `fd` into memory and initialize the header at its start.
+    ///
+    /// `initial_size` and `max_size` are the user-visible sizes in bytes; the header size is
+    /// added to both internally.
+    ///
+    /// Returns an error if mmap() or the initial file enlargement fails. In the latter case the
+    /// mapping is released again before returning, so nothing is leaked.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `max_size` is 2^48 or larger, or if `initial_size` exceeds `max_size`.
+    fn new_with_fd(
+        fd: OwnedFd,
+        initial_size: usize,
+        max_size: usize,
+    ) -> Result<ShmemHandle, Error> {
+        // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
+        // is a little larger than this because of the SharedStruct header. Make the upper limit
+        // somewhat smaller than that, because with anything close to that, you'll run out of
+        // memory anyway.
+        if max_size >= 1 << 48 {
+            panic!("max size {max_size} too large");
+        }
+        if initial_size > max_size {
+            panic!("initial size {initial_size} larger than max size {max_size}");
+        }
+
+        // The actual initial / max size is the one given by the caller, plus the size of
+        // 'SharedStruct'.
+        let initial_size = HEADER_SIZE + initial_size;
+        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
+
+        // Reserve address space for it with mmap
+        //
+        // TODO: Use MAP_HUGETLB if possible
+        let start_ptr = unsafe {
+            nix_mmap(
+                None,
+                max_size,
+                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
+                MapFlags::MAP_SHARED,
+                &fd,
+                0,
+            )
+        }
+        // The errno is appended by Error's Display impl; the message is not a format string.
+        .map_err(|e| Error::new("mmap failed", e))?;
+
+        // Reserve space for the initial size. If that fails, undo the mapping: no ShmemHandle
+        // exists yet, so nothing else would ever unmap it.
+        if let Err(e) = enlarge_file(fd.as_fd(), initial_size as u64) {
+            // SAFETY: 'start_ptr' was just returned by mmap() for exactly 'max_size' bytes and
+            // has not been handed out to anyone.
+            let _ = unsafe { nix_munmap(start_ptr, max_size.get()) };
+            return Err(e);
+        }
+
+        // Initialize the header
+        let shared: NonNull<SharedStruct> = start_ptr.cast();
+        unsafe {
+            shared.write(SharedStruct {
+                max_size: max_size.into(),
+                current_size: AtomicUsize::new(initial_size),
+            })
+        };
+
+        // The user data begins after the header
+        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
+
+        Ok(ShmemHandle {
+            fd,
+            max_size: max_size.into(),
+            shared_ptr: shared,
+            data_ptr,
+        })
+    }
+
+    /// Return a reference to the header stored at the beginning of the mapping.
+    fn shared(&self) -> &SharedStruct {
+        // SAFETY: 'shared_ptr' points at the SharedStruct written when the area was created, and
+        // the mapping is only released when the ShmemHandle is dropped.
+        unsafe { self.shared_ptr.as_ref() }
+    }
+
+    /// Resize the shared memory area. 'new_size' is the new user-visible size in bytes and must
+    /// not be larger than the 'max_size' specified when creating the area.
+    ///
+    /// Only one process/thread may resize the area at a time. While a resize is running, the
+    /// RESIZE_IN_PROGRESS bit is set in the shared header; a second caller that observes the bit
+    /// gets an `Error` instead of racing with the first one.
+    ///
+    /// If the underlying ftruncate/posix_fallocate call fails, the recorded size is left
+    /// unchanged and the error is returned.
+    ///
+    /// # Panics
+    ///
+    /// Panics if 'new_size' (plus the header) exceeds the maximum size.
+    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
+        // Saturate instead of wrapping: an absurdly large request must trip the max-size check
+        // below rather than wrap around to a tiny size in release builds.
+        let new_size = new_size.saturating_add(HEADER_SIZE);
+        let shared = self.shared();
+
+        if new_size > self.max_size {
+            panic!(
+                "new size ({}) is greater than max size ({})",
+                new_size, self.max_size
+            );
+        }
+        assert_eq!(self.max_size, shared.max_size);
+
+        // Lock the area by setting the bit in 'current_size'
+        //
+        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
+        // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
+        // since this is not performance-critical, better safe than sorry.
+        let mut old_size = shared.current_size.load(Ordering::Acquire);
+        loop {
+            if (old_size & RESIZE_IN_PROGRESS) != 0 {
+                return Err(Error::new(
+                    "concurrent resize detected",
+                    Errno::UnknownErrno,
+                ));
+            }
+            // Keep the old size visible to readers while the resize runs, and only add the
+            // flag. (Storing 'new_size' here would never set the flag, so concurrent resizes
+            // would go undetected, and readers would see the new size before the file has it.)
+            match shared.current_size.compare_exchange(
+                old_size,
+                old_size | RESIZE_IN_PROGRESS,
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => break,
+                Err(x) => old_size = x,
+            }
+        }
+
+        // Ok, we got the lock.
+        //
+        // NB: If anything goes wrong, we *must* clear the bit!
+        let result = {
+            use std::cmp::Ordering::{Equal, Greater, Less};
+            match new_size.cmp(&old_size) {
+                // The errno is appended by Error's Display impl; the message is not a format
+                // string.
+                Less => nix_ftruncate(&self.fd, new_size as i64)
+                    .map_err(|e| Error::new("could not shrink shmem segment, ftruncate failed", e)),
+                Equal => Ok(()),
+                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
+            }
+        };
+
+        // Unlock: publish the new size on success, restore the old one on failure. Either value
+        // has the flag bit clear.
+        shared.current_size.store(
+            if result.is_ok() { new_size } else { old_size },
+            Ordering::Release,
+        );
+
+        result
+    }
+
+    /// Current size of the user-visible part of the segment in bytes, i.e. excluding the header.
+    ///
+    /// NOTE: this is only a snapshot. A concurrent set_size() call can change the size at any
+    /// time, and it stays the caller's responsibility not to touch memory beyond the current
+    /// size.
+    pub fn current_size(&self) -> usize {
+        let raw = self.shared().current_size.load(Ordering::Relaxed);
+        // Strip the flag bit first, then the header that is included in the stored size.
+        let with_header = raw & !RESIZE_IN_PROGRESS;
+        with_header - HEADER_SIZE
+    }
+}
+
+/// Unmaps the area from the current process when the handle goes away.
+impl Drop for ShmemHandle {
+    fn drop(&mut self) {
+        // SAFETY: The pointer was obtained from mmap() with the given size.
+        // We unmap the entire region. A failure is deliberately ignored: drop() has no way to
+        // report it.
+        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
+        // The fd is dropped automatically by OwnedFd.
+    }
+}
+
+/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
+/// anonymous in-memory file. On macOS, fall back to a regular temporary file. That's good enough
+/// for development and testing, but in production we want the file to stay in memory.
+///
+/// `name` is passed to memfd_create() and is ignored on macOS.
+///
+/// Returns the owned file descriptor, or an `Error` carrying the errno of the failed call.
+///
+/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
+#[allow(unused_variables)]
+fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
+    // NB: the messages below are plain strings, not format templates. The errno is appended by
+    // Error's Display impl.
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
+            .map_err(|e| Error::new("memfd_create failed", e))
+    }
+    #[cfg(target_os = "macos")]
+    {
+        let file = tempfile::tempfile().map_err(|e| {
+            Error::new(
+                "could not create temporary file to back shmem area",
+                // An io::Error without an OS error code is reported as errno 0.
+                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
+            )
+        })?;
+        Ok(OwnedFd::from(file))
+    }
+}
+
+/// Grow the backing file `fd` so that it is at least `size` bytes long.
+///
+/// Returns an `Error` carrying the errno of the failed system call.
+fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
+    // NB: the messages below are plain strings, not format templates. The errno is appended by
+    // Error's Display impl.
+
+    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
+    // we don't get a segfault later when trying to actually use it.
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::fcntl::posix_fallocate(fd, 0, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, posix_fallocate failed", e))
+    }
+    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'ftruncate'. Unlike
+    // posix_fallocate, that only changes the file length.
+    #[cfg(target_os = "macos")]
+    {
+        nix::unistd::ftruncate(fd, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed", e))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use nix::unistd::ForkResult;
+    use std::ops::Range;
+
+    /// Panic unless every byte at `ptr + i`, for each `i` in `range`, equals `expected`.
+    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
+        range.for_each(|i| {
+            let b = unsafe { ptr.add(i).read() };
+            assert_eq!(expected, b, "unexpected byte at offset {i}");
+        });
+    }
+
+    /// Fill the bytes at offsets `range` from `ptr` with the value 'b'.
+    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
+        let Range { start, end } = range;
+        let len = end - start;
+        unsafe { ptr.add(start).write_bytes(b, len) };
+    }
+
+    // Single-process smoke test: grow, shrink and re-grow one area.
+    #[test]
+    fn test_shmem_resize() -> Result<(), Error> {
+        let max_size = 1024 * 1024;
+        let handle = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
+        assert_eq!(handle.current_size(), 0);
+
+        // Grow from empty, then fill the whole area with a marker byte
+        let grown = 10000;
+        handle.set_size(grown).unwrap();
+        assert_eq!(handle.current_size(), grown);
+
+        let data = handle.data_ptr.as_ptr();
+        write_range(data, 0xAA, 0..grown);
+        assert_range(data, 0xAA, 0..grown);
+
+        // Shrink to half of that ...
+        let shrunk = 5000;
+        handle.set_size(shrunk).unwrap();
+        assert_eq!(handle.current_size(), shrunk);
+
+        // ... and grow past the first size
+        let regrown = 20000;
+        handle.set_size(regrown).unwrap();
+        assert_eq!(handle.current_size(), regrown);
+
+        // The part that survived the shrink keeps its contents; the part that was cut off and
+        // re-added should read as all zeros now
+        assert_range(data, 0xAA, 0..shrunk);
+        assert_range(data, 0, shrunk..grown);
+
+        // Try to grow beyond max_size
+        //let too_large = max_size + 1;
+        //assert!(handle.set_size(too_large).is_err());
+
+        // Dropping the handle should unmap the memory
+        drop(handle);
+
+        Ok(())
+    }
+
+    /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
+    /// but is stored in the shared memory area and works across processes. It's implemented by
+    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
+    ///
+    /// The barrier is reusable: every `num_procs` arrivals form one "generation".
+    struct SimpleBarrier {
+        /// Number of participants that must arrive before anyone is released.
+        num_procs: usize,
+        /// Total number of arrivals so far, over all generations.
+        count: AtomicUsize,
+    }
+
+    impl SimpleBarrier {
+        /// Initialize a barrier for `num_procs` participants in place.
+        ///
+        /// # Safety
+        ///
+        /// `ptr` must be valid for writes and properly aligned for `SimpleBarrier`. The memory
+        /// may be uninitialized.
+        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
+            // Use write() rather than assignment: assignment would first treat the (possibly
+            // uninitialized) old contents as a valid value.
+            unsafe {
+                ptr.write(SimpleBarrier {
+                    num_procs,
+                    count: AtomicUsize::new(0),
+                })
+            }
+        }
+
+        /// Block (by polling) until all `num_procs` participants of the current generation have
+        /// called `wait`.
+        pub fn wait(&self) {
+            // Release our earlier writes to the shared area to whoever observes this arrival,
+            // and acquire the writes of those who arrived before us. Relaxed atomics would not
+            // order the surrounding plain memory accesses.
+            let old = self.count.fetch_add(1, Ordering::AcqRel);
+
+            let generation = old / self.num_procs;
+
+            let mut current = old + 1;
+            while current < (generation + 1) * self.num_procs {
+                std::thread::sleep(std::time::Duration::from_millis(10));
+                current = self.count.load(Ordering::Acquire);
+            }
+        }
+    }
+
+    // Two-process test: the area is created, the process forks, and parent and child then
+    // take turns writing to and resizing the shared area, synchronizing with a SimpleBarrier
+    // that lives in the area itself.
+    #[test]
+    fn test_multi_process() {
+        // Initialize
+        let max_size = 1_000_000_000_000;
+        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
+        let ptr = init_struct.data_ptr.as_ptr();
+
+        // Store the SimpleBarrier in the first 1k of the area.
+        init_struct.set_size(10000).unwrap();
+        let barrier_ptr: *mut SimpleBarrier = unsafe {
+            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
+                .cast()
+        };
+        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
+        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
+
+        // Fork another test process. The code after this runs in both processes concurrently.
+        let fork_result = unsafe { nix::unistd::fork().unwrap() };
+
+        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, 1000..2000);
+        } else {
+            write_range(ptr, 0xBB, 2000..3000);
+        }
+        barrier.wait();
+        // Verify the contents. (in both processes)
+        assert_range(ptr, 0xAA, 1000..2000);
+        assert_range(ptr, 0xBB, 2000..3000);
+
+        // Grow, from the child this time
+        let size = 10_000_000;
+        if !fork_result.is_parent() {
+            init_struct.set_size(size).unwrap();
+        }
+        barrier.wait();
+
+        // make some writes at the end
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, (size - 10)..size);
+        } else {
+            write_range(ptr, 0xBB, (size - 20)..(size - 10));
+        }
+        barrier.wait();
+
+        // Verify the contents. (This runs in both processes)
+        assert_range(ptr, 0, (size - 1000)..(size - 20));
+        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
+        assert_range(ptr, 0xAA, (size - 10)..size);
+
+        // Reap the child in the parent.
+        // NOTE(review): the wait status returned by waitpid() is discarded, so a failed
+        // assertion in the child does not appear to fail this test - confirm whether that is
+        // intended.
+        if let ForkResult::Parent { child } = fork_result {
+            nix::sys::wait::waitpid(child, None).unwrap();
+        }
+    }
+}
--- a/libs/neonart/Cargo.toml
+++ b/libs/neonart/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "neonart"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+crossbeam-utils.workspace = true
+spin.workspace = true
+tracing.workspace = true
+
+[dev-dependencies]
+rand = "0.9.1"
+rand_distr = "0.5.1"
--- a/libs/neonart/src/algorithm.rs
+++ b/libs/neonart/src/algorithm.rs
@@ -0,0 +1,599 @@
+mod lock_and_version;
+pub(crate) mod node_ptr;
+mod node_ref;
+
+use std::vec::Vec;
+
+use crate::algorithm::lock_and_version::ConcurrentUpdateError;
+use crate::algorithm::node_ptr::MAX_PREFIX_LEN;
+use crate::algorithm::node_ref::{NewNodeRef, NodeRef, ReadLockedNodeRef, WriteLockedNodeRef};
+use crate::allocator::OutOfMemoryError;
+
+use crate::TreeWriteGuard;
+use crate::UpdateAction;
+use crate::allocator::ArtAllocator;
+use crate::epoch::EpochPin;
+use crate::{Key, Value};
+
+pub(crate) type RootPtr<V> = node_ptr::NodePtr<V>;
+
+/// Reasons why a step of a tree operation could not be completed.
+#[derive(Debug)]
+pub enum ArtError {
+    /// A node was modified concurrently; the operation must be restarted from the root.
+    ConcurrentUpdate, // need to retry
+    /// The allocator could not provide memory for a new node.
+    OutOfMemory,
+}
+
+// These conversions let the '?' operator turn the lower-level error types into ArtError.
+impl From<ConcurrentUpdateError> for ArtError {
+    fn from(_: ConcurrentUpdateError) -> ArtError {
+        ArtError::ConcurrentUpdate
+    }
+}
+
+impl From<OutOfMemoryError> for ArtError {
+    fn from(_: OutOfMemoryError) -> ArtError {
+        ArtError::OutOfMemory
+    }
+}
+
+/// Allocate the root node of a new tree from `allocator`. This is a thin wrapper around
+/// `node_ptr::new_root`.
+///
+/// Returns `OutOfMemoryError` if the allocation fails.
+pub fn new_root<V: Value>(
+    allocator: &impl ArtAllocator<V>,
+) -> Result<RootPtr<V>, OutOfMemoryError> {
+    node_ptr::new_root(allocator)
+}
+
+/// Look up `key` in the tree rooted at `root`.
+///
+/// A lookup attempt that is disturbed by a concurrent update is simply repeated from the root
+/// until one succeeds. The returned reference is tied to the lifetime of `epoch_pin`.
+pub(crate) fn search<'e, K: Key, V: Value>(
+    key: &K,
+    root: RootPtr<V>,
+    epoch_pin: &'e EpochPin,
+) -> Option<&'e V> {
+    loop {
+        let start = NodeRef::from_root_ptr(root);
+        match lookup_recurse(key.as_bytes(), start, None, epoch_pin) {
+            Ok(found) => return found,
+            // disturbed by a concurrent update: retry
+            Err(_) => continue,
+        }
+    }
+}
+
+/// Search the tree rooted at `root` with `key` as the lower bound, using `next_recurse`.
+///
+/// On success, returns the full key of the entry that was found (reconstructed from the node
+/// prefixes and key bytes along the path) together with a reference to its value. Returns
+/// `None` if `next_recurse` finds nothing. The attempt is repeated from the root whenever it
+/// is disturbed by a concurrent update.
+pub(crate) fn iter_next<'e, V: Value>(
+    key: &[u8],
+    root: RootPtr<V>,
+    epoch_pin: &'e EpochPin,
+) -> Option<(Vec<u8>, &'e V)> {
+    loop {
+        // Start every attempt with an empty path; a failed attempt may have left a partial one.
+        let mut path = Vec::new();
+        let root_ref = NodeRef::from_root_ptr(root);
+
+        match next_recurse(key, &mut path, root_ref, epoch_pin) {
+            Ok(Some(v)) => {
+                // The found key must be as long as the search key.
+                // (presumably all keys in a tree have the same length - TODO confirm)
+                assert_eq!(path.len(), key.len());
+                break Some((path, v));
+            }
+            Ok(None) => break None,
+            Err(ConcurrentUpdateError()) => {
+                // retry
+                continue;
+            }
+        }
+    }
+}
+
+/// Insert, remove or inspect the entry for `key`, as decided by `value_fn`.
+///
+/// `value_fn` receives the current value (or `None` if the key is not present) and returns the
+/// `UpdateAction` to perform. The operation is retried from the root whenever it is disturbed
+/// by a concurrent update. An out-of-memory condition is returned to the caller.
+pub(crate) fn update_fn<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>, F>(
+    key: &K,
+    value_fn: F,
+    root: RootPtr<V>,
+    guard: &'g mut TreeWriteGuard<'e, K, V, A>,
+) -> Result<(), OutOfMemoryError>
+where
+    F: FnOnce(Option<&V>) -> UpdateAction<V>,
+{
+    // 'value_fn' is FnOnce, but a fresh closure is needed for every attempt. Park it in a Cell,
+    // and let each attempt's closure take it out when (and if) it is actually invoked.
+    // NB: this panics if 'value_fn' were invoked in one attempt and again in a retry; it relies
+    // on update_recurse not reporting ConcurrentUpdate after it has called the closure.
+    let value_fn_cell = std::cell::Cell::new(Some(value_fn));
+    loop {
+        let root_ref = NodeRef::from_root_ptr(root);
+        let this_value_fn = |arg: Option<&V>| value_fn_cell.take().unwrap()(arg);
+        let key_bytes = key.as_bytes();
+
+        match update_recurse(
+            key_bytes,
+            this_value_fn,
+            root_ref,
+            None,
+            None,
+            guard,
+            0,
+            key_bytes,
+        ) {
+            Ok(()) => break Ok(()),
+            Err(ArtError::ConcurrentUpdate) => {
+                continue; // retry
+            }
+            Err(ArtError::OutOfMemory) => break Err(OutOfMemoryError()),
+        }
+    }
+}
+
+// Look up 'key' in the subtree below 'node'. 'key' is the part of the search key that has not
+// been consumed by the levels above, and 'parent' is the read-locked parent of 'node', if any.
+//
+// Returns Ok(Some(value)) if the key was found and Ok(None) if it is not in the tree.
+// Error means you must retry.
+//
+// This corresponds to the 'lookupOpt' function in the paper
+#[allow(clippy::only_used_in_recursion)]
+fn lookup_recurse<'e, V: Value>(
+    key: &[u8],
+    node: NodeRef<'e, V>,
+    parent: Option<ReadLockedNodeRef<V>>,
+    epoch_pin: &'e EpochPin,
+) -> Result<Option<&'e V>, ConcurrentUpdateError> {
+    // Lock coupling: take the read lock on this node first, and only then validate and release
+    // the parent.
+    let rnode = node.read_lock_or_restart()?;
+    if let Some(parent) = parent {
+        parent.read_unlock_or_restart()?;
+    }
+
+    // check if the prefix matches, may increment level
+    let prefix_len = if let Some(prefix_len) = rnode.prefix_matches(key) {
+        prefix_len
+    } else {
+        rnode.read_unlock_or_restart()?;
+        return Ok(None);
+    };
+
+    if rnode.is_leaf() {
+        // In a leaf, the prefix must cover all that is left of the key.
+        assert_eq!(key.len(), prefix_len);
+        let vptr = rnode.get_leaf_value_ptr()?;
+        // safety: It's OK to return a ref of the pointer because we checked the version
+        // and the lifetime of 'epoch_pin' enforces that the reference is only accessible
+        // as long as the epoch is pinned.
+        let v = unsafe { vptr.as_ref().unwrap() };
+        return Ok(Some(v));
+    }
+
+    // Skip the part of the key that was matched by this node's prefix
+    let key = &key[prefix_len..];
+
+    // find child (or leaf value)
+    let next_node = rnode.find_child_or_restart(key[0])?;
+
+    match next_node {
+        None => Ok(None), // key not found
+        // The byte used to select the child is consumed too
+        Some(child) => lookup_recurse(&key[1..], child, Some(rnode), epoch_pin),
+    }
+}
+
+// Search the subtree below 'node' for a leaf, using 'min_key' as the lower bound.
+//
+// 'path' holds the key bytes on the way from the root to 'node'; this function appends to it
+// as it descends. When a value is returned, 'path' holds the full key of that value.
+//
+// Returns Ok(None) if nothing suitable is found in this subtree. Error means you must retry.
+#[allow(clippy::only_used_in_recursion)]
+fn next_recurse<'e, V: Value>(
+    min_key: &[u8],
+    path: &mut Vec<u8>,
+    node: NodeRef<'e, V>,
+    epoch_pin: &'e EpochPin,
+) -> Result<Option<&'e V>, ConcurrentUpdateError> {
+    let rnode = node.read_lock_or_restart()?;
+    let prefix = rnode.get_prefix();
+    if !prefix.is_empty() {
+        path.extend_from_slice(prefix);
+    }
+
+    // Compare the path so far against the equally long head of 'min_key'. If it sorts before
+    // that, this node is entirely below the lower bound.
+    use std::cmp::Ordering;
+    let comparison = path.as_slice().cmp(&min_key[0..path.len()]);
+    if comparison == Ordering::Less {
+        rnode.read_unlock_or_restart()?;
+        return Ok(None);
+    }
+
+    if rnode.is_leaf() {
+        assert_eq!(path.len(), min_key.len());
+        let vptr = rnode.get_leaf_value_ptr()?;
+        // safety: It's OK to return a ref of the pointer because we checked the version
+        // and the lifetime of 'epoch_pin' enforces that the reference is only accessible
+        // as long as the epoch is pinned.
+        let v = unsafe { vptr.as_ref().unwrap() };
+        return Ok(Some(v));
+    }
+
+    // Pick the smallest child key byte that is worth visiting: if the path equals the head of
+    // 'min_key', the next byte of 'min_key' is the bound; if the path is already greater, every
+    // child qualifies.
+    let mut min_key_byte = match comparison {
+        Ordering::Less => unreachable!(), // checked this above already
+        Ordering::Equal => min_key[path.len()],
+        Ordering::Greater => 0,
+    };
+
+    // Try the children one by one, starting from 'min_key_byte', until one of them yields a
+    // value.
+    loop {
+        match rnode.find_next_child_or_restart(min_key_byte)? {
+            None => {
+                return Ok(None);
+            }
+            Some((key_byte, child_ref)) => {
+                let path_len = path.len();
+                path.push(key_byte);
+                let result = next_recurse(min_key, path, child_ref, epoch_pin)?;
+                if result.is_some() {
+                    return Ok(result);
+                }
+                // There is no key byte after u8::MAX to continue with (and 'key_byte + 1'
+                // below would overflow).
+                if key_byte == u8::MAX {
+                    return Ok(None);
+                }
+                // Undo whatever the failed descent appended to the path
+                path.truncate(path_len);
+                min_key_byte = key_byte + 1;
+            }
+        }
+    }
+}
+
+// Descend from 'node' towards the entry for 'key' and apply the action chosen by 'value_fn'.
+//
+// 'key' is the part of the key that has not been consumed by the levels above. 'rparent' and
+// 'rgrandparent' are the read-locked parent and grandparent of 'node', each paired with the
+// key byte that leads from it to its child on this path. 'level' and 'orig_key' are only
+// passed down the recursion.
+//
+// 'value_fn' is called exactly once, with the existing value or None, after the locks needed
+// for the modification have been acquired.
+//
+// Returns ArtError::ConcurrentUpdate if the caller must retry from the root.
+//
+// This corresponds to the 'insertOpt' function in the paper
+#[allow(clippy::only_used_in_recursion)]
+#[allow(clippy::too_many_arguments)]
+pub(crate) fn update_recurse<'e, K: Key, V: Value, A: ArtAllocator<V>, F>(
+    key: &[u8],
+    value_fn: F,
+    node: NodeRef<'e, V>,
+    rparent: Option<(ReadLockedNodeRef<V>, u8)>,
+    rgrandparent: Option<(ReadLockedNodeRef<V>, u8)>,
+    guard: &'_ mut TreeWriteGuard<'e, K, V, A>,
+    level: usize,
+    orig_key: &[u8],
+) -> Result<(), ArtError>
+where
+    F: FnOnce(Option<&V>) -> UpdateAction<V>,
+{
+    let rnode = node.read_lock_or_restart()?;
+
+    // Case 1: the key diverges from this node's prefix. The key is not in the tree; an insert
+    // has to split the prefix.
+    let prefix_match_len = rnode.prefix_matches(key);
+    if prefix_match_len.is_none() {
+        let (rparent, parent_key) = rparent.expect("direct children of the root have no prefix");
+        let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
+        let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
+
+        match value_fn(None) {
+            UpdateAction::Nothing => {}
+            UpdateAction::Insert(new_value) => {
+                insert_split_prefix(key, new_value, &mut wnode, &mut wparent, parent_key, guard)?;
+            }
+            UpdateAction::Remove => {
+                panic!("unexpected Remove action on insertion");
+            }
+        }
+        wnode.write_unlock();
+        wparent.write_unlock();
+        return Ok(());
+    }
+    let prefix_match_len = prefix_match_len.unwrap();
+    let key = &key[prefix_match_len..];
+    let level = level + prefix_match_len;
+
+    // Case 2: we reached the leaf for this key. The value exists; it can be inspected or
+    // removed.
+    if rnode.is_leaf() {
+        assert_eq!(key.len(), 0);
+        let (rparent, parent_key) = rparent.expect("root cannot be leaf");
+        let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
+        let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
+
+        // safety: Now that we have acquired the write lock, we have exclusive access to the
+        // value. XXX: There might be concurrent reads though?
+        let value_mut = wnode.get_leaf_value_mut();
+
+        match value_fn(Some(value_mut)) {
+            UpdateAction::Nothing => {
+                wparent.write_unlock();
+                wnode.write_unlock();
+            }
+            UpdateAction::Insert(_) => panic!("cannot insert over existing value"),
+            UpdateAction::Remove => {
+                guard.remember_obsolete_node(wnode.as_ptr());
+                wparent.delete_child(parent_key);
+                wnode.write_unlock_obsolete();
+
+                // NOTE(review): when there is no grandparent, 'wparent' is not explicitly
+                // unlocked on this path; presumably dropping it releases the lock - confirm.
+                if let Some(rgrandparent) = rgrandparent {
+                    // FIXME: Ignore concurrency error. It doesn't lead to
+                    // corruption, but it means we might leak something. Until
+                    // another update cleans it up.
+                    let _ = cleanup_parent(wparent, rgrandparent, guard);
+                }
+            }
+        }
+
+        return Ok(());
+    }
+
+    let next_node = rnode.find_child_or_restart(key[0])?;
+
+    if next_node.is_none() {
+        // Case 3: this internal node has no child for the next key byte. The key is not in the
+        // tree; an insert adds a new child here.
+        if rnode.is_full() {
+            // No room in this node: it must be replaced with a bigger one, which requires
+            // updating the pointer in the parent too.
+            let (rparent, parent_key) = rparent.expect("root node cannot become full");
+            let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
+            let wnode = rnode.upgrade_to_write_lock_or_restart()?;
+
+            match value_fn(None) {
+                UpdateAction::Nothing => {
+                    wnode.write_unlock();
+                    wparent.write_unlock();
+                }
+                UpdateAction::Insert(new_value) => {
+                    insert_and_grow(key, new_value, wnode, &mut wparent, parent_key, guard)?;
+                    wparent.write_unlock();
+                }
+                UpdateAction::Remove => {
+                    panic!("unexpected Remove action on insertion");
+                }
+            };
+        } else {
+            // There is room in this node: only the node itself needs to be write-locked. The
+            // parent is validated and released.
+            let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
+            if let Some((rparent, _)) = rparent {
+                rparent.read_unlock_or_restart()?;
+            }
+            match value_fn(None) {
+                UpdateAction::Nothing => {}
+                UpdateAction::Insert(new_value) => {
+                    insert_to_node(&mut wnode, key, new_value, guard)?;
+                }
+                UpdateAction::Remove => {
+                    panic!("unexpected Remove action on insertion");
+                }
+            };
+            wnode.write_unlock();
+        }
+        Ok(())
+    } else {
+        // Case 4: there is a child for the next key byte; descend into it.
+        let next_child = next_node.unwrap(); // checked above it's not None
+        if let Some((ref rparent, _)) = rparent {
+            rparent.check_or_restart()?;
+        }
+
+        // recurse to next level. This node becomes the parent, and the parent becomes the
+        // grandparent.
+        update_recurse(
+            &key[1..],
+            value_fn,
+            next_child,
+            Some((rnode, key[0])),
+            rparent,
+            guard,
+            level + 1,
+            orig_key,
+        )
+    }
+}
+
+/// One step on the way from the root to a node, as shown in tree dumps.
+#[derive(Clone)]
+enum PathElement {
+    /// The prefix bytes stored in a node
+    Prefix(Vec<u8>),
+    /// The key byte that selects a child
+    KeyByte(u8),
+}
+
+/// Compact output for dumps: a prefix prints as a byte list, a key byte as a plain number.
+impl std::fmt::Debug for PathElement {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match *self {
+            Self::KeyByte(key_byte) => write!(fmt, "{key_byte}"),
+            Self::Prefix(ref prefix) => write!(fmt, "{prefix:?}"),
+        }
+    }
+}
+
+/// Write a human-readable dump of the tree rooted at `root` to `dst`.
+///
+/// This is best-effort: if the dump is interrupted by a concurrent update, the error is
+/// discarded and the output is simply left incomplete.
+pub(crate) fn dump_tree<V: Value + std::fmt::Debug>(
+    root: RootPtr<V>,
+    epoch_pin: &'_ EpochPin,
+    dst: &mut dyn std::io::Write,
+) {
+    let root_ref = NodeRef::from_root_ptr(root);
+
+    let _ = dump_recurse(&[], root_ref, epoch_pin, 0, dst);
+}
+
+// Dump the subtree below 'node' to 'dst'. 'path' describes the way from the root to 'node',
+// and 'level' is the depth, which is used for indentation.
+//
+// Returns an error if a concurrent update was detected. Panics if writing to 'dst' fails.
+//
+// TODO: return an Err if writeln!() returns error, instead of unwrapping
+#[allow(clippy::only_used_in_recursion)]
+fn dump_recurse<'e, V: Value + std::fmt::Debug>(
+    path: &[PathElement],
+    node: NodeRef<'e, V>,
+    epoch_pin: &'e EpochPin,
+    level: usize,
+    dst: &mut dyn std::io::Write,
+) -> Result<(), ConcurrentUpdateError> {
+    // One space of indentation per level
+    let indent = str::repeat(" ", level);
+
+    let rnode = node.read_lock_or_restart()?;
+    // Own copy of the path, extended with this node's prefix
+    let mut path = Vec::from(path);
+    let prefix = rnode.get_prefix();
+    if !prefix.is_empty() {
+        path.push(PathElement::Prefix(Vec::from(prefix)));
+    }
+
+    if rnode.is_leaf() {
+        let vptr = rnode.get_leaf_value_ptr()?;
+        // safety: It's OK to return a ref of the pointer because we checked the version
+        // and the lifetime of 'epoch_pin' enforces that the reference is only accessible
+        // as long as the epoch is pinned.
+        let val = unsafe { vptr.as_ref().unwrap() };
+        writeln!(dst, "{indent} {path:?}: {val:?}").unwrap();
+        return Ok(());
+    }
+
+    // Internal node: probe every possible key byte, and print and descend into each child that
+    // exists.
+    for key_byte in 0..=u8::MAX {
+        match rnode.find_child_or_restart(key_byte)? {
+            None => continue,
+            Some(child_ref) => {
+                // Read-lock the child just to print its prefix
+                let rchild = child_ref.read_lock_or_restart()?;
+                writeln!(
+                    dst,
+                    "{} {:?}, {}: prefix {:?}",
+                    indent,
+                    &path,
+                    key_byte,
+                    rchild.get_prefix()
+                )
+                .unwrap();
+
+                let mut child_path = path.clone();
+                child_path.push(PathElement::KeyByte(key_byte));
+
+                dump_recurse(&child_path, child_ref, epoch_pin, level + 1, dst)?;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Insert `value` under `key` when `key` diverges from the prefix of `node`.
+///
+/// A new internal node holding the common part of the prefix is put in place of `node` in
+/// `parent`. The old node (with a shortened prefix) and a new node for `value` become its two
+/// children.
+///
+/// For example, inserting "foolse" next to an existing "foobar":
+///
+///```text
+///        [fooba]r -> value
+///
+/// [foo]b -> [a]r  -> value
+///      e -> [ls]e -> value
+///```
+///
+/// On entry, `node` and `parent` are write-locked by the caller. `parent_key` is the key byte
+/// that leads from `parent` to `node`.
+fn insert_split_prefix<K: Key, V: Value, A: ArtAllocator<V>>(
+    key: &[u8],
+    value: V,
+    node: &mut WriteLockedNodeRef<V>,
+    parent: &mut WriteLockedNodeRef<V>,
+    parent_key: u8,
+    guard: &'_ TreeWriteGuard<K, V, A>,
+) -> Result<(), OutOfMemoryError> {
+    let old_node = node;
+    let old_prefix = old_node.get_prefix();
+    let common_prefix_len = common_prefix(key, old_prefix);
+
+    // Allocate a node for the new value. The common prefix and the byte following it are
+    // represented by the new internal node, so they are skipped here.
+    let new_value_node = allocate_node_for_value(
+        &key[common_prefix_len + 1..],
+        value,
+        guard.tree_writer.allocator,
+    )?;
+
+    // Allocate a new internal node with the common prefix
+    // FIXME: deallocate 'new_value_node' on OOM
+    let mut prefix_node =
+        node_ref::new_internal(&key[..common_prefix_len], guard.tree_writer.allocator)?;
+
+    // Add the old node and the new nodes to the new internal node. The first byte on which the
+    // old prefix and the key differ selects between them.
+    prefix_node.insert_old_child(old_prefix[common_prefix_len], old_node);
+    prefix_node.insert_new_child(key[common_prefix_len], new_value_node);
+
+    // Modify the prefix of the old child in place. The argument is the number of prefix bytes
+    // left over after the common part and the selecting byte.
+    // (presumably truncate_prefix() keeps the tail of the prefix - TODO confirm)
+    old_node.truncate_prefix(old_prefix.len() - common_prefix_len - 1);
+
+    // replace the pointer in the parent
+    parent.replace_child(parent_key, prefix_node.into_ptr());
+
+    Ok(())
+}
+
+// Insert 'value' as a new child of 'wnode'. key[0] is the key byte that selects the new child,
+// and the rest of 'key' is stored in the newly allocated node(s).
+//
+// On entry: 'wnode' is locked
+fn insert_to_node<K: Key, V: Value, A: ArtAllocator<V>>(
+    wnode: &mut WriteLockedNodeRef<V>,
+    key: &[u8],
+    value: V,
+    guard: &'_ TreeWriteGuard<K, V, A>,
+) -> Result<(), OutOfMemoryError> {
+    let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
+    wnode.insert_child(key[0], value_child.into_ptr());
+    Ok(())
+}
+
+// Insert 'value' into a node that is already full: a bigger copy of the node is created, the
+// new child is added to the copy, and the copy takes the old node's place in 'parent'. The old
+// node is marked obsolete.
+//
+// key[0] is the key byte that selects the new child, and 'parent_key_byte' is the key byte
+// that leads from 'parent' to 'wnode'.
+//
+// On entry: 'parent' and 'node' are locked
+fn insert_and_grow<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
+    key: &[u8],
+    value: V,
+    wnode: WriteLockedNodeRef<V>,
+    parent: &mut WriteLockedNodeRef<V>,
+    parent_key_byte: u8,
+    guard: &'g mut TreeWriteGuard<'e, K, V, A>,
+) -> Result<(), ArtError> {
+    let mut bigger_node = wnode.grow(guard.tree_writer.allocator)?;
+
+    // FIXME: deallocate 'bigger_node' on OOM
+    let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
+    bigger_node.insert_new_child(key[0], value_child);
+
+    // Replace the pointer in the parent
+    parent.replace_child(parent_key_byte, bigger_node.into_ptr());
+
+    // The old node is no longer reachable from the tree. Hand it to the guard and mark it
+    // obsolete while releasing the lock.
+    guard.remember_obsolete_node(wnode.as_ptr());
+    wnode.write_unlock_obsolete();
+
+    Ok(())
+}
+
+// Tidy up 'wparent' after one of its children has been removed: either merge it with its only
+// remaining child, or replace it with a smaller node type, if possible.
+//
+// 'rgrandparent' is the read-locked parent of 'wparent', paired with the key byte that leads
+// from it to 'wparent'.
+//
+// On entry: 'wparent' is write-locked. Returns an error if a lock could not be acquired
+// because of a concurrent update, or if allocating the smaller node failed.
+fn cleanup_parent<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
+    wparent: WriteLockedNodeRef<V>,
+    rgrandparent: (ReadLockedNodeRef<V>, u8),
+    guard: &'g mut TreeWriteGuard<'e, K, V, A>,
+) -> Result<(), ArtError> {
+    let (rgrandparent, grandparent_key_byte) = rgrandparent;
+
+    // If the parent becomes completely empty after the deletion, remove the parent from the
+    // grandparent. (This case is possible because we reserve only 8 bytes for the prefix.)
+    // TODO: not implemented.
+
+    // If the parent has only one child, replace the parent with the remaining child. (This is not
+    // possible if the child's prefix field cannot absorb the parent's)
+    if wparent.num_children() == 1 {
+        // Try to lock the remaining child. This can fail if the child is updated
+        // concurrently.
+        let (key_byte, remaining_child) = wparent.find_remaining_child();
+
+        let mut wremaining_child = remaining_child.write_lock_or_restart()?;
+
+        // The merged prefix consists of the parent's prefix, the key byte that selected the
+        // child, and the child's own prefix.
+        if 1 + wremaining_child.get_prefix().len() + wparent.get_prefix().len() <= MAX_PREFIX_LEN {
+            let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
+
+            // Ok, we have locked the parent, the grandparent, and the parent's only
+            // remaining child. Proceed with the updates.
+
+            // Update the prefix on the remaining child
+            wremaining_child.prepend_prefix(wparent.get_prefix(), key_byte);
+
+            // Replace the pointer in the grandparent to point directly to the remaining child
+            wgrandparent.replace_child(grandparent_key_byte, wremaining_child.as_ptr());
+
+            // Mark the parent as deleted.
+            // NOTE(review): 'wremaining_child' and 'wgrandparent' are not explicitly unlocked
+            // here; presumably dropping them releases the locks - confirm.
+            guard.remember_obsolete_node(wparent.as_ptr());
+            wparent.write_unlock_obsolete();
+            return Ok(());
+        }
+    }
+
+    // If the parent's children would fit on a smaller node type after the deletion, replace it with
+    // a smaller node.
+    if wparent.can_shrink() {
+        let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
+        let smaller_node = wparent.shrink(guard.tree_writer.allocator)?;
+
+        // Replace the pointer in the grandparent
+        wgrandparent.replace_child(grandparent_key_byte, smaller_node.into_ptr());
+
+        guard.remember_obsolete_node(wparent.as_ptr());
+        wparent.write_unlock_obsolete();
+        return Ok(());
+    }
+
+    // nothing to do
+    wparent.write_unlock();
+    Ok(())
+}
+
+// Allocate a new leaf node to hold 'value'. If the key is long, we
+// may need to allocate new internal nodes to hold it too
+//
+// 'key' is the part of the key that the new node(s) must represent. A single node can hold at
+// most MAX_PREFIX_LEN bytes of it as its prefix, so a longer key is laid out as a chain, built
+// from the leaf upwards: the leaf holds the tail of the key, and each internal node above it
+// holds up to MAX_PREFIX_LEN more bytes as its prefix, plus one key byte that selects its
+// single child.
+//
+// Returns the topmost node of the chain.
+fn allocate_node_for_value<'a, V: Value, A: ArtAllocator<V>>(
+    key: &[u8],
+    value: V,
+    allocator: &'a A,
+) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError> {
+    // The leaf takes the last (up to) MAX_PREFIX_LEN bytes of the key. 'prefix_off' is the
+    // number of leading key bytes that are not yet covered by any node.
+    let mut prefix_off = key.len().saturating_sub(MAX_PREFIX_LEN);
+
+    let leaf_node = node_ref::new_leaf(&key[prefix_off..key.len()], value, allocator)?;
+
+    let mut node = leaf_node;
+    while prefix_off > 0 {
+        // Need another internal node
+        let remain_prefix = &key[0..prefix_off];
+
+        // The last remaining byte becomes the key byte pointing to the node below, and up to
+        // MAX_PREFIX_LEN bytes before it become the prefix of the new internal node.
+        prefix_off = remain_prefix.len().saturating_sub(MAX_PREFIX_LEN + 1);
+        let mut internal_node = node_ref::new_internal(
+            &remain_prefix[prefix_off..remain_prefix.len() - 1],
+            allocator,
+        )?;
+        internal_node.insert_new_child(*remain_prefix.last().unwrap(), node);
+        node = internal_node;
+    }
+
+    Ok(node)
+}
+
+// Return the length of the common prefix of 'a' and 'b', i.e. the index of the first byte at
+// which they differ. At most the first MAX_PREFIX_LEN bytes are compared.
+//
+// Panics with "prefixes are equal" if no difference is found. That includes the case that one
+// slice is shorter than MAX_PREFIX_LEN and is a prefix of the other; the slices are never
+// indexed beyond their length.
+fn common_prefix(a: &[u8], b: &[u8]) -> usize {
+    a.iter()
+        .zip(b)
+        .take(MAX_PREFIX_LEN)
+        .position(|(x, y)| x != y)
+        .expect("prefixes are equal")
+}
--- a/libs/neonart/src/algorithm/lock_and_version.rs
+++ b/libs/neonart/src/algorithm/lock_and_version.rs
@@ -0,0 +1,117 @@
+//! Each node in the tree contains one atomic word that stores three things:
+//!
+//! Bit 0: set if the node is "obsolete". An obsolete node has been removed from the tree,
+//!        but might still be accessed by concurrent readers until the epoch expires.
+//! Bit 1: set if the node is currently write-locked. Used as a spinlock.
+//! Bits 2-63: Version number, incremented every time the node is modified.
+//!
+//! AtomicLockAndVersion represents that.
+
+use std::sync::atomic::{AtomicU64, Ordering};
+
+/// Error returned by the `*_or_restart` operations of `AtomicLockAndVersion` when the lock
+/// word is obsolete, write-locked, or no longer equal to the version the caller expected.
+pub(crate) struct ConcurrentUpdateError();
+
+/// The per-node atomic word: obsolete flag in bit 0, write-lock flag in bit 1 and the
+/// version counter in the remaining bits.
+pub(crate) struct AtomicLockAndVersion {
+    // The packed flags and version.
+    inner: AtomicU64,
+}
+
+impl AtomicLockAndVersion {
+    /// Creates a lock word with value 0: not obsolete, not locked, version 0.
+    pub(crate) fn new() -> AtomicLockAndVersion {
+        let inner = AtomicU64::new(0);
+        Self { inner }
+    }
+}
+
+impl AtomicLockAndVersion {
+    /// Starts an optimistic read.
+    ///
+    /// Waits (yielding the thread) until the write-lock bit is clear and returns the lock
+    /// word observed at that point. The caller hands this value back to
+    /// `check_or_restart`, `read_unlock_or_restart` or `upgrade_to_write_lock_or_restart`.
+    ///
+    /// Returns `ConcurrentUpdateError` if the node is marked obsolete.
+    pub(crate) fn read_lock_or_restart(&self) -> Result<u64, ConcurrentUpdateError> {
+        let version = self.await_node_unlocked();
+        if is_obsolete(version) {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(version)
+    }
+
+    /// Validates that the lock word still equals `version`. Same check as
+    /// `read_unlock_or_restart`.
+    pub(crate) fn check_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
+        self.read_unlock_or_restart(version)
+    }
+
+    /// Ends (or validates) an optimistic read that started at `version`.
+    ///
+    /// Returns `ConcurrentUpdateError` if the lock word differs from `version`, i.e. the
+    /// node was locked, modified or made obsolete in the meantime.
+    pub(crate) fn read_unlock_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
+        // The caller has just read node contents optimistically. An acquire *load* only
+        // orders the accesses that come after it, so on its own it would allow those
+        // earlier reads to be reordered past the version check. The acquire fence keeps
+        // them in front of the check, as in the usual seqlock reader protocol.
+        std::sync::atomic::fence(Ordering::Acquire);
+        if self.inner.load(Ordering::Acquire) != version {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(())
+    }
+
+    /// Atomically turns an optimistic read at `version` into a write lock.
+    ///
+    /// Succeeds only if the lock word is still exactly `version`; otherwise returns
+    /// `ConcurrentUpdateError` and leaves the word untouched.
+    pub(crate) fn upgrade_to_write_lock_or_restart(
+        &self,
+        version: u64,
+    ) -> Result<(), ConcurrentUpdateError> {
+        if self
+            .inner
+            .compare_exchange(
+                version,
+                set_locked_bit(version),
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            )
+            .is_err()
+        {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(())
+    }
+
+    /// Tries to take the write lock without a prior read lock.
+    ///
+    /// Does not wait: returns `ConcurrentUpdateError` if the node is obsolete, already
+    /// locked, or the lock word changes between the load and the compare-exchange.
+    pub(crate) fn write_lock_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
+        let old = self.inner.load(Ordering::Relaxed);
+        if is_obsolete(old) || is_locked(old) {
+            return Err(ConcurrentUpdateError());
+        }
+        if self
+            .inner
+            .compare_exchange(
+                old,
+                set_locked_bit(old),
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            )
+            .is_err()
+        {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(())
+    }
+
+    /// Releases the write lock and bumps the version.
+    ///
+    /// Must only be called while the lock bit is set: adding 2 clears bit 1 and carries
+    /// into bit 2, which increments the version.
+    pub(crate) fn write_unlock(&self) {
+        // reset locked bit and overflow into version
+        self.inner.fetch_add(2, Ordering::Release);
+    }
+
+    /// Releases the write lock, bumps the version and marks the node obsolete.
+    ///
+    /// Must only be called while the lock bit is set and the obsolete bit is clear:
+    /// adding 3 sets bit 0, clears bit 1 and carries into the version.
+    pub(crate) fn write_unlock_obsolete(&self) {
+        // set obsolete, reset locked, overflow into version
+        self.inner.fetch_add(3, Ordering::Release);
+    }
+
+    // Helper functions
+
+    /// Spins, yielding the thread between attempts, until the lock bit is clear. Returns
+    /// the lock word that was observed unlocked.
+    fn await_node_unlocked(&self) -> u64 {
+        let mut version = self.inner.load(Ordering::Acquire);
+        while is_locked(version) {
+            // spinlock
+            std::thread::yield_now();
+            version = self.inner.load(Ordering::Acquire);
+        }
+        version
+    }
+}
+
+/// Returns `version` with the write-lock bit (bit 1, value 2) added.
+///
+/// The bit is set by plain addition, so the result is only meaningful when the lock bit
+/// of `version` is clear.
+fn set_locked_bit(version: u64) -> u64 {
+    const LOCKED: u64 = 1 << 1;
+    version + LOCKED
+}
+
+/// Tells whether the obsolete flag (bit 0) is set in the lock word `version`.
+fn is_obsolete(version: u64) -> bool {
+    version % 2 == 1
+}
+
+/// Tells whether the write-lock flag (bit 1) is set in the lock word `version`.
+fn is_locked(version: u64) -> bool {
+    ((version >> 1) & 1) == 1
+}
--- a/libs/neonart/src/algorithm/node_ptr.rs
+++ b/libs/neonart/src/algorithm/node_ptr.rs
--- a/libs/neonart/src/algorithm/node_ref.rs
+++ b/libs/neonart/src/algorithm/node_ref.rs
@@ -0,0 +1,349 @@
+use std::fmt::Debug;
+use std::marker::PhantomData;
+
+use super::node_ptr;
+use super::node_ptr::NodePtr;
+use crate::EpochPin;
+use crate::Value;
+use crate::algorithm::lock_and_version::AtomicLockAndVersion;
+use crate::algorithm::lock_and_version::ConcurrentUpdateError;
+use crate::allocator::ArtAllocator;
+use crate::allocator::OutOfMemoryError;
+
+/// An unlocked reference to a tree node. Take a read or write lock through
+/// `read_lock_or_restart` / `write_lock_or_restart` before looking at the node.
+pub struct NodeRef<'e, V> {
+    ptr: NodePtr<V>,
+
+    // Ties the reference to the lifetime of an `EpochPin`; presumably this keeps it from
+    // outliving the epoch in which the node was reached (TODO confirm against EpochPin).
+    phantom: PhantomData<&'e EpochPin<'e>>,
+}
+
+impl<'e, V> Debug for NodeRef<'e, V> {
+    /// Prints the `Debug` representation of the underlying node pointer.
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        fmt.write_fmt(format_args!("{:?}", self.ptr))
+    }
+}
+
+impl<'e, V: Value> NodeRef<'e, V> {
+    /// Wraps the given root pointer in an unlocked node reference.
+    pub(crate) fn from_root_ptr(root_ptr: NodePtr<V>) -> NodeRef<'e, V> {
+        Self {
+            ptr: root_ptr,
+            phantom: PhantomData,
+        }
+    }
+
+    /// Takes an optimistic read lock on the node and remembers the version that was seen.
+    ///
+    /// Fails with `ConcurrentUpdateError` if the lock word reports the node as obsolete.
+    pub(crate) fn read_lock_or_restart(
+        &self,
+    ) -> Result<ReadLockedNodeRef<'e, V>, ConcurrentUpdateError> {
+        self.lockword()
+            .read_lock_or_restart()
+            .map(|version| ReadLockedNodeRef {
+                ptr: self.ptr,
+                version,
+                phantom: self.phantom,
+            })
+    }
+
+    /// Takes the write lock on the node directly.
+    ///
+    /// Fails with `ConcurrentUpdateError` if the lock word could not be locked.
+    pub(crate) fn write_lock_or_restart(
+        &self,
+    ) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
+        match self.lockword().write_lock_or_restart() {
+            Ok(()) => Ok(WriteLockedNodeRef {
+                ptr: self.ptr,
+                phantom: self.phantom,
+            }),
+            Err(conflict) => Err(conflict),
+        }
+    }
+
+    /// The node's lock-and-version word.
+    fn lockword(&self) -> &AtomicLockAndVersion {
+        self.ptr.lockword()
+    }
+}
+
+/// A reference to a node that has been optimistically read-locked. The `*_or_restart`
+/// functions re-check the version after each read; the plain accessors do not, so their
+/// results must be validated with `check_or_restart` / `read_unlock_or_restart` before
+/// they are relied upon.
+pub struct ReadLockedNodeRef<'e, V> {
+    ptr: NodePtr<V>,
+    // Lock word observed when the read lock was taken.
+    version: u64,
+
+    phantom: PhantomData<&'e EpochPin<'e>>,
+}
+
+impl<'e, V: Value> ReadLockedNodeRef<'e, V> {
+    /// Whether the node is a leaf. The answer is not re-validated against the version here.
+    pub(crate) fn is_leaf(&self) -> bool {
+        self.ptr.is_leaf()
+    }
+
+    /// Whether the node is full. The answer is not re-validated against the version here.
+    pub(crate) fn is_full(&self) -> bool {
+        self.ptr.is_full()
+    }
+
+    /// The node's prefix bytes. The answer is not re-validated against the version here.
+    pub(crate) fn get_prefix(&self) -> &[u8] {
+        self.ptr.get_prefix()
+    }
+
+    /// Compares the node's prefix with `key`.
+    ///
+    /// Only a read lock is held, so the prefix may change underneath us; the caller has
+    /// to be ready to restart if a later version check fails.
+    ///
+    /// Returns the prefix length on a match, `None` otherwise.
+    pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
+        self.ptr.prefix_matches(key)
+    }
+
+    /// Looks up the child stored under `key_byte`, then validates the version.
+    ///
+    /// Returns `Ok(None)` if there is no such child, or `ConcurrentUpdateError` if the
+    /// node changed since the read lock was taken.
+    pub(crate) fn find_child_or_restart(
+        &self,
+        key_byte: u8,
+    ) -> Result<Option<NodeRef<'e, V>>, ConcurrentUpdateError> {
+        let found = self.ptr.find_child(key_byte);
+        // Validate only after the read, so that the result is known to be consistent.
+        self.check_or_restart()?;
+
+        Ok(found.map(|child_ptr| NodeRef {
+            ptr: child_ptr,
+            phantom: self.phantom,
+        }))
+    }
+
+    /// Asks the node for its next child starting from `min_key_byte`, then validates the
+    /// version.
+    ///
+    /// Returns the child's key byte together with the child, `Ok(None)` if the node
+    /// reports no such child, or `ConcurrentUpdateError` if the node changed since the
+    /// read lock was taken.
+    pub(crate) fn find_next_child_or_restart(
+        &self,
+        min_key_byte: u8,
+    ) -> Result<Option<(u8, NodeRef<'e, V>)>, ConcurrentUpdateError> {
+        let found = self.ptr.find_next_child(min_key_byte);
+        self.check_or_restart()?;
+
+        Ok(found.map(|(k, child_ptr)| {
+            (
+                k,
+                NodeRef {
+                    ptr: child_ptr,
+                    phantom: self.phantom,
+                },
+            )
+        }))
+    }
+
+    /// Returns a raw pointer to the leaf's value after validating the version.
+    pub(crate) fn get_leaf_value_ptr(&self) -> Result<*const V, ConcurrentUpdateError> {
+        let value_ref = self.ptr.get_leaf_value();
+        self.check_or_restart()?;
+
+        // Hand out a raw pointer, which is not bound to the borrow of `self`.
+        Ok(std::ptr::from_ref(value_ref))
+    }
+
+    /// Converts the read lock into a write lock, provided the node is unchanged since
+    /// the read lock was taken.
+    pub(crate) fn upgrade_to_write_lock_or_restart(
+        self,
+    ) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
+        let ReadLockedNodeRef {
+            ptr,
+            version,
+            phantom,
+        } = self;
+        ptr.lockword().upgrade_to_write_lock_or_restart(version)?;
+
+        Ok(WriteLockedNodeRef { ptr, phantom })
+    }
+
+    /// Gives up the read lock, failing if the node changed while it was held.
+    pub(crate) fn read_unlock_or_restart(self) -> Result<(), ConcurrentUpdateError> {
+        self.check_or_restart()
+    }
+
+    /// Fails if the node's lock word no longer equals the version seen at lock time.
+    pub(crate) fn check_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
+        self.ptr.lockword().check_or_restart(self.version)
+    }
+}
+
+/// A reference to a node that is write-locked by the holder of this value.
+///
+/// The lock is released by `write_unlock()` / `write_unlock_obsolete()`, or by the `Drop`
+/// implementation if neither was called.
+pub struct WriteLockedNodeRef<'e, V> {
+    // Set to null once the lock has been released explicitly.
+    ptr: NodePtr<V>,
+    phantom: PhantomData<&'e EpochPin<'e>>,
+}
+
+impl<'e, V: Value> WriteLockedNodeRef<'e, V> {
+    /// Whether the node reports that it can be shrunk (see `shrink`).
+    pub(crate) fn can_shrink(&self) -> bool {
+        self.ptr.can_shrink()
+    }
+
+    /// Number of children currently stored in the node.
+    pub(crate) fn num_children(&self) -> usize {
+        self.ptr.num_children()
+    }
+
+    /// Releases the write lock.
+    pub(crate) fn write_unlock(mut self) {
+        self.ptr.lockword().write_unlock();
+        // Null the pointer so that the Drop impl does not unlock a second time.
+        self.ptr = NodePtr::null();
+    }
+
+    /// Releases the write lock and marks the node obsolete in the same step.
+    pub(crate) fn write_unlock_obsolete(mut self) {
+        self.ptr.lockword().write_unlock_obsolete();
+        // Null the pointer so that the Drop impl does not unlock a second time.
+        self.ptr = NodePtr::null();
+    }
+
+    /// The node's prefix bytes.
+    pub(crate) fn get_prefix(&self) -> &[u8] {
+        self.ptr.get_prefix()
+    }
+
+    /// Forwards to the node's `truncate_prefix` with `new_prefix_len`.
+    pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
+        self.ptr.truncate_prefix(new_prefix_len)
+    }
+
+    /// Forwards to the node's `prepend_prefix` with `prefix` and `prefix_byte`.
+    pub(crate) fn prepend_prefix(&mut self, prefix: &[u8], prefix_byte: u8) {
+        self.ptr.prepend_prefix(prefix, prefix_byte)
+    }
+
+    /// Adds `child` to the node under `key_byte`.
+    pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
+        self.ptr.insert_child(key_byte, child)
+    }
+
+    /// Mutable access to the value stored in a leaf node.
+    pub(crate) fn get_leaf_value_mut(&mut self) -> &mut V {
+        self.ptr.get_leaf_value_mut()
+    }
+
+    /// Allocates the replacement node produced by the node's `grow`.
+    ///
+    /// The result is a `NewNodeRef`, which deallocates the new node again if it is
+    /// dropped without `into_ptr()` having been called.
+    pub(crate) fn grow<'a, A>(
+        &self,
+        allocator: &'a A,
+    ) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+    where
+        A: ArtAllocator<V>,
+    {
+        self.ptr.grow(allocator).map(|grown| NewNodeRef {
+            ptr: grown,
+            allocator,
+            extra_nodes: Vec::new(),
+        })
+    }
+
+    /// Allocates the replacement node produced by the node's `shrink`.
+    ///
+    /// The result is a `NewNodeRef`, which deallocates the new node again if it is
+    /// dropped without `into_ptr()` having been called.
+    pub(crate) fn shrink<'a, A>(
+        &self,
+        allocator: &'a A,
+    ) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+    where
+        A: ArtAllocator<V>,
+    {
+        self.ptr.shrink(allocator).map(|shrunk| NewNodeRef {
+            ptr: shrunk,
+            allocator,
+            extra_nodes: Vec::new(),
+        })
+    }
+
+    /// The raw pointer to the locked node.
+    pub(crate) fn as_ptr(&self) -> NodePtr<V> {
+        self.ptr
+    }
+
+    /// Replaces the child stored under `key_byte` with `replacement`.
+    pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
+        self.ptr.replace_child(key_byte, replacement);
+    }
+
+    /// Removes the child stored under `key_byte`.
+    pub(crate) fn delete_child(&mut self, key_byte: u8) {
+        self.ptr.delete_child(key_byte);
+    }
+
+    /// Returns the only child of the node together with its key byte.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the node does not have exactly one child.
+    pub(crate) fn find_remaining_child(&self) -> (u8, NodeRef<'e, V>) {
+        assert_eq!(self.num_children(), 1);
+
+        let Some((k, child_ptr)) = self.ptr.find_next_child(0) else {
+            panic!("could not find only child in node");
+        };
+        let only_child = NodeRef {
+            ptr: child_ptr,
+            phantom: self.phantom,
+        };
+        (k, only_child)
+    }
+}
+
+impl<'e, V> Drop for WriteLockedNodeRef<'e, V> {
+    /// Releases the write lock, unless it was already released explicitly (in which case
+    /// the pointer has been set to null).
+    fn drop(&mut self) {
+        if self.ptr.is_null() {
+            return;
+        }
+        self.ptr.lockword().write_unlock();
+    }
+}
+
+/// Owner of a freshly allocated node that has not been handed over yet.
+///
+/// If the value is dropped before `into_ptr()` is called, the node and all nodes listed
+/// in `extra_nodes` are returned to the allocator.
+pub(crate) struct NewNodeRef<'a, V, A>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    // The new node; null once ownership was given away through `into_ptr()`.
+    ptr: NodePtr<V>,
+    // Allocator used to deallocate the node(s) on drop.
+    allocator: &'a A,
+
+    // Other new nodes that were attached below `ptr` with `insert_new_child()`.
+    extra_nodes: Vec<NodePtr<V>>,
+}
+
+impl<'a, V, A> NewNodeRef<'a, V, A>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    /// Links an existing, write-locked node into this new node under `key_byte`.
+    ///
+    /// The old child is not recorded in `extra_nodes`, so it is not deallocated if this
+    /// new node is dropped.
+    pub(crate) fn insert_old_child(&mut self, key_byte: u8, child: &WriteLockedNodeRef<V>) {
+        self.ptr.insert_child(key_byte, child.as_ptr())
+    }
+
+    /// Gives up ownership and returns the raw pointer to the new node.
+    ///
+    /// After this call the `Drop` implementation no longer deallocates anything.
+    pub(crate) fn into_ptr(mut self) -> NodePtr<V> {
+        let ptr = self.ptr;
+        self.ptr = NodePtr::null();
+        ptr
+    }
+
+    /// Links another freshly allocated node into this node under `key_byte` and takes
+    /// over responsibility for deallocating it.
+    ///
+    /// The child may itself own further new nodes (its `extra_nodes`). Those are taken
+    /// over as well; otherwise they would be forgotten when `child` is consumed and would
+    /// leak if this node is dropped later, e.g. after a failed allocation further up the
+    /// chain.
+    pub(crate) fn insert_new_child(&mut self, key_byte: u8, mut child: NewNodeRef<'a, V, A>) {
+        // Must happen before into_ptr(), which consumes the child.
+        let inherited = std::mem::take(&mut child.extra_nodes);
+        let child_ptr = child.into_ptr();
+        self.ptr.insert_child(key_byte, child_ptr);
+        self.extra_nodes.push(child_ptr);
+        self.extra_nodes.extend(inherited);
+    }
+}
+
+impl<'a, V, A> Drop for NewNodeRef<'a, V, A>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    /// Returns the new node, and every node recorded in `extra_nodes`, to the allocator.
+    /// Does nothing if the node was handed over with into_ptr().
+    fn drop(&mut self) {
+        if self.ptr.is_null() {
+            return;
+        }
+        self.ptr.deallocate(self.allocator);
+        for extra in &self.extra_nodes {
+            extra.deallocate(self.allocator);
+        }
+    }
+}
+
+/// Allocates a new internal node with the given `prefix` and wraps it in a `NewNodeRef`.
+///
+/// Returns `OutOfMemoryError` if the allocation fails.
+pub(crate) fn new_internal<'a, V, A>(
+    prefix: &[u8],
+    allocator: &'a A,
+) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    let ptr = node_ptr::new_internal(prefix, allocator)?;
+    Ok(NewNodeRef {
+        ptr,
+        allocator,
+        extra_nodes: Vec::new(),
+    })
+}
+
+/// Allocates a new leaf node holding `value` with the given `prefix` and wraps it in a
+/// `NewNodeRef`.
+///
+/// Returns `OutOfMemoryError` if the allocation fails.
+pub(crate) fn new_leaf<'a, V, A>(
+    prefix: &[u8],
+    value: V,
+    allocator: &'a A,
+) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    let ptr = node_ptr::new_leaf(prefix, value, allocator)?;
+    Ok(NewNodeRef {
+        ptr,
+        allocator,
+        extra_nodes: Vec::new(),
+    })
+}
--- a/libs/neonart/src/allocator.rs
+++ b/libs/neonart/src/allocator.rs
@@ -0,0 +1,156 @@
+pub mod block;
+mod multislab;
+mod slab;
+pub mod r#static;
+
+use std::alloc::Layout;
+use std::marker::PhantomData;
+use std::mem::MaybeUninit;
+use std::sync::atomic::Ordering;
+
+use crate::allocator::multislab::MultiSlabAllocator;
+use crate::allocator::r#static::alloc_from_slice;
+
+use spin;
+
+use crate::Tree;
+pub use crate::algorithm::node_ptr::{
+    NodeInternal4, NodeInternal16, NodeInternal48, NodeInternal256, NodeLeaf,
+};
+
+/// Error type used as the failure case of the node-allocating functions.
+#[derive(Debug)]
+pub struct OutOfMemoryError();
+
+/// Memory provider for the tree: one allocation function for the tree itself, and an
+/// alloc/dealloc pair for each of the five node types.
+///
+/// All functions hand out or take back raw pointers.
+/// NOTE(review): the trait does not say how an implementation signals exhaustion
+/// (null pointer vs. panic) — confirm against the callers in node_ptr.rs.
+pub trait ArtAllocator<V: crate::Value> {
+    /// Allocates space for the `Tree` struct.
+    fn alloc_tree(&self) -> *mut Tree<V>;
+
+    // Allocation, one function per node type
+    fn alloc_node_internal4(&self) -> *mut NodeInternal4<V>;
+    fn alloc_node_internal16(&self) -> *mut NodeInternal16<V>;
+    fn alloc_node_internal48(&self) -> *mut NodeInternal48<V>;
+    fn alloc_node_internal256(&self) -> *mut NodeInternal256<V>;
+    fn alloc_node_leaf(&self) -> *mut NodeLeaf<V>;
+
+    // Deallocation, one function per node type
+    fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>);
+    fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>);
+    fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>);
+    fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>);
+    fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>);
+}
+
+/// `ArtAllocator` implementation that lives inside a caller-provided memory area and
+/// serves each node type from its own slab.
+pub struct ArtMultiSlabAllocator<'t, V>
+where
+    V: crate::Value,
+{
+    // Space reserved for the one `Tree`; `None` once `alloc_tree()` has handed it out.
+    tree_area: spin::Mutex<Option<&'t mut MaybeUninit<Tree<V>>>>,
+
+    // One slab per node type, indexed in the order of `LAYOUTS`.
+    pub(crate) inner: MultiSlabAllocator<'t, 5>,
+
+    phantom_val: PhantomData<V>,
+}
+
+impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
+    /// Memory layouts of the five node types. The position in this array is the slab
+    /// index used by the `ArtAllocator` implementation.
+    const LAYOUTS: [Layout; 5] = [
+        Layout::new::<NodeInternal4<V>>(),
+        Layout::new::<NodeInternal16<V>>(),
+        Layout::new::<NodeInternal48<V>>(),
+        Layout::new::<NodeInternal256<V>>(),
+        Layout::new::<NodeLeaf<V>>(),
+    ];
+
+    /// Sets up an allocator inside `area`.
+    ///
+    /// The area is carved up in this order: the allocator struct itself, the space for
+    /// the `Tree`, and whatever remains is handed to the multi-slab allocator. Returns a
+    /// reference to the allocator struct that was written into the area.
+    pub fn new(area: &'t mut [MaybeUninit<u8>]) -> &'t mut ArtMultiSlabAllocator<'t, V> {
+        let (self_slot, rest) = alloc_from_slice::<ArtMultiSlabAllocator<V>>(area);
+        let (tree_slot, rest) = alloc_from_slice::<Tree<V>>(rest);
+
+        let allocator = ArtMultiSlabAllocator {
+            tree_area: spin::Mutex::new(Some(tree_slot)),
+            inner: MultiSlabAllocator::new(rest, &Self::LAYOUTS),
+            phantom_val: PhantomData,
+        };
+        self_slot.write(allocator)
+    }
+}
+
+impl<'t, V: crate::Value> ArtAllocator<V> for ArtMultiSlabAllocator<'t, V> {
+    /// Hands out the space reserved for the tree. Can be called only once.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the tree space has already been handed out.
+    fn alloc_tree(&self) -> *mut Tree<V> {
+        let reserved = self.tree_area.lock().take();
+        match reserved {
+            Some(tree_area) => tree_area.as_mut_ptr().cast(),
+            None => panic!("cannot allocate more than one tree"),
+        }
+    }
+
+    // Slab indexes follow the order of `LAYOUTS`:
+    // 0 = internal4, 1 = internal16, 2 = internal48, 3 = internal256, 4 = leaf
+
+    fn alloc_node_internal4(&self) -> *mut NodeInternal4<V> {
+        self.inner.alloc_slab(0) as *mut NodeInternal4<V>
+    }
+    fn alloc_node_internal16(&self) -> *mut NodeInternal16<V> {
+        self.inner.alloc_slab(1) as *mut NodeInternal16<V>
+    }
+    fn alloc_node_internal48(&self) -> *mut NodeInternal48<V> {
+        self.inner.alloc_slab(2) as *mut NodeInternal48<V>
+    }
+    fn alloc_node_internal256(&self) -> *mut NodeInternal256<V> {
+        self.inner.alloc_slab(3) as *mut NodeInternal256<V>
+    }
+    fn alloc_node_leaf(&self) -> *mut NodeLeaf<V> {
+        self.inner.alloc_slab(4) as *mut NodeLeaf<V>
+    }
+
+    fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>) {
+        self.inner.dealloc_slab(0, ptr as *mut u8)
+    }
+    fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>) {
+        self.inner.dealloc_slab(1, ptr as *mut u8)
+    }
+    fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>) {
+        self.inner.dealloc_slab(2, ptr as *mut u8)
+    }
+    fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>) {
+        self.inner.dealloc_slab(3, ptr as *mut u8)
+    }
+    fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>) {
+        self.inner.dealloc_slab(4, ptr as *mut u8)
+    }
+}
+
+impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
+    /// Takes a snapshot of the per-slab counters.
+    ///
+    /// Each counter is read separately with a relaxed load, so the snapshot as a whole
+    /// is not taken atomically.
+    pub(crate) fn get_statistics(&self) -> ArtMultiSlabStats {
+        let descs = &self.inner.slab_descs;
+        let allocated = |slab: usize| descs[slab].num_allocated.load(Ordering::Relaxed);
+        let blocks = |slab: usize| descs[slab].num_blocks.load(Ordering::Relaxed);
+
+        ArtMultiSlabStats {
+            num_internal4: allocated(0),
+            num_internal16: allocated(1),
+            num_internal48: allocated(2),
+            num_internal256: allocated(3),
+            num_leaf: allocated(4),
+
+            num_blocks_internal4: blocks(0),
+            num_blocks_internal16: blocks(1),
+            num_blocks_internal48: blocks(2),
+            num_blocks_internal256: blocks(3),
+            num_blocks_leaf: blocks(4),
+        }
+    }
+}
+
+/// Snapshot of the slab counters, one pair of numbers per node type.
+#[derive(Clone, Debug)]
+pub struct ArtMultiSlabStats {
+    // Values of the slabs' `num_allocated` counters
+    pub num_internal4: u64,
+    pub num_internal16: u64,
+    pub num_internal48: u64,
+    pub num_internal256: u64,
+    pub num_leaf: u64,
+
+    // Values of the slabs' `num_blocks` counters
+    pub num_blocks_internal4: u64,
+    pub num_blocks_internal16: u64,
+    pub num_blocks_internal48: u64,
+    pub num_blocks_internal256: u64,
+    pub num_blocks_leaf: u64,
+}
--- a/libs/neonart/src/allocator/block.rs
+++ b/libs/neonart/src/allocator/block.rs
@@ -0,0 +1,191 @@
+//! Simple allocator of fixed-size blocks
+
+use std::mem::MaybeUninit;
+use std::sync::atomic::{AtomicU64, Ordering};
+
+use spin;
+
+/// Size of one block in bytes (16 KiB). Blocks are also aligned to this size.
+pub const BLOCK_SIZE: usize = 16 * 1024;
+
+/// Block number meaning "no block": marks the end of the free list and is returned by
+/// `alloc_block_internal` when no block is available.
+const INVALID_BLOCK: u64 = u64::MAX;
+
+/// Hands out `BLOCK_SIZE`-sized blocks from one contiguous memory area.
+pub(crate) struct BlockAllocator<'t> {
+    // The block area, starting at a BLOCK_SIZE-aligned address.
+    blocks_ptr: &'t [MaybeUninit<u8>],
+    // Number of whole blocks that fit in the area.
+    num_blocks: u64,
+    // Number of blocks that have been handed out at least once; blocks with a number at
+    // or above this value have never been used.
+    num_initialized: AtomicU64,
+
+    // Block number of the first free-list block, or INVALID_BLOCK if the list is empty.
+    freelist_head: spin::Mutex<u64>,
+}
+
+/// A released block that has been repurposed to store the numbers of other free blocks.
+struct FreeListBlock {
+    inner: spin::Mutex<FreeListBlockInner>,
+}
+
+/// Contents of a free-list block, protected by the mutex in `FreeListBlock`.
+struct FreeListBlockInner {
+    // Block number of the next free-list block, or INVALID_BLOCK
+    next: u64,
+
+    // Number of valid entries at the start of `free_blocks`
+    num_free_blocks: u64,
+    free_blocks: [u64; 100], // FIXME: fill the rest of the block
+}
+
+impl<'t> BlockAllocator<'t> {
+    /// Creates an allocator over `area`.
+    ///
+    /// The start of the area is skipped up to the next `BLOCK_SIZE`-aligned address; the
+    /// rest is divided into whole blocks. No block is touched until it is allocated.
+    ///
+    /// NOTE(review): the slice operation panics if the required padding is larger than
+    /// `area`.
+    pub(crate) fn new(area: &'t mut [MaybeUninit<u8>]) -> Self {
+        // Use all the space for the blocks
+        let padding = area.as_ptr().align_offset(BLOCK_SIZE);
+        let remain = &mut area[padding..];
+
+        let num_blocks = (remain.len() / BLOCK_SIZE) as u64;
+
+        BlockAllocator {
+            blocks_ptr: remain,
+            num_blocks,
+            num_initialized: AtomicU64::new(0),
+            freelist_head: spin::Mutex::new(INVALID_BLOCK),
+        }
+    }
+
+    /// Interprets block `blkno` as a free-list block.
+    ///
+    /// safety: you must hold a lock on the pointer to this block, otherwise it might get
+    /// reused for another kind of block
+    fn read_freelist_block(&self, blkno: u64) -> &FreeListBlock {
+        let ptr: *const FreeListBlock = self.get_block_ptr(blkno).cast();
+        unsafe { ptr.as_ref().unwrap() }
+    }
+
+    /// Returns the address of block `blkno`.
+    ///
+    /// Panics if `blkno` is not smaller than the number of blocks.
+    fn get_block_ptr(&self, blkno: u64) -> *mut u8 {
+        assert!(blkno < self.num_blocks);
+        unsafe {
+            self.blocks_ptr
+                .as_ptr()
+                .byte_offset(blkno as isize * BLOCK_SIZE as isize)
+        }
+        .cast_mut()
+        .cast()
+    }
+
+    /// Allocates one block and returns it as a slice of `BLOCK_SIZE` uninitialized bytes.
+    ///
+    /// Panics with "out of memory" if no block is available.
+    #[allow(clippy::mut_from_ref)]
+    pub(crate) fn alloc_block(&self) -> &mut [MaybeUninit<u8>] {
+        // FIXME: handle OOM
+        let blkno = self.alloc_block_internal();
+        if blkno == INVALID_BLOCK {
+            panic!("out of memory");
+        }
+
+        let ptr: *mut MaybeUninit<u8> = self.get_block_ptr(blkno).cast();
+        unsafe { std::slice::from_raw_parts_mut(ptr, BLOCK_SIZE) }
+    }
+
+    /// Picks a block to hand out and returns its number, or `INVALID_BLOCK` if there is
+    /// none left. The free list is tried first, then the never-used blocks.
+    fn alloc_block_internal(&self) -> u64 {
+        //  check the free list.
+        {
+            // The head lock is held until this scope is left, including the returns below.
+            let mut freelist_head = self.freelist_head.lock();
+            if *freelist_head != INVALID_BLOCK {
+                let freelist_block = self.read_freelist_block(*freelist_head);
+
+                // acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
+                let mut g = freelist_block.inner.lock();
+
+                if g.num_free_blocks > 0 {
+                    // Pop the last entry recorded in the head free-list block
+                    g.num_free_blocks -= 1;
+                    let result = g.free_blocks[g.num_free_blocks as usize];
+                    return result;
+                } else {
+                    // consume the freelist block itself
+                    let result = *freelist_head;
+                    *freelist_head = g.next;
+                    // This freelist block is now unlinked and can be repurposed
+                    drop(g);
+                    return result;
+                }
+            }
+        }
+
+        // If there are some blocks left that we've never used, pick next such block
+        let mut next_uninitialized = self.num_initialized.load(Ordering::Relaxed);
+        while next_uninitialized < self.num_blocks {
+            // Claim the block by advancing the counter; retry if another thread got there first.
+            match self.num_initialized.compare_exchange(
+                next_uninitialized,
+                next_uninitialized + 1,
+                Ordering::Relaxed,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => {
+                    return next_uninitialized;
+                }
+                Err(old) => {
+                    next_uninitialized = old;
+                    continue;
+                }
+            }
+        }
+
+        // out of blocks
+        INVALID_BLOCK
+    }
+
+    /// Returns the block at `block_ptr` to the allocator.
+    ///
+    /// The block number is computed from the pointer's byte offset from the start of the
+    /// block area.
+    // TODO: this is currently unused. The slab allocator never releases blocks
+    #[allow(dead_code)]
+    pub(crate) fn release_block(&self, block_ptr: *mut u8) {
+        let blockno = unsafe { block_ptr.byte_offset_from(self.blocks_ptr) / BLOCK_SIZE as isize };
+        self.release_block_internal(blockno as u64);
+    }
+
+    /// Puts block `blockno` on the free list.
+    ///
+    /// If the head free-list block has room, the number is recorded there. Otherwise the
+    /// released block itself is turned into a new, empty free-list block and becomes the
+    /// new head.
+    fn release_block_internal(&self, blockno: u64) {
+        let mut freelist_head = self.freelist_head.lock();
+        if *freelist_head != INVALID_BLOCK {
+            let freelist_block = self.read_freelist_block(*freelist_head);
+
+            // acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
+            let mut g = freelist_block.inner.lock();
+
+            let num_free_blocks = g.num_free_blocks;
+            if num_free_blocks < g.free_blocks.len() as u64 {
+                g.free_blocks[num_free_blocks as usize] = blockno;
+                g.num_free_blocks += 1;
+                return;
+            }
+        }
+
+        // Convert the block into a new freelist block
+        let block_ptr: *mut FreeListBlock = self.get_block_ptr(blockno).cast();
+        let init = FreeListBlock {
+            inner: spin::Mutex::new(FreeListBlockInner {
+                next: *freelist_head,
+                num_free_blocks: 0,
+                free_blocks: [INVALID_BLOCK; 100],
+            }),
+        };
+        // NOTE(review): this is a plain assignment through the raw pointer, not
+        // `ptr::write`; the previous contents of the block are arbitrary bytes.
+        unsafe { (*block_ptr) = init };
+        *freelist_head = blockno;
+    }
+
+    /// Collects counters about the allocator by walking the whole free list.
+    ///
+    /// `num_free_blocks` sums up the entries recorded in the free-list blocks; the
+    /// free-list blocks themselves are not added to the count.
+    // for debugging
+    pub(crate) fn get_statistics(&self) -> BlockAllocatorStats {
+        let mut num_free_blocks = 0;
+
+        let mut _prev_lock = None;
+        let head_lock = self.freelist_head.lock();
+        let mut next_blk = *head_lock;
+        // Kept only so that the head lock stays held until the first block is locked
+        let mut _head_lock = Some(head_lock);
+        while next_blk != INVALID_BLOCK {
+            let freelist_block = self.read_freelist_block(next_blk);
+            let lock = freelist_block.inner.lock();
+            num_free_blocks += lock.num_free_blocks;
+            next_blk = lock.next;
+            _prev_lock = Some(lock); // hold the lock until we've read the next block
+            _head_lock = None;
+        }
+
+        BlockAllocatorStats {
+            num_blocks: self.num_blocks,
+            num_initialized: self.num_initialized.load(Ordering::Relaxed),
+            num_free_blocks,
+        }
+    }
+}
+
+/// Counters reported by `BlockAllocator::get_statistics`.
+#[derive(Clone, Debug)]
+pub struct BlockAllocatorStats {
+    // Total number of blocks in the area
+    pub num_blocks: u64,
+    // Value of the allocator's `num_initialized` counter
+    pub num_initialized: u64,
+    // Sum of the entry counts of all free-list blocks
+    pub num_free_blocks: u64,
+}
--- a/libs/neonart/src/allocator/multislab.rs
+++ b/libs/neonart/src/allocator/multislab.rs
@@ -0,0 +1,33 @@
+use std::alloc::Layout;
+use std::mem::MaybeUninit;
+
+use crate::allocator::block::BlockAllocator;
+use crate::allocator::slab::SlabDesc;
+
+/// `N` slabs, each with its own chunk layout, that draw their blocks from one shared
+/// block allocator.
+pub struct MultiSlabAllocator<'t, const N: usize> {
+    pub(crate) block_allocator: BlockAllocator<'t>,
+
+    // One descriptor per slab, in the order of the layouts passed to `new()`
+    pub(crate) slab_descs: [SlabDesc; N],
+}
+
+impl<'t, const N: usize> MultiSlabAllocator<'t, N> {
+    /// Creates the block allocator over `area` and one slab descriptor per entry of
+    /// `layouts`.
+    pub(crate) fn new(
+        area: &'t mut [MaybeUninit<u8>],
+        layouts: &[Layout; N],
+    ) -> MultiSlabAllocator<'t, N> {
+        Self {
+            block_allocator: BlockAllocator::new(area),
+            slab_descs: std::array::from_fn(|slab_idx| SlabDesc::new(&layouts[slab_idx])),
+        }
+    }
+
+    /// Allocates one chunk from slab number `slab_idx`.
+    pub(crate) fn alloc_slab(&self, slab_idx: usize) -> *mut u8 {
+        let desc = &self.slab_descs[slab_idx];
+        desc.alloc_chunk(&self.block_allocator)
+    }
+
+    /// Gives the chunk at `ptr` back to slab number `slab_idx`.
+    pub(crate) fn dealloc_slab(&self, slab_idx: usize, ptr: *mut u8) {
+        let desc = &self.slab_descs[slab_idx];
+        desc.dealloc_chunk(ptr, &self.block_allocator)
+    }
+}
--- a/libs/neonart/src/allocator/slab.rs
+++ b/libs/neonart/src/allocator/slab.rs
@@ -0,0 +1,433 @@
+//! A slab allocator that carves out fixed-size chunks from larger blocks.
+//!
+//!
+
+use std::alloc::Layout;
+use std::mem::MaybeUninit;
+use std::ops::Deref;
+use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
+
+use spin;
+
+use super::alloc_from_slice;
+use super::block::BlockAllocator;
+
+use crate::allocator::block::BLOCK_SIZE;
+
+/// Descriptor of one slab, i.e. of all the blocks that hold chunks of one layout.
+pub(crate) struct SlabDesc {
+    // Layout of the chunks served by this slab
+    pub(crate) layout: Layout,
+
+    // The slab's blocks, split into full and non-full ones
+    block_lists: spin::RwLock<BlockLists>,
+
+    // Incremented each time a new block is allocated for this slab
+    pub(crate) num_blocks: AtomicU64,
+    // Incremented each time a chunk is handed out
+    pub(crate) num_allocated: AtomicU64,
+}
+
+// FIXME: Not sure if SlabDesc is really Sync or Send. It probably is when it's empty, but
+// 'block_lists' contains pointers when it's not empty. In the current use as part of
+// the art tree, SlabDescs are only moved during initialization.
+unsafe impl Sync for SlabDesc {}
+unsafe impl Send for SlabDesc {}
+
+/// The two block lists of a slab. `unlink` expects a block to be in exactly one of them.
+#[derive(Default, Debug)]
+struct BlockLists {
+    // Blocks that were found to have no free chunk
+    full_blocks: BlockList,
+    // Blocks that chunks are allocated from
+    nonfull_blocks: BlockList,
+}
+
+impl BlockLists {
+    /// Removes `elem` from whichever of the two lists contains it.
+    ///
+    /// The owning list only has to be identified when `elem` is at an end of its list,
+    /// because only then a head or tail pointer needs updating. A block in the middle of
+    /// a list is unlinked through its neighbours alone.
+    ///
+    /// Safety: `elem` must point to a valid block header that is linked into one of the
+    /// two lists.
+    unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
+        let (at_tail, at_head) = unsafe { ((*elem).next.is_null(), (*elem).prev.is_null()) };
+
+        // Some(true): in the full list, Some(false): in the non-full list,
+        // None: somewhere in the middle, the list does not matter.
+        let in_full_list = if at_tail {
+            Some(self.full_blocks.tail == elem)
+        } else if at_head {
+            Some(self.full_blocks.head == elem)
+        } else {
+            None
+        };
+        let list = match in_full_list {
+            Some(true) => Some(&mut self.full_blocks),
+            Some(false) => Some(&mut self.nonfull_blocks),
+            None => None,
+        };
+        unsafe { unlink_slab_block(list, elem) };
+    }
+}
+
+/// Removes `elem` from the doubly-linked list it is in.
+///
+/// `list` must be the list that contains `elem` whenever `elem` is the first or last
+/// element of it; it may be `None` only when `elem` has both a predecessor and a
+/// successor. The function panics if `list` is `None` but would be needed, or if the
+/// neighbouring links are inconsistent.
+///
+/// The `prev` and `next` fields of `elem` itself are left unchanged.
+///
+/// Safety: `elem` and its neighbours must point to valid block headers.
+unsafe fn unlink_slab_block(mut list: Option<&mut BlockList>, elem: *mut SlabBlockHeader) {
+    unsafe {
+        // Fix up the backward link that points at 'elem'
+        if (*elem).next.is_null() {
+            assert_eq!(list.as_ref().unwrap().tail, elem);
+            list.as_mut().unwrap().tail = (*elem).prev;
+        } else {
+            assert_eq!((*(*elem).next).prev, elem);
+            (*(*elem).next).prev = (*elem).prev;
+        }
+        // Fix up the forward link that points at 'elem'
+        if (*elem).prev.is_null() {
+            assert_eq!(list.as_ref().unwrap().head, elem);
+            list.as_mut().unwrap().head = (*elem).next;
+        } else {
+            assert_eq!((*(*elem).prev).next, elem);
+            (*(*elem).prev).next = (*elem).next;
+        }
+    }
+}
+
+/// Doubly-linked list of slab blocks, linked through the `prev`/`next` fields of the
+/// block headers. Both pointers are null when the list is empty.
+#[derive(Debug)]
+struct BlockList {
+    head: *mut SlabBlockHeader,
+    tail: *mut SlabBlockHeader,
+}
+
+impl Default for BlockList {
+    /// Creates an empty list: head and tail are both null.
+    fn default() -> Self {
+        let nothing: *mut SlabBlockHeader = std::ptr::null_mut();
+        Self {
+            head: nothing,
+            tail: nothing,
+        }
+    }
+}
+
+impl BlockList {
+    /// Inserts `elem` at the front of the list.
+    ///
+    /// Safety: `elem` must point to a valid block header that is not linked into any
+    /// list.
+    unsafe fn push_head(&mut self, elem: *mut SlabBlockHeader) {
+        let old_head = self.head;
+        unsafe {
+            // For an empty list the old head is null, which is the right 'next' as well.
+            (*elem).next = old_head;
+            if old_head.is_null() {
+                self.tail = elem;
+            } else {
+                (*old_head).prev = elem;
+            }
+            (*elem).prev = std::ptr::null_mut();
+        }
+        self.head = elem;
+    }
+
+    /// True if the list has no elements.
+    fn is_empty(&self) -> bool {
+        self.head.is_null()
+    }
+
+    /// Removes `elem` from this list.
+    ///
+    /// Safety: `elem` must point to a valid block header that is linked into this list.
+    unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
+        unsafe { unlink_slab_block(Some(self), elem) }
+    }
+
+    /// Prints one line per block to stderr, walking the list from the head.
+    #[cfg(test)]
+    fn dump(&self) {
+        let mut cursor = self.head;
+
+        while let Some(header) = unsafe { cursor.as_ref() } {
+            eprintln!(
+                "  blk {:?} (free {}/{})",
+                cursor,
+                header.num_free_chunks.load(Ordering::Relaxed),
+                header.num_chunks
+            );
+            cursor = header.next;
+        }
+    }
+}
+
+impl SlabDesc {
+    /// Creates a descriptor for chunks of the given `layout`, with no blocks yet and
+    /// both counters at zero.
+    pub(crate) fn new(layout: &Layout) -> SlabDesc {
+        let no_blocks = BlockLists::default();
+        Self {
+            layout: *layout,
+            block_lists: spin::RwLock::new(no_blocks),
+            num_allocated: AtomicU64::new(0),
+            num_blocks: AtomicU64::new(0),
+        }
+    }
+}
+
+/// Bookkeeping data of one block owned by a slab.
+#[derive(Debug)]
+struct SlabBlockHeader {
+    // Head of the linked list of free chunks in this block; null if there is none
+    free_chunks_head: spin::Mutex<*mut FreeChunk>,
+    // Decremented when a chunk is taken from the free-chunk list
+    num_free_chunks: AtomicU32,
+    num_chunks: u32, // this is really a constant for a given Layout
+
+    // these fields are protected by the lock on the BlockLists
+    prev: *mut SlabBlockHeader,
+    next: *mut SlabBlockHeader,
+}
+
+/// Link of a block's free-chunk list.
+struct FreeChunk {
+    // The next free chunk; the allocation path treats a null head as "no free chunk"
+    next: *mut FreeChunk,
+}
+
+/// Either a read guard or a write guard of a `spin::RwLock`, so that one variable can
+/// hold whichever kind of lock was taken.
+enum ReadOrWriteGuard<'a, T> {
+    Read(spin::RwLockReadGuard<'a, T>),
+    Write(spin::RwLockWriteGuard<'a, T>),
+}
+
+impl<'a, T> Deref for ReadOrWriteGuard<'a, T> {
+    type Target = T;
+
+    /// Shared access to the protected value, whichever kind of guard is held.
+    fn deref(&self) -> &<Self as Deref>::Target {
+        match self {
+            ReadOrWriteGuard::Read(read_guard) => &**read_guard,
+            ReadOrWriteGuard::Write(write_guard) => &**write_guard,
+        }
+    }
+}
+
+impl SlabDesc {
+    /// Allocates one chunk from this slab and returns a pointer to it.
+    ///
+    /// The head of the non-full block list is tried first, under the read lock. If that block
+    /// turns out to have no free chunks, the list lock is re-taken in write mode and exhausted
+    /// blocks are moved to the full list until a block with a free chunk is found. If the
+    /// non-full list is empty, a new block is obtained from `block_allocator`, and its first
+    /// chunk is returned.
+    pub fn alloc_chunk(&self, block_allocator: &BlockAllocator) -> *mut u8 {
+        // Are there any free chunks?
+        let mut acquire_write = false;
+        'outer: loop {
+            // First pass takes the read lock; the retry (if any) takes the write lock.
+            let mut block_lists_guard = if acquire_write {
+                ReadOrWriteGuard::Write(self.block_lists.write())
+            } else {
+                ReadOrWriteGuard::Read(self.block_lists.read())
+            };
+            'inner: loop {
+                let block_ptr = block_lists_guard.nonfull_blocks.head;
+                if block_ptr.is_null() {
+                    // No candidate blocks left: fall through to allocating a new block.
+                    break 'outer;
+                }
+                unsafe {
+                    // Pop the first chunk off the block's free list, if there is one.
+                    let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
+                    if !(*free_chunks_head).is_null() {
+                        let result = *free_chunks_head;
+                        (*free_chunks_head) = (*result).next;
+                        let _old = (*block_ptr).num_free_chunks.fetch_sub(1, Ordering::Relaxed);
+
+                        self.num_allocated.fetch_add(1, Ordering::Relaxed);
+                        return result.cast();
+                    }
+                }
+
+                // The block at the head of the list was full. Grab write lock and retry
+                match block_lists_guard {
+                    ReadOrWriteGuard::Read(_) => {
+                        acquire_write = true;
+                        continue 'outer;
+                    }
+                    ReadOrWriteGuard::Write(ref mut g) => {
+                        // move the node to the list of full blocks
+                        unsafe {
+                            g.nonfull_blocks.unlink(block_ptr);
+                            g.full_blocks.push_head(block_ptr);
+                        };
+                        continue 'inner;
+                    }
+                }
+            }
+        }
+
+        // no free chunks. Allocate a new block (and the chunk from that)
+        let (new_block, new_chunk) = self.alloc_block_and_chunk(block_allocator);
+        self.num_blocks.fetch_add(1, Ordering::Relaxed);
+
+        // Add the block to the list in the SlabDesc
+        unsafe {
+            let mut block_lists_guard = self.block_lists.write();
+            block_lists_guard.nonfull_blocks.push_head(new_block);
+        }
+        self.num_allocated.fetch_add(1, Ordering::Relaxed);
+        new_chunk
+    }
+
+    /// Returns a chunk previously handed out by `alloc_chunk` to its block's free list.
+    ///
+    /// If the block had no free chunks before this call, it is moved (back) to the head of the
+    /// non-full block list. Completely empty blocks are currently NOT returned to the block
+    /// allocator (see the TODO below), which is why `_block_allocator` is unused.
+    ///
+    /// NOTE(review): `chunk_ptr` must have been allocated from this slab; the function itself
+    /// cannot verify that.
+    pub fn dealloc_chunk(&self, chunk_ptr: *mut u8, _block_allocator: &BlockAllocator) {
+        // Find the block it belongs to. You can find the block from the address. (And knowing the
+        // layout, you could calculate the chunk number too.)
+        let block_ptr: *mut SlabBlockHeader = {
+            // Round the address down to a multiple of BLOCK_SIZE.
+            let block_addr = (chunk_ptr.addr() / BLOCK_SIZE) * BLOCK_SIZE;
+            chunk_ptr.with_addr(block_addr).cast()
+        };
+        let chunk_ptr: *mut FreeChunk = chunk_ptr.cast();
+
+        // Mark the chunk as free in 'freechunks' list
+        let num_chunks;
+        let num_free_chunks;
+        unsafe {
+            // Push the chunk onto the head of the block's free list.
+            let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
+            (*chunk_ptr).next = *free_chunks_head;
+            *free_chunks_head = chunk_ptr;
+
+            // fetch_add returns the previous value, hence the +1.
+            num_free_chunks = (*block_ptr).num_free_chunks.fetch_add(1, Ordering::Relaxed) + 1;
+            num_chunks = (*block_ptr).num_chunks;
+        }
+
+        if num_free_chunks == 1 {
+            // If the block was full previously, add it to the nonfull blocks list. Note that
+            // we're not holding the lock anymore, so it can immediately become full again.
+            // That's harmless, it will be moved back to the full list again when a call
+            // to alloc_chunk() sees it.
+            let mut block_lists = self.block_lists.write();
+            unsafe {
+                block_lists.unlink(block_ptr);
+                block_lists.nonfull_blocks.push_head(block_ptr);
+            };
+        } else if num_free_chunks == num_chunks {
+            // If the block became completely empty, move it to the free list
+            // TODO
+            // FIXME: we're still holding the spinlock. It's not exactly safe to return it to
+            // the free blocks list, is it? Defer it as garbage to wait out concurrent updates?
+            //block_allocator.release_block()
+        }
+
+        // update stats
+        self.num_allocated.fetch_sub(1, Ordering::Relaxed);
+    }
+
+    /// Obtains a fresh block from `block_allocator`, formats it as a slab block and allocates
+    /// the first chunk from it.
+    ///
+    /// The block starts with a `SlabBlockHeader`, followed by as many chunks of
+    /// `self.layout.size()` bytes as fit. All chunks except the first are threaded onto the
+    /// block's free list.
+    ///
+    /// Returns the (initialized, but not yet linked) block header and the allocated chunk.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a chunk is too small to hold the free-list link, or if not even one chunk fits
+    /// in a block. Previously these cases led to a division by zero, an integer underflow in
+    /// the loop bound, or overlapping free-list links.
+    ///
+    /// NOTE(review): the free-list link is written with ordinary pointer stores, so this also
+    /// assumes that the layout's alignment is at least that of a pointer and that the layout's
+    /// size is a multiple of its alignment -- confirm for all slab layouts in use.
+    fn alloc_block_and_chunk(
+        &self,
+        block_allocator: &BlockAllocator,
+    ) -> (*mut SlabBlockHeader, *mut u8) {
+        // fixme: handle OOM
+        let block_slice: &mut [MaybeUninit<u8>] = block_allocator.alloc_block();
+        let (block_header, remain) = alloc_from_slice::<SlabBlockHeader>(block_slice);
+
+        let chunk_size = self.layout.size();
+        assert!(
+            chunk_size >= std::mem::size_of::<FreeChunk>(),
+            "slab chunk size {} is too small to hold a free-list link",
+            chunk_size
+        );
+
+        let padding = remain.as_ptr().align_offset(self.layout.align());
+
+        // saturating_sub: if the padding alone exceeds the remaining space, no chunk fits.
+        let num_chunks = remain.len().saturating_sub(padding) / chunk_size;
+        assert!(
+            num_chunks > 0,
+            "slab chunk size {} does not fit in a block",
+            chunk_size
+        );
+
+        let first_chunk_ptr: *mut FreeChunk = remain[padding..].as_mut_ptr().cast();
+
+        unsafe {
+            // Link every chunk to its successor; the last one terminates the list.
+            let mut chunk_ptr = first_chunk_ptr;
+            for _ in 0..num_chunks - 1 {
+                let next_chunk_ptr = chunk_ptr.byte_add(chunk_size);
+                (*chunk_ptr).next = next_chunk_ptr;
+                chunk_ptr = next_chunk_ptr;
+            }
+            (*chunk_ptr).next = std::ptr::null_mut();
+
+            // The first chunk is handed out to the caller, so the free list starts at the
+            // second chunk.
+            let result_chunk = first_chunk_ptr;
+
+            let block_header = block_header.write(SlabBlockHeader {
+                free_chunks_head: spin::Mutex::new((*first_chunk_ptr).next),
+                prev: std::ptr::null_mut(),
+                next: std::ptr::null_mut(),
+                num_chunks: num_chunks as u32,
+                num_free_chunks: AtomicU32::new(num_chunks as u32 - 1),
+            });
+
+            (block_header, result_chunk.cast())
+        }
+    }
+
+    /// Test-only helper: prints the slab statistics and both block lists to stderr.
+    #[cfg(test)]
+    fn dump(&self) {
+        let blocks = self.num_blocks.load(Ordering::Relaxed);
+        let allocated = self.num_allocated.load(Ordering::Relaxed);
+        eprintln!(
+            "slab dump ({} blocks, {} allocated chunks)",
+            blocks, allocated
+        );
+
+        // Hold the read lock while walking the lists.
+        let guard = self.block_lists.read();
+        eprintln!("nonfull blocks:");
+        guard.nonfull_blocks.dump();
+        eprintln!("full blocks:");
+        guard.full_blocks.dump();
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use rand::Rng;
+    use rand_distr::Zipf;
+
+    /// Test payload. It is larger than a quarter of a block, so fewer than four objects fit in
+    /// one block and the tests exercise block switching with only a handful of allocations.
+    struct TestObject {
+        val: usize,
+        _dummy: [u8; BLOCK_SIZE / 4],
+    }
+
+    /// A slab for `TestObject`s together with the block allocator that backs it.
+    struct TestObjectSlab<'a>(SlabDesc, BlockAllocator<'a>);
+    impl<'a> TestObjectSlab<'a> {
+        fn new(block_allocator: BlockAllocator) -> TestObjectSlab {
+            let desc = SlabDesc::new(&Layout::new::<TestObject>());
+            TestObjectSlab(desc, block_allocator)
+        }
+
+        /// Allocates an object and stores `val` in it.
+        fn alloc(&self, val: usize) -> *mut TestObject {
+            let raw = self.0.alloc_chunk(&self.1);
+            let obj = raw.cast::<TestObject>();
+            unsafe { (*obj).val = val };
+            obj
+        }
+
+        fn dealloc(&self, obj: *mut TestObject) {
+            self.0.dealloc_chunk(obj.cast(), &self.1)
+        }
+    }
+
+    #[test]
+    fn test_slab_alloc() {
+        const MEM_SIZE: usize = 100000000;
+        let mut area = Box::new_uninit_slice(MEM_SIZE);
+        let block_allocator = BlockAllocator::new(&mut area);
+
+        let slab = TestObjectSlab::new(block_allocator);
+
+        // Allocate a few objects and check that each one kept its value.
+        let mut all: Vec<*mut TestObject> = (0..11).map(|i| slab.alloc(i)).collect();
+        for (i, obj) in all.iter().enumerate() {
+            assert!(unsafe { (**obj).val == i });
+        }
+
+        // Randomly free and re-allocate the objects, with a skewed distribution.
+        let distribution = Zipf::new(10.0, 1.1).unwrap();
+        let mut rng = rand::rng();
+        for _ in 0..100000 {
+            slab.0.dump();
+            let idx = rng.sample(distribution) as usize;
+            let slot = &mut all[idx];
+            if slot.is_null() {
+                *slot = slab.alloc(idx);
+            } else {
+                assert_eq!(unsafe { (**slot).val }, idx);
+                slab.dealloc(*slot);
+                *slot = std::ptr::null_mut();
+            }
+        }
+    }
+
+    /// Creates a heap-allocated, unlinked block header. `i` is stored in `num_chunks`.
+    fn new_test_blk(i: u32) -> *mut SlabBlockHeader {
+        let header = SlabBlockHeader {
+            free_chunks_head: spin::Mutex::new(std::ptr::null_mut()),
+            num_free_chunks: AtomicU32::new(0),
+            num_chunks: i,
+            prev: std::ptr::null_mut(),
+            next: std::ptr::null_mut(),
+        };
+        Box::into_raw(Box::new(header))
+    }
+
+    #[test]
+    fn test_block_linked_list() {
+        // note: these are leaked, but that's OK for tests
+        let a = new_test_blk(0);
+        let b = new_test_blk(1);
+
+        let mut list = BlockList::default();
+        assert!(list.is_empty());
+
+        // A single element: push, then unlink.
+        unsafe {
+            list.push_head(a);
+            assert!(!list.is_empty());
+            list.unlink(a);
+        }
+        assert!(list.is_empty());
+
+        // Two elements: check the links, then unlink head first.
+        unsafe {
+            list.push_head(b);
+            list.push_head(a);
+            assert_eq!(list.head, a);
+            assert_eq!((*a).next, b);
+            assert_eq!((*b).prev, a);
+            assert_eq!(list.tail, b);
+
+            list.unlink(a);
+            list.unlink(b);
+            assert!(list.is_empty());
+        }
+    }
+}
--- a/libs/neonart/src/allocator/static.rs
+++ b/libs/neonart/src/allocator/static.rs
@@ -0,0 +1,44 @@
+use std::mem::MaybeUninit;
+
+/// Carves space for one `T` off the front of `area`.
+///
+/// Leading bytes are skipped as needed to satisfy the alignment of `T`. Returns the
+/// (uninitialized) slot for the value and the rest of the area after it.
+///
+/// # Panics
+///
+/// Panics with "out of memory" if `area` is too small.
+pub fn alloc_from_slice<T>(
+    area: &mut [MaybeUninit<u8>],
+) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
+    let size = std::mem::size_of::<T>();
+    let align = std::mem::align_of::<T>();
+
+    // pad to satisfy alignment requirements
+    let padding = area.as_mut_ptr().align_offset(align);
+    if padding + size > area.len() {
+        panic!("out of memory");
+    }
+
+    let (object_bytes, remain) = area[padding..].split_at_mut(size);
+    let object = unsafe { &mut *object_bytes.as_mut_ptr().cast::<MaybeUninit<T>>() };
+
+    (object, remain)
+}
+
+/// Carves space for an array of `len` elements of type `T` off the front of `area`.
+///
+/// Leading bytes are skipped as needed to satisfy the alignment of `T`. Returns the
+/// (uninitialized) array and the rest of the area after it.
+///
+/// # Panics
+///
+/// Panics with "out of memory" if `area` is too small. That includes the case where the
+/// requested size in bytes does not fit in a `usize`: the size is computed with checked
+/// arithmetic, so that an overflowing request cannot slip past the bounds check and yield a
+/// slice that extends beyond `area`.
+pub fn alloc_array_from_slice<T>(
+    area: &mut [MaybeUninit<u8>],
+    len: usize,
+) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
+    let layout = std::alloc::Layout::new::<T>();
+
+    let area_start = area.as_mut_ptr();
+
+    // pad to satisfy alignment requirements
+    let padding = area_start.align_offset(layout.align());
+
+    let Some(array_size) = layout.size().checked_mul(len) else {
+        panic!("out of memory");
+    };
+    match padding.checked_add(array_size) {
+        Some(needed) if needed <= area.len() => {}
+        _ => panic!("out of memory"),
+    }
+
+    let area = &mut area[padding..];
+    let (result_area, remain) = area.split_at_mut(array_size);
+
+    // Build the slice directly from the raw pointer. Going through a reference to the first
+    // element would restrict the pointer to that one element.
+    let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
+    let result = unsafe { std::slice::from_raw_parts_mut(result_ptr, len) };
+
+    (result, remain)
+}
--- a/libs/neonart/src/epoch.rs
+++ b/libs/neonart/src/epoch.rs
@@ -0,0 +1,142 @@
+//! This is similar to crossbeam_epoch crate, but works in shared memory
+
+use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
+
+use crossbeam_utils::CachePadded;
+
+/// Number of participant slots in [`EpochShared`]. Every active pin occupies one slot.
+const NUM_SLOTS: usize = 1000;
+
+/// This is the struct that is stored in shmem
+///
+/// Each participant slot holds a 64-bit word:
+///
+/// - bit 0: is the slot pinned or not?
+/// - the rest of the bits are the epoch counter.
+///
+/// Because bit 0 is reserved for the pin flag, epochs are always even and the global epoch
+/// advances in steps of 2.
+pub struct EpochShared {
+    /// The current global epoch.
+    global_epoch: AtomicU64,
+    /// One slot per (potential) concurrent pin.
+    participants: [CachePadded<AtomicU64>; NUM_SLOTS],
+
+    /// Held (via `try_lock`) while `broadcast` runs, so that only one broadcast runs at a time.
+    broadcast_lock: spin::Mutex<()>,
+}
+
+impl EpochShared {
+    /// Creates the shared epoch state with the global epoch and all slots set to 2, unpinned.
+    pub fn new() -> EpochShared {
+        EpochShared {
+            global_epoch: AtomicU64::new(2),
+            participants: [const { CachePadded::new(AtomicU64::new(2)) }; NUM_SLOTS],
+            broadcast_lock: spin::Mutex::new(()),
+        }
+    }
+
+    /// Creates a handle through which the caller can pin epochs.
+    ///
+    /// The handle remembers the slot it used last, as a hint for where to start looking for a
+    /// free slot on the next pin.
+    pub fn register(&self) -> LocalHandle {
+        LocalHandle {
+            global: self,
+            last_slot: AtomicUsize::new(0), // todo: choose more intelligently
+        }
+    }
+
+    /// Releases the pin on `slot`, by overwriting the slot with the current global epoch (which
+    /// has the pin bit clear).
+    ///
+    /// The store uses `Release` ordering, so that all memory accesses made while pinned are
+    /// ordered before the slot becomes visibly unpinned. With a `Relaxed` store, such accesses
+    /// could be reordered after the unpin, i.e. after the point where the memory may be
+    /// recycled.
+    fn release_pin(&self, slot: usize, _epoch: u64) {
+        let global_epoch = self.global_epoch.load(Ordering::Relaxed);
+        self.participants[slot].store(global_epoch, Ordering::Release);
+    }
+
+    /// Pins a free slot, starting the search at `slot_hint`.
+    ///
+    /// Returns the slot that was pinned and the epoch that was stored in it, which becomes the
+    /// epoch of the pin.
+    ///
+    /// The pin bit is set with `SeqCst` ordering, so that reads made under the protection of
+    /// the pin cannot be reordered before the pin itself.
+    /// NOTE(review): for full store-load ordering the reclaiming side needs a matching
+    /// `SeqCst` operation or fence before it reads the slots -- confirm.
+    ///
+    /// NOTE(review): if all slots are pinned, this spins until one is released.
+    fn pin_internal(&self, slot_hint: usize) -> (usize, u64) {
+        // pick a slot
+        let mut slot = slot_hint;
+        let epoch = loop {
+            let old = self.participants[slot].fetch_or(1, Ordering::SeqCst);
+            if old & 1 == 0 {
+                // Got this slot
+                break old;
+            }
+
+            // the slot was busy by another thread / process. try a different slot
+            slot += 1;
+            if slot == NUM_SLOTS {
+                slot = 0;
+            }
+        };
+        (slot, epoch)
+    }
+
+    /// Moves the global epoch forward by one step (2, because bit 0 is the pin flag) and
+    /// returns the new epoch.
+    pub(crate) fn advance(&self) -> u64 {
+        // Anyone who releases their pin after this will store the new epoch in their slot.
+        let previous = self.global_epoch.fetch_add(2, Ordering::Relaxed);
+        previous + 2
+    }
+
+    /// Brings all unpinned slots up to the current global epoch.
+    ///
+    /// Does nothing if another broadcast is already in progress.
+    ///
+    /// Every unpinned slot that is behind the global epoch is updated, no matter how far behind
+    /// it is. Updating only the slots that are exactly one step behind is not enough: if the
+    /// epoch is advanced twice between two broadcasts (for example because one broadcast was
+    /// skipped when the lock was busy), the slots would never match again, and would hold back
+    /// `get_oldest` until they happened to be pinned and released.
+    pub(crate) fn broadcast(&self) {
+        let Some(_guard) = self.broadcast_lock.try_lock() else {
+            return;
+        };
+
+        let epoch = self.global_epoch.load(Ordering::Relaxed);
+
+        // Update all free slots.
+        for slot in self.participants.iter() {
+            let current = slot.load(Ordering::Relaxed);
+            if current & 1 != 0 {
+                // pinned, leave it alone
+                continue;
+            }
+            let behind = epoch.wrapping_sub(current);
+            if behind == 0 || behind > u64::MAX / 2 {
+                // already up to date, or already ahead of the epoch we read above
+                continue;
+            }
+            // If this fails, the slot was concurrently pinned or updated. Either is fine.
+            let _ = slot.compare_exchange(current, epoch, Ordering::Relaxed, Ordering::Relaxed);
+        }
+
+        // FIXME: memory fence here, since we used Relaxed?
+    }
+
+    /// Returns the oldest epoch found in any slot.
+    ///
+    /// The pin flag (bit 0) is masked off before comparing. Otherwise a slot pinned at epoch E
+    /// would read as E + 1, i.e. newer than E, and garbage tagged with epoch E could be
+    /// recycled while a reader pinned at E can still hold pointers to it.
+    ///
+    /// The slots are loaded with `Acquire` ordering, so that when a slot is observed as
+    /// released, the accesses that were made under that pin are known to have completed
+    /// (provided that the release is done with `Release` ordering).
+    pub(crate) fn get_oldest(&self) -> u64 {
+        // Read all slots.
+        let now = self.global_epoch.load(Ordering::Relaxed);
+        let mut oldest = now;
+        for slot in self.participants.iter() {
+            let this_epoch = slot.load(Ordering::Acquire) & !1;
+            let delta = now.wrapping_sub(this_epoch);
+            if delta > u64::MAX / 2 {
+                // this is very recent
+            } else if delta > now.wrapping_sub(oldest) {
+                oldest = this_epoch;
+            }
+        }
+        oldest
+    }
+
+    /// Returns the current global epoch.
+    pub(crate) fn get_current(&self) -> u64 {
+        self.global_epoch.load(Ordering::Relaxed)
+    }
+}
+
+/// An active pin. The slot is released when this is dropped.
+pub(crate) struct EpochPin<'e> {
+    /// Index of the pinned slot.
+    slot: usize,
+    /// The epoch that was stored in the slot when it was pinned.
+    pub(crate) epoch: u64,
+
+    handle: &'e LocalHandle<'e>,
+}
+
+/// Dropping the pin releases its slot.
+impl<'e> Drop for EpochPin<'e> {
+    fn drop(&mut self) {
+        let shared = self.handle.global;
+        shared.release_pin(self.slot, self.epoch);
+    }
+}
+
+/// Handle for pinning epochs, created with [`EpochShared::register`].
+pub struct LocalHandle<'g> {
+    global: &'g EpochShared,
+
+    /// The slot used by the most recent pin. Used as the starting point for the next pin.
+    last_slot: AtomicUsize,
+}
+
+impl<'g> LocalHandle<'g> {
+    /// Pins the current epoch. The pin lasts until the returned guard is dropped.
+    pub fn pin(&self) -> EpochPin {
+        // Start the search from the slot we used last time, and remember the one we got.
+        let hint = self.last_slot.load(Ordering::Relaxed);
+        let (slot, epoch) = self.global.pin_internal(hint);
+        self.last_slot.store(slot, Ordering::Relaxed);
+
+        EpochPin {
+            slot,
+            epoch,
+            handle: self,
+        }
+    }
+}
--- a/libs/neonart/src/lib.rs
+++ b/libs/neonart/src/lib.rs
@@ -0,0 +1,583 @@
+//! Adaptive Radix Tree (ART) implementation, with Optimistic Lock Coupling.
+//!
+//! The data structure is described in these two papers:
+//!
+//! [1] Leis, V. & Kemper, Alfons & Neumann, Thomas. (2013).
+//!     The adaptive radix tree: ARTful indexing for main-memory databases.
+//!     Proceedings - International Conference on Data Engineering. 38-49. 10.1109/ICDE.2013.6544812.
+//!     https://db.in.tum.de/~leis/papers/ART.pdf
+//!
+//! [2] Leis, Viktor & Scheibner, Florian & Kemper, Alfons & Neumann, Thomas. (2016).
+//!     The ART of practical synchronization.
+//!     1-8. 10.1145/2933349.2933352.
+//!     https://db.in.tum.de/~leis/papers/artsync.pdf
+//!
+//! [1] describes the base data structure, and [2] describes the Optimistic Lock Coupling that we
+//! use.
+//!
+//! The papers mention a few different variants. We have made the following choices in this
+//! implementation:
+//!
+//! - All keys have the same length
+//!
+//! - Single-value leaves.
+//!
+//! - For collapsing inner nodes, we use the Pessimistic approach, where each inner node stores a
+//!   variable length "prefix", which stores the keys of all the one-way nodes which have been
+//!   removed. However, similar to the "hybrid" approach described in the paper, each node only has
+//!   space for a constant-size prefix of 8 bytes. If a node would have a longer prefix, then we
+//!   create one-way nodes to store them. (There was no particular reason for this choice,
+//!   the "hybrid" approach described in the paper might be better.)
+//!
+//! - For concurrency, we use Optimistic Lock Coupling. The paper [2] also describes another method,
+//!   ROWEX, which generally performs better when there is contention, but that is not important
+//!   for us, and Optimistic Lock Coupling is simpler to implement.
+//!
+//! ## Requirements
+//!
+//! This data structure is currently used for the integrated LFC, relsize and last-written LSN cache
+//! in the compute communicator, part of the 'neon' Postgres extension. We have some unique
+//! requirements, which is why we had to write our own. Namely:
+//!
+//! - The data structure has to live in fixed-sized shared memory segment. That rules out any
+//!   built-in Rust collections and most crates. (Except possibly with the 'allocator_api' rust
+//!   feature, which is still nightly-only and experimental as of this writing).
+//!
+//! - The data structure is accessed from multiple processes. Only one process updates the data
+//!   structure, but other processes perform reads. That rules out using built-in Rust locking
+//!   primitives like Mutex and RwLock, and most crates too.
+//!
+//! - Within the one process with write-access, multiple threads can perform updates concurrently.
+//!   That rules out using PostgreSQL LWLocks for the locking.
+//!
+//! The implementation is generic, and doesn't depend on any PostgreSQL specifics, but it has been
+//! written with that usage and the above constraints in mind. Some noteworthy assumptions:
+//!
+//! - Contention is assumed to be rare. In the integrated cache in PostgreSQL, there's higher level
+//!   locking in the PostgreSQL buffer manager, which ensures that two backends should not try to
+//!   read / write the same page at the same time. (Prefetching can conflict with actual reads,
+//!   however.)
+//!
+//! - The keys in the integrated cache are 17 bytes long.
+//!
+//! ## Usage
+//!
+//! Because this is designed to be used as a Postgres shared memory data structure, initialization
+//! happens in the following stages:
+//!
+//! 0. A fixed area of shared memory is allocated at postmaster startup.
+//!
+//! 1. TreeInitStruct::new() is called to initialize it, still in Postmaster process, before any
+//!    other process or thread is running. It returns a TreeInitStruct, which is inherited by all
+//!    the processes through fork().
+//!
+//! 2. One process may have write-access to the struct, by calling
+//!    [TreeInitStruct::attach_writer]. (That process is the communicator process.)
+//!
+//! 3. Other processes get read-access to the struct, by calling [TreeInitStruct::attach_reader]
+//!
+//! "Write access" means that you can insert / update / delete values in the tree.
+//!
+//! NOTE: The Values stored in the tree are sometimes moved, when a leaf node fills up and a new
+//! larger node needs to be allocated. The versioning and epoch-based allocator ensure that the data
+//! structure stays consistent, but if the Value has interior mutability, like atomic fields,
+//! updates to such fields might be lost if the leaf node is concurrently moved! If that becomes a
+//! problem, the version check could be passed up to the caller, so that the caller could detect the
+//! lost updates and retry the operation.
+//!
+//! ## Implementation
+//!
+//! node_ptr.rs: Provides low-level implementations of the four different node types (eight
+//! actually, since there is an Internal and Leaf variant of each)
+//!
+//! lock_and_version.rs: Provides an abstraction for the combined lock and version counter on each
+//! node.
+//!
+//! node_ref.rs: The code in node_ptr.rs deals with raw pointers. node_ref.rs provides more type-safe
+//!   abstractions on top.
+//!
+//! algorithm.rs: Contains the functions to implement lookups and updates in the tree
+//!
+//! allocator.rs: Provides a facility to allocate memory for the tree nodes. (We must provide our
+//!   own abstraction for that because we need the data structure to live in a pre-allocated shared
+//!   memory segment).
+//!
+//! epoch.rs: The data structure requires that when a node is removed from the tree, it is not
+//!   immediately deallocated, but stays around for as long as concurrent readers might still have
+//!   pointers to them. This is enforced by an epoch system. This is similar to
+//!   e.g. crossbeam_epoch, but we couldn't use that either because it has to work across processes
+//!   communicating over the shared memory segment.
+//!
+//! ## See also
+//!
+//! There are some existing Rust ART implementations out there, but none of them filled all
+//! the requirements:
+//!
+//! - https://github.com/XiangpengHao/congee
+//! - https://github.com/declanvk/blart
+//!
+//! ## TODO
+//!
+//! - Removing values has not been implemented
+
+mod algorithm;
+pub mod allocator;
+mod epoch;
+
+use algorithm::RootPtr;
+use algorithm::node_ptr::NodePtr;
+
+use std::collections::VecDeque;
+use std::fmt::Debug;
+use std::marker::PhantomData;
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicBool, Ordering};
+
+use crate::epoch::EpochPin;
+
+#[cfg(test)]
+mod tests;
+
+use allocator::ArtAllocator;
+pub use allocator::ArtMultiSlabAllocator;
+pub use allocator::OutOfMemoryError;
+
+/// Fixed-length key type.
+///
+/// All keys of one type have the same length, [`Key::KEY_LEN`].
+pub trait Key: Debug {
+    /// Length of the key in bytes.
+    const KEY_LEN: usize;
+
+    /// The key as a byte string.
+    /// NOTE(review): presumably always exactly `KEY_LEN` bytes long; `TreeIterator::new`
+    /// asserts that for its range bounds.
+    fn as_bytes(&self) -> &[u8];
+}
+
+/// Values stored in the tree
+///
+/// This is a marker trait without any requirements of its own. (Values used to be required to
+/// be cloneable, but that is no longer the case.)
+pub trait Value {}
+
+/// Initial capacity of the garbage queue.
+/// NOTE(review): despite the name, nothing visible here enforces this as an upper limit.
+const MAX_GARBAGE: usize = 1024;
+
+/// The root of the tree, plus other tree-wide data. This is stored in the shared memory.
+pub struct Tree<V: Value> {
+    /// For simplicity, so that we never need to grow or shrink the root, the root node is always an
+    /// Internal256 node. Also, it never has a prefix (that's actually a bit wasteful, incurring one
+    /// indirection to every lookup)
+    root: RootPtr<V>,
+
+    /// Set when a writer attaches. Used to detect a second writer attaching.
+    writer_attached: AtomicBool,
+
+    /// Epoch state, used to defer recycling of obsolete nodes.
+    epoch: epoch::EpochShared,
+}
+
+// SAFETY: NOTE(review): presumably sound because all access to the nodes behind `root` goes
+// through the versioned locks and the epoch system -- confirm against algorithm.rs.
+unsafe impl<V: Value + Sync> Sync for Tree<V> {}
+unsafe impl<V: Value + Send> Send for Tree<V> {}
+
+/// Queue of obsolete nodes, each paired with the epoch in which it became obsolete.
+///
+/// New entries are pushed to the front and expired entries are popped from the back, so the
+/// entry at the back is the one that was added first.
+struct GarbageQueue<V>(VecDeque<(NodePtr<V>, u64)>);
+
+// SAFETY: NOTE(review): the queue holds raw node pointers; these impls assert that it is OK to
+// move and share them across threads -- confirm against the NodePtr definition.
+unsafe impl<V: Value + Sync> Sync for GarbageQueue<V> {}
+unsafe impl<V: Value + Send> Send for GarbageQueue<V> {}
+
+impl<V> GarbageQueue<V> {
+    /// Creates an empty queue with room for `MAX_GARBAGE` entries.
+    fn new() -> GarbageQueue<V> {
+        let entries = VecDeque::with_capacity(MAX_GARBAGE);
+        GarbageQueue(entries)
+    }
+
+    /// Adds a node that became obsolete in `epoch`.
+    fn remember_obsolete_node(&mut self, ptr: NodePtr<V>, epoch: u64) {
+        let entry = (ptr, epoch);
+        self.0.push_front(entry);
+    }
+
+    /// Removes and returns the entry at the back of the queue, if its epoch is older than
+    /// `cutoff_epoch`. Returns None if the queue is empty or the entry is not old enough.
+    fn next_obsolete(&mut self, cutoff_epoch: u64) -> Option<NodePtr<V>> {
+        let expired = self
+            .0
+            .back()
+            .is_some_and(|(_, epoch)| *epoch < cutoff_epoch);
+        if !expired {
+            return None;
+        }
+        self.0.pop_back().map(|(ptr, _)| ptr)
+    }
+}
+
+/// Struct created at postmaster startup
+///
+/// It is turned into a [`TreeWriteAccess`] or [`TreeReadAccess`] by calling
+/// [`TreeInitStruct::attach_writer`] or [`TreeInitStruct::attach_reader`].
+pub struct TreeInitStruct<'t, K: Key, V: Value, A: ArtAllocator<V>> {
+    tree: &'t Tree<V>,
+
+    allocator: &'t A,
+
+    phantom_key: PhantomData<K>,
+}
+
+/// The worker process has a reference to this. The write operations are only safe
+/// from the worker process
+///
+/// Use [`TreeWriteAccess::start_write`] or [`TreeWriteAccess::start_read`] to perform
+/// operations on the tree.
+pub struct TreeWriteAccess<'t, K: Key, V: Value, A: ArtAllocator<V>>
+where
+    K: Key,
+    V: Value,
+{
+    tree: &'t Tree<V>,
+
+    /// The allocator that tree nodes are allocated from.
+    pub allocator: &'t A,
+
+    /// Handle for pinning the epoch for the duration of each operation.
+    epoch_handle: epoch::LocalHandle<'t>,
+
+    phantom_key: PhantomData<K>,
+
+    /// Obsolete nodes that cannot be recycled until their epoch expires.
+    garbage: spin::Mutex<GarbageQueue<V>>,
+}
+
+/// The backends have a reference to this. It cannot be used to modify the tree
+///
+/// Use [`TreeReadAccess::start_read`] to perform lookups.
+pub struct TreeReadAccess<'t, K: Key, V: Value>
+where
+    K: Key,
+    V: Value,
+{
+    tree: &'t Tree<V>,
+
+    /// Handle for pinning the epoch for the duration of each operation.
+    epoch_handle: epoch::LocalHandle<'t>,
+
+    phantom_key: PhantomData<K>,
+}
+
+impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeInitStruct<'t, K, V, A> {
+    /// Allocates and initializes an empty tree, using memory from `allocator`.
+    ///
+    /// # Panics
+    ///
+    /// Panics with "out of memory" if the tree struct or the root node cannot be allocated.
+    pub fn new(allocator: &'t A) -> TreeInitStruct<'t, K, V, A> {
+        let tree_ptr = NonNull::new(allocator.alloc_tree()).expect("out of memory");
+        let root = algorithm::new_root(allocator).expect("out of memory");
+
+        // Initialize the freshly allocated memory in place.
+        unsafe {
+            tree_ptr.write(Tree {
+                root,
+                writer_attached: AtomicBool::new(false),
+                epoch: epoch::EpochShared::new(),
+            })
+        };
+        let tree = unsafe { tree_ptr.as_ref() };
+
+        TreeInitStruct {
+            tree,
+            allocator,
+            phantom_key: PhantomData,
+        }
+    }
+
+    /// Attaches to the tree as the writer.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a writer has already been attached to the tree.
+    pub fn attach_writer(self) -> TreeWriteAccess<'t, K, V, A> {
+        let was_attached = self.tree.writer_attached.swap(true, Ordering::Relaxed);
+        assert!(!was_attached, "writer already attached");
+
+        let epoch_handle = self.tree.epoch.register();
+        TreeWriteAccess {
+            tree: self.tree,
+            allocator: self.allocator,
+            epoch_handle,
+            phantom_key: PhantomData,
+            garbage: spin::Mutex::new(GarbageQueue::new()),
+        }
+    }
+
+    /// Attaches to the tree as a reader.
+    pub fn attach_reader(self) -> TreeReadAccess<'t, K, V> {
+        let epoch_handle = self.tree.epoch.register();
+        TreeReadAccess {
+            tree: self.tree,
+            epoch_handle,
+            phantom_key: PhantomData,
+        }
+    }
+}
+
+impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteAccess<'t, K, V, A> {
+    /// Begins a write operation. The epoch stays pinned for as long as the guard is alive.
+    pub fn start_write<'g>(&'t self) -> TreeWriteGuard<'g, K, V, A>
+    where
+        't: 'g,
+    {
+        let epoch_pin = self.epoch_handle.pin();
+        TreeWriteGuard {
+            tree_writer: self,
+            epoch_pin,
+            phantom_key: PhantomData,
+            created_garbage: false,
+        }
+    }
+
+    /// Begins a read operation. The epoch stays pinned for as long as the guard is alive.
+    pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
+        let epoch_pin = self.epoch_handle.pin();
+        TreeReadGuard {
+            tree: self.tree,
+            epoch_pin,
+            phantom_key: PhantomData,
+        }
+    }
+}
+
+impl<'t, K: Key, V: Value> TreeReadAccess<'t, K, V> {
+    /// Begins a read operation. The epoch stays pinned for as long as the guard is alive.
+    pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
+        let epoch_pin = self.epoch_handle.pin();
+        TreeReadGuard {
+            tree: self.tree,
+            epoch_pin,
+            phantom_key: PhantomData,
+        }
+    }
+}
+
+/// Guard for a read operation on the tree. Holds an epoch pin, which is released when the
+/// guard is dropped.
+pub struct TreeReadGuard<'e, K, V>
+where
+    K: Key,
+    V: Value,
+{
+    tree: &'e Tree<V>,
+
+    epoch_pin: EpochPin<'e>,
+    phantom_key: PhantomData<K>,
+}
+
+impl<'e, K: Key, V: Value> TreeReadGuard<'e, K, V> {
+    /// Looks up `key`. Returns a reference to the value, or None if the key is not in the tree.
+    pub fn get(&'e self, key: &K) -> Option<&'e V> {
+        let root = self.tree.root;
+        algorithm::search(key, root, &self.epoch_pin)
+    }
+}
+
+/// Guard for a write operation on the tree. Holds an epoch pin, which is released when the
+/// guard is dropped.
+pub struct TreeWriteGuard<'e, K, V, A>
+where
+    K: Key,
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    tree_writer: &'e TreeWriteAccess<'e, K, V, A>,
+
+    epoch_pin: EpochPin<'e>,
+    phantom_key: PhantomData<K>,
+
+    /// Set when the operation has added nodes to the garbage queue. Garbage collection is
+    /// attempted at the end of the operation if this is set.
+    created_garbage: bool,
+}
+
+/// What the closure passed to [`TreeWriteGuard::update_with_fn`] wants done with the key.
+pub enum UpdateAction<V> {
+    /// Leave the tree as it is.
+    Nothing,
+    /// Store this value for the key.
+    Insert(V),
+    /// Remove the key.
+    Remove,
+}
+
+impl<'e, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
+    /// Looks up `key`. Returns a reference to the value, or None if the key is not in the tree.
+    pub fn get(&'e mut self, key: &K) -> Option<&'e V> {
+        let root = self.tree_writer.tree.root;
+        algorithm::search(key, root, &self.epoch_pin)
+    }
+
+    /// Inserts `value` for `key`, unless the key already exists.
+    ///
+    /// Returns Ok(true) if the value was inserted, and Ok(false) if the key already existed, in
+    /// which case the existing value is left unchanged.
+    pub fn insert(self, key: &K, value: V) -> Result<bool, OutOfMemoryError> {
+        let mut inserted = None;
+
+        self.update_with_fn(key, |existing| match existing {
+            Some(_) => {
+                inserted = Some(false);
+                UpdateAction::Nothing
+            }
+            None => {
+                inserted = Some(true);
+                UpdateAction::Insert(value)
+            }
+        })?;
+        Ok(inserted.expect("value_fn not called"))
+    }
+
+    /// Removes `key` from the tree. Returns true if the key existed.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the operation runs out of memory.
+    pub fn remove(self, key: &K) -> bool {
+        let mut existed = false;
+        // FIXME: It's not clear if OOM is expected while removing. It seems
+        // not nice, but shrinking a node can OOM. Then again, we could opt
+        // to not shrink a node if we cannot allocate, to live a little longer.
+        self.update_with_fn(key, |existing| {
+            if existing.is_some() {
+                existed = true;
+                UpdateAction::Remove
+            } else {
+                UpdateAction::Nothing
+            }
+        })
+        .expect("out of memory while removing");
+        existed
+    }
+
+    /// Removes `key` from the tree and returns a clone of the value it had, or None if the key
+    /// did not exist.
+    ///
+    /// Like [`Self::remove`], this requests a removal only when the key actually exists. For a
+    /// missing key, the tree is left untouched.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the operation runs out of memory.
+    pub fn remove_and_return(self, key: &K) -> Option<V>
+    where
+        V: Clone,
+    {
+        let mut old = None;
+        self.update_with_fn(key, |existing| match existing {
+            Some(value) => {
+                old = Some(value.clone());
+                UpdateAction::Remove
+            }
+            None => UpdateAction::Nothing,
+        })
+        .expect("out of memory while removing");
+        old
+    }
+
+    /// Update key using the given function. All the other modifying operations are based on this.
+    ///
+    /// The function is passed a reference to the existing value, if any, and returns an
+    /// [`UpdateAction`] that tells what to do with the key:
+    ///
+    /// - [`UpdateAction::Nothing`]: leave the tree as it is
+    /// - [`UpdateAction::Insert`]: store the given value for the key
+    /// - [`UpdateAction::Remove`]: remove the key
+    ///
+    /// If the operation left obsolete nodes behind, garbage collection is attempted before
+    /// returning.
+    pub fn update_with_fn<F>(mut self, key: &K, value_fn: F) -> Result<(), OutOfMemoryError>
+    where
+        F: FnOnce(Option<&V>) -> UpdateAction<V>,
+    {
+        algorithm::update_fn(key, value_fn, self.tree_writer.tree.root, &mut self)?;
+
+        if self.created_garbage {
+            let _ = self.collect_garbage();
+        }
+        Ok(())
+    }
+
+    /// Queues `ptr` for recycling, tagged with the epoch of this operation's pin, and notes
+    /// that this operation created garbage.
+    fn remember_obsolete_node(&mut self, ptr: NodePtr<V>) {
+        let epoch = self.epoch_pin.epoch;
+        {
+            // Hold the queue lock only for the push.
+            let mut queue = self.tree_writer.garbage.lock();
+            queue.remember_obsolete_node(ptr, epoch);
+        }
+        self.created_garbage = true;
+    }
+
+    /// Advances the epoch, and recycles all queued nodes that are older than the oldest epoch
+    /// still in use.
+    ///
+    /// Returns the number of nodes recycled.
+    fn collect_garbage(&self) -> usize {
+        let writer = self.tree_writer;
+        let epochs = &writer.tree.epoch;
+
+        epochs.advance();
+        epochs.broadcast();
+        let cutoff_epoch = epochs.get_oldest();
+
+        let mut recycled = 0;
+        let mut queue = writer.garbage.lock();
+        while let Some(ptr) = queue.next_obsolete(cutoff_epoch) {
+            ptr.deallocate(writer.allocator);
+            recycled += 1;
+        }
+        recycled
+    }
+}
+
+/// Cursor for iterating over the keys of a tree in order.
+///
+/// The iterator does not hold a reference to the tree. A read guard is passed to each call of
+/// [`TreeIterator::next`] instead.
+pub struct TreeIterator<K>
+where
+    K: Key + for<'a> From<&'a [u8]>,
+{
+    /// Set when a range iterator has reached the end of its range.
+    done: bool,
+    /// The key at which the next call will resume the scan.
+    pub next_key: Vec<u8>,
+    /// Exclusive upper bound of the range, or None for a wrapping iterator.
+    max_key: Option<Vec<u8>>,
+
+    phantom_key: PhantomData<K>,
+}
+
+impl<K> TreeIterator<K>
+where
+    K: Key + for<'a> From<&'a [u8]>,
+{
+    /// Creates an iterator without an upper bound. It starts at the all-zeros key, and wraps
+    /// around to the beginning when it reaches the end of the tree.
+    pub fn new_wrapping() -> TreeIterator<K> {
+        let next_key = vec![0u8; K::KEY_LEN];
+        TreeIterator {
+            done: false,
+            next_key,
+            max_key: None,
+            phantom_key: PhantomData,
+        }
+    }
+
+    /// Creates an iterator over the keys in `range`. The end of the range is exclusive.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the byte representation of either bound is not `K::KEY_LEN` bytes long.
+    pub fn new(range: &std::ops::Range<K>) -> TreeIterator<K> {
+        let next_key = range.start.as_bytes().to_vec();
+        let max_key = range.end.as_bytes().to_vec();
+        assert_eq!(next_key.len(), K::KEY_LEN);
+        assert_eq!(max_key.len(), K::KEY_LEN);
+
+        TreeIterator {
+            done: false,
+            next_key,
+            max_key: Some(max_key),
+            phantom_key: PhantomData,
+        }
+    }
+
+    /// Returns the next key and value, or None when there is nothing more to return.
+    ///
+    /// A range iterator returns None once it has reached the end of its range, and on all calls
+    /// after that. A wrapping iterator restarts from the all-zeros key when it reaches the end
+    /// of the tree, and returns None only if it finds nothing after restarting.
+    pub fn next<'g, V>(&mut self, read_guard: &'g TreeReadGuard<'g, K, V>) -> Option<(K, &'g V)>
+    where
+        V: Value,
+    {
+        if self.done {
+            return None;
+        }
+
+        let mut wrapped_around = false;
+        loop {
+            assert_eq!(self.next_key.len(), K::KEY_LEN);
+            let found =
+                algorithm::iter_next(&self.next_key, read_guard.tree.root, &read_guard.epoch_pin);
+
+            let Some((k, v)) = found else {
+                // Nothing at or after next_key.
+                if self.max_key.is_some() {
+                    self.done = true;
+                    return None;
+                }
+                if wrapped_around {
+                    // The tree is completely empty
+                    // FIXME: perhaps we should remember the starting point instead.
+                    // Currently this will scan some ranges twice.
+                    return None;
+                }
+                // Start from beginning
+                self.next_key.fill(0);
+                wrapped_around = true;
+                continue;
+            };
+
+            assert_eq!(k.len(), K::KEY_LEN);
+            assert_eq!(self.next_key.len(), K::KEY_LEN);
+
+            // Check if we reached the end of the range
+            let past_end = self
+                .max_key
+                .as_ref()
+                .is_some_and(|max_key| k.as_slice() >= max_key.as_slice());
+            if past_end {
+                self.done = true;
+                return None;
+            }
+
+            // The next call resumes right after the key we are about to return.
+            self.next_key = k.clone();
+            increment_key(self.next_key.as_mut_slice());
+
+            return Some((k.as_slice().into(), v));
+        }
+    }
+}
+
+/// Increments `key`, treating it as a big-endian unsigned integer.
+///
+/// Returns true if the increment overflowed, in which case the key has wrapped around to all
+/// zeros. (For an empty key, that is always the case.)
+fn increment_key(key: &mut [u8]) -> bool {
+    // Start at the least significant byte and propagate the carry.
+    for byte in key.iter_mut().rev() {
+        *byte = byte.wrapping_add(1);
+        if *byte != 0 {
+            // no carry out of this byte, so we're done
+            return false;
+        }
+    }
+    true
+}
+
+// Debugging functions
+impl<'e, K: Key, V: Value + Debug, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
+    /// Writes a dump of the tree to `dst`, for debugging.
+    pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
+        algorithm::dump_tree(self.tree_writer.tree.root, &self.epoch_pin, dst)
+    }
+}
+impl<'e, K: Key, V: Value + Debug> TreeReadGuard<'e, K, V> {
+    /// Writes a dump of the tree to `dst`, for debugging.
+    pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
+        algorithm::dump_tree(self.tree.root, &self.epoch_pin, dst)
+    }
+}
+impl<'e, K: Key, V: Value> TreeWriteAccess<'e, K, V, ArtMultiSlabAllocator<'e, V>> {
+    /// Returns statistics on the allocators, the epoch system and the garbage queue.
+    ///
+    /// The values are gathered one after another, without stopping concurrent activity, so
+    /// they are not necessarily consistent with each other.
+    pub fn get_statistics(&self) -> ArtTreeStatistics {
+        // Note: the slab statistics are fetched just once, in the struct expression below.
+        // There used to be an additional call here, the result of which was thrown away.
+        ArtTreeStatistics {
+            blocks: self.allocator.inner.block_allocator.get_statistics(),
+            slabs: self.allocator.get_statistics(),
+            epoch: self.tree.epoch.get_current(),
+            oldest_epoch: self.tree.epoch.get_oldest(),
+            num_garbage: self.garbage.lock().0.len() as u64,
+        }
+    }
+}
+
+/// Statistics returned by `TreeWriteAccess::get_statistics`.
+#[derive(Clone, Debug)]
+pub struct ArtTreeStatistics {
+    /// Statistics of the block allocator.
+    pub blocks: allocator::block::BlockAllocatorStats,
+    /// Statistics of the slab allocators.
+    pub slabs: allocator::ArtMultiSlabStats,
+
+    /// The current global epoch.
+    pub epoch: u64,
+    /// The oldest epoch found in any epoch slot.
+    pub oldest_epoch: u64,
+    /// Number of obsolete nodes waiting in the garbage queue.
+    pub num_garbage: u64,
+}
--- a/libs/neonart/src/tests.rs
+++ b/libs/neonart/src/tests.rs
@@ -0,0 +1,236 @@
+use std::collections::BTreeMap;
+use std::collections::HashSet;
+use std::fmt::{Debug, Formatter};
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use crate::ArtAllocator;
+use crate::ArtMultiSlabAllocator;
+use crate::TreeInitStruct;
+use crate::TreeIterator;
+use crate::TreeWriteAccess;
+use crate::UpdateAction;
+
+use crate::{Key, Value};
+
+use rand::Rng;
+use rand::seq::SliceRandom;
+use rand_distr::Zipf;
+
+const TEST_KEY_LEN: usize = 16;
+/// Fixed-size key used by these tests; 16 bytes, so it converts to and from a u128.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+struct TestKey([u8; TEST_KEY_LEN]);
+
+impl TestKey {
+    const MIN: TestKey = TestKey([0; TEST_KEY_LEN]); // all-zero key
+    const MAX: TestKey = TestKey([u8::MAX; TEST_KEY_LEN]); // all-0xff key
+}
+
+impl Key for TestKey {
+    const KEY_LEN: usize = TEST_KEY_LEN;
+    fn as_bytes(&self) -> &[u8] {
+        &self.0 // the raw byte array, borrowed
+    }
+}
+
+impl From<&TestKey> for u128 {
+    fn from(key: &TestKey) -> Self {
+        Self::from_be_bytes(key.0) // bytes are read big-endian
+    }
+}
+
+impl From<u128> for TestKey {
+    fn from(number: u128) -> Self {
+        Self(number.to_be_bytes()) // bytes are written big-endian
+    }
+}
+
+impl<'a> From<&'a [u8]> for TestKey {
+    fn from(raw: &'a [u8]) -> Self {
+        Self(raw.try_into().unwrap()) // panics unless `raw` is exactly TEST_KEY_LEN bytes long
+    }
+}
+
+impl Value for usize {} // no items to implement; lets test_inserts store plain indices as tree values
+
+/// Insert each key with its slice index as the value, then verify that every key reads back.
+fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
+    const MEM_SIZE: usize = 10000000;
+    let mut area = Box::new_uninit_slice(MEM_SIZE);
+    let allocator = ArtMultiSlabAllocator::new(&mut area);
+    let init_struct = TreeInitStruct::<TestKey, usize, _>::new(allocator);
+    let tree_writer = init_struct.attach_writer();
+    // One guard per operation: first write every key, then read every key back.
+    for (idx, k) in keys.iter().copied().enumerate() {
+        let key: TestKey = k.into();
+        let guard = tree_writer.start_write();
+        let res = guard.insert(&key, idx);
+        assert!(res.is_ok());
+    }
+
+    for (idx, k) in keys.iter().copied().enumerate() {
+        let key: TestKey = k.into();
+        let guard = tree_writer.start_read();
+        assert_eq!(guard.get(&key), Some(&idx));
+    }
+
+    eprintln!("stats: {:?}", tree_writer.get_statistics());
+}
+
+#[test]
+fn dense() {
+    // A small key set that exercises splitting a node with a prefix
+    let split_keys: [u128; 5] = [0, 1, 2, 3, 256];
+    test_inserts(&split_keys);
+
+    // 10000 consecutive keys, first in ascending order ...
+    let mut keys = (0..10000).collect::<Vec<u128>>();
+    test_inserts(&keys);
+
+    // ... then nine more times, each time in a freshly shuffled order
+    for _round in 0..9 {
+        keys.shuffle(&mut rand::rng());
+        test_inserts(&keys);
+    }
+}
+
+#[test]
+fn sparse() {
+    // Sparse keys: distinct random values drawn from the whole u128 range,
+    // kept in the order in which they were drawn.
+    const NUM_KEYS: usize = 10000;
+    let mut keys: Vec<TestKey> = Vec::new();
+    let mut used_keys = HashSet::new();
+
+    while keys.len() < NUM_KEYS {
+        let key = rand::random::<u128>();
+        // `insert` returns false for a value that was drawn before; draw again then
+        if used_keys.insert(key) {
+            keys.push(key.into());
+        }
+    }
+
+    test_inserts(&keys);
+}
+
+/// Test value wrapping an atomic, so a stored value can be overwritten through a shared reference.
+struct TestValue(AtomicUsize);
+impl TestValue {
+    fn new(val: usize) -> Self {
+        Self(AtomicUsize::new(val))
+    }
+
+    fn load(&self) -> usize {
+        self.0.load(Ordering::Relaxed)
+    }
+}
+
+impl Value for TestValue {} // no items to implement; lets random_ops store TestValue in the tree
+
+impl Clone for TestValue {
+    fn clone(&self) -> Self {
+        Self::new(self.load()) // copies the current value into a fresh atomic
+    }
+}
+
+impl Debug for TestValue {
+    fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
+        write!(fmt, "{:?}", self.load()) // prints only the currently stored number
+    }
+}
+
+#[derive(Clone, Debug)]
+struct TestOp(TestKey, Option<usize>); // (key, Some(v) = set the key to v / None = remove the key)
+
+/// Apply `op` to the shadow map and to the ART tree, asserting both held the same old value.
+fn apply_op<A: ArtAllocator<TestValue>>(
+    op: &TestOp,
+    tree: &TreeWriteAccess<TestKey, TestValue, A>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    eprintln!("applying op: {op:?}");
+    let TestOp(key, new_value) = op;
+
+    // The shadow map goes first; what it held before is the expected old value.
+    let shadow_existing = match new_value {
+        Some(v) => shadow.insert(*key, *v),
+        None => shadow.remove(key),
+    };
+
+    let w = tree.start_write();
+    w.update_with_fn(key, |existing| {
+        assert_eq!(existing.map(TestValue::load), shadow_existing);
+        match (existing, *new_value) {
+            (None, None) => UpdateAction::Nothing,
+            (None, Some(fresh)) => UpdateAction::Insert(TestValue::new(fresh)),
+            (Some(_), None) => UpdateAction::Remove,
+            // An existing value is overwritten in place through its atomic.
+            (Some(current), Some(fresh)) => {
+                current.0.store(fresh, Ordering::Relaxed);
+                UpdateAction::Nothing
+            }
+        }
+    })
+    .expect("out of memory");
+}
+
+/// Step a `TreeIterator` and the shadow map's iterator in lockstep; panic on the first difference.
+fn test_iter<A: ArtAllocator<TestValue>>(
+    tree: &TreeWriteAccess<TestKey, TestValue, A>,
+    shadow: &BTreeMap<TestKey, usize>,
+) {
+    let mut shadow_iter = shadow.iter();
+    let mut iter = TreeIterator::new(&(TestKey::MIN..TestKey::MAX)); // NOTE(review): half-open range; presumably skips a key equal to MAX — confirm
+    loop {
+        let shadow_item = shadow_iter.next().map(|(k, v)| (*k, *v));
+        let r = tree.start_read(); // a fresh read guard for every step
+        let item = iter.next(&r);
+        // On a mismatch, print the tree and the shadow map to stderr before panicking.
+        if shadow_item != item.map(|(k, v)| (k, v.load())) {
+            eprintln!("FAIL: iterator returned {item:?}, expected {shadow_item:?}");
+            tree.start_read().dump(&mut std::io::stderr());
+
+            eprintln!("SHADOW:");
+            for si in shadow {
+                eprintln!("key: {:?}, val: {}", si.0, si.1);
+            }
+            panic!("FAIL: iterator returned {item:?}, expected {shadow_item:?}");
+        }
+        if item.is_none() {
+            break;
+        }
+    }
+}
+
+/// Apply 100000 random set/remove operations to the tree and to a BTreeMap shadow copy.
+#[test]
+fn random_ops() {
+    const MEM_SIZE: usize = 10000000;
+    let mut area = Box::new_uninit_slice(MEM_SIZE);
+
+    let allocator = ArtMultiSlabAllocator::new(&mut area);
+    let init_struct = TreeInitStruct::<TestKey, TestValue, _>::new(allocator);
+    let tree_writer = init_struct.attach_writer();
+
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    // Keys are sampled from a Zipf distribution built with the parameters (u128::MAX as f64, 1.1).
+    let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
+    let mut rng = rand::rng();
+    for i in 0..100000 {
+        let mut key: TestKey = (rng.sample(distribution) as u128).into();
+        // With probability 0.10 the low 32 bits of the key are forced to ones.
+        if rng.random_bool(0.10) {
+            key = TestKey::from(u128::from(&key) | 0xffffffff);
+        }
+        // With probability 0.75 the op sets the key to `i`; otherwise it removes the key.
+        let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
+
+        apply_op(&op, &tree_writer, &mut shadow);
+        // Every 1000 ops (including the very first) compare a full iteration against the shadow map.
+        if i % 1000 == 0 {
+            eprintln!("{i} ops processed");
+            eprintln!("stats: {:?}", tree_writer.get_statistics());
+            test_iter(&tree_writer, &shadow);
+        }
+    }
+}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -54,6 +54,7 @@ pageserver_api.workspace = true
 pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
 pageserver_compaction.workspace = true
 pageserver_page_api.workspace = true
+peekable.workspace = true
 pem.workspace = true
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
@@ -66,6 +67,7 @@ postgres-types.workspace = true
 posthog_client_lite.workspace = true
 pprof.workspace = true
 pq_proto.workspace = true
+prost.workspace = true
 rand.workspace = true
 range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
--- a/pageserver/client_grpc/Cargo.toml
+++ b/pageserver/client_grpc/Cargo.toml
@@ -0,0 +1,32 @@
+[package]
+name = "pageserver_client_grpc"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+anyhow.workspace = true
+bytes.workspace = true
+futures.workspace = true
+http.workspace = true
+thiserror.workspace = true
+tonic.workspace = true
+tracing.workspace = true
+tokio = { version = "1.43.1", features = ["full", "macros", "net", "io-util", "rt", "rt-multi-thread"] }
+uuid = { version = "1", features = ["v4"] }
+tower = {  version = "0.4", features = ["timeout", "util"] }
+rand = "0.8"
+tokio-util = { version = "0.7", features = ["compat"] }
+hyper-util = "0.1.9"
+hyper = "1.6.0"
+metrics.workspace = true
+priority-queue = "2.3.1"
+scopeguard.workspace = true
+async-trait = { version = "0.1" }
+tokio-stream = "0.1"
+dashmap = "5"
+chrono = { version = "0.4", features = ["serde"] }
+
+
+pageserver_page_api.workspace = true
+pageserver_api.workspace = true
+utils.workspace = true
--- a/pageserver/client_grpc/examples/load_test.rs
+++ b/pageserver/client_grpc/examples/load_test.rs
@@ -0,0 +1,273 @@
+// examples/load_test.rs, generated by AI
+
+use std::collections::{HashMap, HashSet};
+use std::sync::{
+    Arc, Mutex,
+    atomic::{AtomicU64, AtomicUsize, Ordering},
+};
+use std::time::{Duration, Instant};
+
+use rand::Rng;
+use tokio::task;
+use tokio::time::sleep;
+use tonic::Status;
+
+// Pull in your ConnectionPool and PooledItemFactory from the pageserver_client_grpc crate.
+// Adjust these paths if necessary.
+use pageserver_client_grpc::client_cache::ConnectionPool;
+use pageserver_client_grpc::client_cache::PooledItemFactory;
+
+// --------------------------------------
+// GLOBAL COUNTERS FOR “CREATED” / “DROPPED” MockConnection VALUES (clones included)
+// --------------------------------------
+static CREATED: AtomicU64 = AtomicU64::new(0); // bumped by the factory's create() and by every clone()
+static DROPPED: AtomicU64 = AtomicU64::new(0); // bumped by every Drop
+
+// --------------------------------------
+// MockConnection + Factory
+// --------------------------------------
+
+#[derive(Debug)]
+pub struct MockConnection {
+    pub id: u64, // assigned by MockConnectionFactory::create; a clone keeps the same id
+}
+
+impl Clone for MockConnection {
+    fn clone(&self) -> Self {
+        // A clone is a separate MockConnection value that bumps DROPPED when it is dropped,
+        // so it must bump CREATED as well or the final created == dropped check cannot balance.
+        CREATED.fetch_add(1, Ordering::Relaxed);
+        MockConnection { id: self.id }
+    }
+}
+
+impl Drop for MockConnection {
+    fn drop(&mut self) {
+        // Runs once per value, for factory-made connections and clones alike; main compares the total with CREATED.
+        DROPPED.fetch_add(1, Ordering::SeqCst);
+    }
+}
+
+#[derive(Default)]
+pub struct MockConnectionFactory {
+    counter: AtomicU64, // id handed to the next created connection
+}
+
+#[async_trait::async_trait]
+impl PooledItemFactory<MockConnection> for MockConnectionFactory {
+    /// Signature required by the `PooledItemFactory` trait:
+    ///   async fn create(&self, timeout: Duration)
+    ///       -> Result<Result<MockConnection, Status>, tokio::time::error::Elapsed>;
+    ///
+    /// On success: Ok(Ok(MockConnection))
+    /// On a simulated “gRPC” failure: Ok(Err(Status::…))
+    /// On a timeout: Err(Elapsed). This mock ignores the timeout and always succeeds.
+    async fn create(
+        &self,
+        _timeout: Duration,
+    ) -> Result<Result<MockConnection, Status>, tokio::time::error::Elapsed> {
+        // Simulate connection creation immediately succeeding.
+        CREATED.fetch_add(1, Ordering::SeqCst);
+        let next_id = self.counter.fetch_add(1, Ordering::Relaxed);
+        Ok(Ok(MockConnection { id: next_id }))
+    }
+}
+
+// --------------------------------------
+// CLIENT WORKER
+// --------------------------------------
+//
+// Each worker repeatedly calls `pool.get_client().await`. When it succeeds, we:
+//  1. Lock the shared Mutex<HashMap<u64, Arc<AtomicUsize>>> to fetch/insert an Arc<AtomicUsize> for this conn_id.
+//  2. Lock the shared Mutex<HashSet<u64>> to record this conn_id as “seen.”
+//  3. Drop both locks, then atomically increment that counter and assert it ≤ max_consumers.
+//  4. Sleep 10–100 ms to simulate “work.”
+//  5. Atomically decrement the counter.
+//  6. Call `pooled.finish(Ok(()))` to return to the pool.
+
+async fn client_worker(
+    pool: Arc<ConnectionPool<MockConnection>>,
+    usage_map: Arc<Mutex<HashMap<u64, Arc<AtomicUsize>>>>,
+    seen_set: Arc<Mutex<HashSet<u64>>>,
+    max_consumers: usize,
+    worker_id: usize,
+) {
+    for iteration in 0..10 {
+        match pool.clone().get_client().await {
+            Ok(pooled) => {
+                let conn: MockConnection = pooled.channel();
+                let conn_id = conn.id;
+                // `conn` is owned by this task (presumably a clone — confirm); it is dropped at the end of this arm.
+                // 1. Fetch or insert the Arc<AtomicUsize> for this conn_id:
+                let counter_arc: Arc<AtomicUsize> = {
+                    let mut guard = usage_map.lock().unwrap();
+                    guard
+                        .entry(conn_id)
+                        .or_insert_with(|| Arc::new(AtomicUsize::new(0)))
+                        .clone()
+                    // MutexGuard is dropped here
+                };
+
+                // 2. Record this conn_id in the shared HashSet of “seen” IDs:
+                {
+                    let mut seen_guard = seen_set.lock().unwrap();
+                    seen_guard.insert(conn_id);
+                    // MutexGuard is dropped immediately
+                }
+
+                // 3. Atomically bump the count for this connection ID; a failed assert panics this task
+                let prev = counter_arc.fetch_add(1, Ordering::SeqCst);
+                let current = prev + 1;
+                assert!(
+                    current <= max_consumers,
+                    "Connection {conn_id} exceeded max_consumers (got {current})",
+                );
+
+                println!(
+                    "[worker {worker_id}][iter {iteration}] got MockConnection id={conn_id} ({current} concurrent)",
+                );
+
+                // 4. Simulate some work (10..100 is half-open, so 10–99 ms)
+                let delay_ms = rand::thread_rng().gen_range(10..100);
+                sleep(Duration::from_millis(delay_ms)).await;
+
+                // 5. Decrement the usage counter
+                let prev2 = counter_arc.fetch_sub(1, Ordering::SeqCst);
+                let after = prev2 - 1;
+                println!(
+                    "[worker {worker_id}][iter {iteration}] returning MockConnection id={conn_id} (now {after} remain)",
+                );
+
+                // 6. Return to the pool (mark success)
+                pooled.finish(Ok(())).await;
+            }
+            Err(status) => {
+                eprintln!(
+                    "[worker {worker_id}][iter {iteration}] failed to get client: {status:?}",
+                );
+            }
+        }
+
+        // Small random pause (0–19 ms) before the next iteration to spread out load
+        let pause = rand::thread_rng().gen_range(0..20);
+        sleep(Duration::from_millis(pause)).await;
+    }
+}
+
+#[tokio::main(flavor = "multi_thread", worker_threads = 8)]
+async fn main() {
+    // --------------------------------------
+    // 1. Create factory and shared instrumentation
+    // --------------------------------------
+    let factory = Arc::new(MockConnectionFactory::default());
+
+    // Shared map: connection ID → Arc<AtomicUsize>
+    let usage_map: Arc<Mutex<HashMap<u64, Arc<AtomicUsize>>>> =
+        Arc::new(Mutex::new(HashMap::new()));
+
+    // Shared set: record each unique connection ID we actually saw
+    let seen_set: Arc<Mutex<HashSet<u64>>> = Arc::new(Mutex::new(HashSet::new()));
+
+    // --------------------------------------
+    // 2. Pool parameters
+    // --------------------------------------
+    let connect_timeout = Duration::from_millis(500);
+    let connect_backoff = Duration::from_millis(100);
+    let max_consumers = 100; // test limit
+    let error_threshold = 2; // mock never fails
+    let max_idle_duration = Duration::from_secs(2);
+    let max_total_connections = 3;
+    let aggregate_metrics = None;
+
+    let pool: Arc<ConnectionPool<MockConnection>> = ConnectionPool::new(
+        factory,
+        connect_timeout,
+        connect_backoff,
+        max_consumers,
+        error_threshold,
+        max_idle_duration,
+        max_total_connections,
+        aggregate_metrics,
+    );
+
+    // --------------------------------------
+    // 3. Spawn worker tasks
+    // --------------------------------------
+    let num_workers = 10000;
+    let mut handles = Vec::with_capacity(num_workers);
+    let start_time = Instant::now();
+
+    for worker_id in 0..num_workers {
+        let pool_clone = Arc::clone(&pool);
+        let usage_clone = Arc::clone(&usage_map);
+        let seen_clone = Arc::clone(&seen_set);
+        let mc = max_consumers;
+
+        let handle = task::spawn(async move {
+            client_worker(pool_clone, usage_clone, seen_clone, mc, worker_id).await;
+        });
+        handles.push(handle);
+    }
+
+    // --------------------------------------
+    // 4. Wait for workers; a worker panic (e.g. its max_consumers assert) must fail the run
+    // --------------------------------------
+    for handle in handles {
+        handle.await.expect("worker task panicked");
+    }
+    let elapsed = start_time.elapsed();
+    println!("All {num_workers} workers completed in {elapsed:?}");
+
+    // --------------------------------------
+    // 5. Print the total number of unique connections seen so far
+    // --------------------------------------
+    let unique_count = {
+        let seen_guard = seen_set.lock().unwrap();
+        seen_guard.len()
+    };
+    println!("Total unique connections used by workers: {unique_count}");
+
+    // --------------------------------------
+    // 6. Sleep so the background sweeper can run (max_idle_duration = 2 s)
+    // --------------------------------------
+    sleep(Duration::from_secs(3)).await;
+
+    // --------------------------------------
+    // 7. Shutdown the pool
+    // --------------------------------------
+    let shutdown_pool = Arc::clone(&pool);
+    shutdown_pool.shutdown().await;
+    println!("Pool.shutdown() returned.");
+
+    // --------------------------------------
+    // 8. Verify that no background task still holds an Arc clone of `pool`.
+    //    If any task is still alive (sweeper/create_connection), strong_count > 1.
+    // --------------------------------------
+    sleep(Duration::from_secs(1)).await; // give tasks time to exit
+    let sc = Arc::strong_count(&pool);
+    assert!(
+        sc == 1,
+        "Pool tasks did not all terminate: Arc::strong_count = {sc} (expected 1)",
+    );
+    println!("Verified: all pool tasks have terminated (strong_count == 1).");
+
+    // --------------------------------------
+    // 9. Verify no MockConnection was leaked:
+    //    CREATED must equal DROPPED.
+    // --------------------------------------
+    let created = CREATED.load(Ordering::SeqCst);
+    let dropped = DROPPED.load(Ordering::SeqCst);
+    assert!(
+        created == dropped,
+        "Leaked connections: created={created} but dropped={dropped}",
+    );
+    println!("Verified: no connections leaked (created = {created}, dropped = {dropped}).");
+
+    // --------------------------------------
+    // 10. `client_worker` asserts that no connection ever exceeded `max_consumers`, and
+    //     step 4 propagates worker panics, so reaching this point means that check passed.
+    // --------------------------------------
+    println!("All per-connection usage stayed within max_consumers = {max_consumers}.");
+
+    println!("Load test complete; exiting cleanly.");
+}
--- a/pageserver/client_grpc/src/client_cache.rs
+++ b/pageserver/client_grpc/src/client_cache.rs
@@ -0,0 +1,705 @@
+use std::{
+    collections::HashMap,
+    io::{self, Error, ErrorKind},
+    sync::Arc,
+    time::{Duration, Instant},
+};
+
+use priority_queue::PriorityQueue;
+
+use tokio::{
+    io::{AsyncRead, AsyncWrite, ReadBuf},
+    net::TcpStream,
+    sync::{Mutex, OwnedSemaphorePermit, Semaphore},
+    time::sleep,
+};
+use tonic::transport::{Channel, Endpoint};
+
+use uuid;
+
+use std::{
+    pin::Pin,
+    task::{Context, Poll},
+};
+
+use futures::future;
+use rand::{Rng, SeedableRng, rngs::StdRng};
+
+use bytes::BytesMut;
+use http::Uri;
+use hyper_util::rt::TokioIo;
+use tower::service_fn;
+
+use async_trait::async_trait;
+use tokio_util::sync::CancellationToken;
+
+//
+// `TokioTcp` is a deliberately flaky TCP stream wrapper for testing purposes,
+// used to simulate network errors and delays.
+//
+
+/// Wraps a `TcpStream`, buffers incoming data, and injects a random delay per fresh read/write.
+pub struct TokioTcp {
+    tcp: TcpStream,
+    /// Maximum randomized delay in milliseconds; 0 disables the delay checks
+    delay_ms: u64,
+
+    /// Reads and writes return `Pending` until this instant has passed (when `delay_ms > 0`)
+    deadline: Instant,
+    /// Bytes already read from `tcp` that did not fit into the caller's buffer
+    buffer: BytesMut,
+}
+
+impl TokioTcp {
+    /// Wrap `stream`; `delay_ms` is the maximum injected delay in milliseconds (0 = no delay).
+    pub fn new(stream: TcpStream, delay_ms: u64) -> Self {
+        // `gen_range` rejects an empty range, so a zero maximum is special-cased.
+        let first_delay_ms = match delay_ms {
+            0 => 0,
+            max_ms => rand::thread_rng().gen_range(0..max_ms),
+        };
+        let deadline = Instant::now() + Duration::from_millis(first_delay_ms);
+        Self {
+            tcp: stream,
+            delay_ms,
+            deadline,
+            buffer: BytesMut::new(),
+        }
+    }
+}
+
+impl AsyncRead for TokioTcp {
+    /// Serve buffered bytes immediately; otherwise wait for the current deadline, then read from `tcp`.
+    fn poll_read(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &mut ReadBuf<'_>,
+    ) -> Poll<io::Result<()>> {
+        // Safe because TokioTcp is Unpin
+        let this = self.get_mut();
+        // 1) Drain any buffered data (this path is not delayed)
+        if !this.buffer.is_empty() {
+            let to_copy = this.buffer.len().min(buf.remaining());
+            buf.put_slice(&this.buffer.split_to(to_copy));
+            return Poll::Ready(Ok(()));
+        }
+        // NOTE(review): every poll that lands before the deadline spawns another timer task.
+        // 2) If we're still before the deadline, schedule a wake and return Pending
+        let now = Instant::now();
+        if this.delay_ms > 0 && now < this.deadline {
+            let waker = cx.waker().clone();
+            let wait = this.deadline - now;
+            tokio::spawn(async move {
+                sleep(wait).await;
+                waker.wake_by_ref();
+            });
+            return Poll::Pending;
+        }
+
+        // 3) Past deadline: compute next random deadline (the same field gates poll_write)
+        if this.delay_ms > 0 {
+            let next_ms = rand::thread_rng().gen_range(0..=this.delay_ms);
+            this.deadline = Instant::now() + Duration::from_millis(next_ms);
+        }
+
+        // 4) Perform actual read into a temporary buffer; bytes that do not fit in `buf` stay in `this.buffer`
+        let mut tmp = [0u8; 4096];
+        let mut rb = ReadBuf::new(&mut tmp);
+        match Pin::new(&mut this.tcp).poll_read(cx, &mut rb) {
+            Poll::Pending => Poll::Pending,
+            Poll::Ready(Ok(())) => {
+                let filled = rb.filled();
+                if filled.is_empty() {
+                    // EOF or zero bytes
+                    Poll::Ready(Ok(()))
+                } else {
+                    this.buffer.extend_from_slice(filled);
+                    let to_copy = this.buffer.len().min(buf.remaining());
+                    buf.put_slice(&this.buffer.split_to(to_copy));
+                    Poll::Ready(Ok(()))
+                }
+            }
+            Poll::Ready(Err(e)) => Poll::Ready(Err(e)),
+        }
+    }
+}
+
+impl AsyncWrite for TokioTcp {
+    /// Wait for the current deadline, pick the next one, then forward the write to `tcp`.
+    fn poll_write(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        data: &[u8],
+    ) -> Poll<io::Result<usize>> {
+        let this = self.get_mut();
+        // 1) If before deadline, schedule wake and return Pending (spawns one timer task per such poll)
+        let now = Instant::now();
+        if this.delay_ms > 0 && now < this.deadline {
+            let waker = cx.waker().clone();
+            let wait = this.deadline - now;
+            tokio::spawn(async move {
+                sleep(wait).await;
+                waker.wake_by_ref();
+            });
+            return Poll::Pending;
+        }
+
+        // 2) Past deadline: compute next random deadline (the same field gates poll_read)
+        if this.delay_ms > 0 {
+            let next_ms = rand::thread_rng().gen_range(0..=this.delay_ms);
+            this.deadline = Instant::now() + Duration::from_millis(next_ms);
+        }
+
+        // 3) Actual write
+        Pin::new(&mut this.tcp).poll_write(cx, data)
+    }
+
+    fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
+        let this = self.get_mut();
+        Pin::new(&mut this.tcp).poll_flush(cx) // forwarded without any delay
+    }
+
+    fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
+        let this = self.get_mut();
+        Pin::new(&mut this.tcp).poll_shutdown(cx) // forwarded without any delay
+    }
+}
+
+#[async_trait]
+pub trait PooledItemFactory<T>: Send + Sync + 'static {
+    /// Create a new pooled item: the outer `Err` is a timeout, the inner `Err` a failure reported as a gRPC status.
+    async fn create(
+        &self,
+        connect_timeout: Duration,
+    ) -> Result<Result<T, tonic::Status>, tokio::time::error::Elapsed>;
+}
+
+pub struct ChannelFactory {
+    endpoint: String, // handed to Endpoint::from_shared on every create()
+    max_delay_ms: u64, // maximum per-read/write delay given to each TokioTcp
+    drop_rate: f64, // probability that a connect attempt fails with a simulated error
+    hang_rate: f64, // probability that a connect attempt never completes
+}
+
+impl ChannelFactory {
+    pub fn new(endpoint: String, max_delay_ms: u64, drop_rate: f64, hang_rate: f64) -> Self {
+        ChannelFactory {
+            endpoint,
+            max_delay_ms,
+            drop_rate, // passed to gen_bool when > 0.0; rand documents a panic outside [0, 1]
+            hang_rate, // passed to gen_bool when > 0.0; rand documents a panic outside [0, 1]
+        }
+    }
+}
+
+#[async_trait]
+impl PooledItemFactory<Channel> for ChannelFactory {
+    /// Connect through the fault-injecting connector; outer `Err` = timed out, inner `Err` = connect failed.
+    async fn create(
+        &self,
+        connect_timeout: Duration,
+    ) -> Result<Result<Channel, tonic::Status>, tokio::time::error::Elapsed> {
+        let max_delay_ms = self.max_delay_ms;
+        let drop_rate = self.drop_rate;
+        let hang_rate = self.hang_rate;
+
+        // This is a custom connector that inserts delays and errors, for
+        // testing purposes. It would normally be disabled by the config.
+        let connector = service_fn(move |uri: Uri| {
+            async move {
+                let mut rng = StdRng::from_entropy();
+                // Simulate an indefinite hang
+                if hang_rate > 0.0 && rng.gen_bool(hang_rate) {
+                    // never completes, to test timeout
+                    return future::pending::<Result<TokioIo<TokioTcp>, std::io::Error>>().await;
+                }
+
+                // Random drop (connect error)
+                if drop_rate > 0.0 && rng.gen_bool(drop_rate) {
+                    return Err(std::io::Error::other("simulated connect drop"));
+                }
+
+                // Otherwise perform real TCP connect
+                let addr = match (uri.host(), uri.port_u16()) {
+                    // host + explicit port
+                    (Some(host), Some(port)) => format!("{host}:{port}"),
+                    // host only: a bare host is not a socket address, so use the scheme's default port
+                    (Some(host), None) if uri.scheme_str() == Some("https") => format!("{host}:443"),
+                    (Some(host), None) => format!("{host}:80"),
+                    // neither? error out
+                    _ => return Err(Error::new(ErrorKind::InvalidInput, "no host or port")),
+                };
+
+                let tcp = TcpStream::connect(addr).await?;
+                let tcpwrapper = TokioTcp::new(tcp, max_delay_ms);
+                Ok(TokioIo::new(tcpwrapper))
+            }
+        });
+
+        let attempt = tokio::time::timeout(
+            connect_timeout,
+            Endpoint::from_shared(self.endpoint.clone())
+                .expect("invalid endpoint")
+                .timeout(connect_timeout) // NOTE(review): Endpoint::timeout is a per-request timeout — confirm intent
+                .connect_with_connector(connector),
+        )
+        .await;
+        match attempt {
+            Ok(Ok(channel)) => {
+                // Connection succeeded
+                Ok(Ok(channel))
+            }
+            Ok(Err(e)) => Ok(Err(tonic::Status::new(
+                tonic::Code::Unavailable,
+                format!("Failed to connect: {e}"),
+            ))),
+            Err(e) => Err(e),
+        }
+    }
+}
+
+/// A pooled gRPC client with capacity tracking and error handling.
+pub struct ConnectionPool<T> {
+    inner: Mutex<Inner<T>>,
+    /// Factory that creates new pooled items.
+    fact: Arc<dyn PooledItemFactory<T> + Send + Sync>,
+    // NOTE(review): presumably the timeout passed to `fact.create` and the retry backoff — confirm in get_client.
+    connect_timeout: Duration,
+    connect_backoff: Duration,
+    /// The maximum number of consumers that can use a single connection.
+    max_consumers: usize,
+    /// The number of consecutive errors before a connection is removed from the pool.
+    error_threshold: usize,
+    /// The maximum duration a connection can be idle before being removed.
+    max_idle_duration: Duration,
+    max_total_connections: usize,
+    /// Created with zero permits in `new`; `get_conn_with_permit` takes a permit from it.
+    channel_semaphore: Arc<Semaphore>,
+    /// Cancelled by `shutdown`; the idle-connection sweeper exits when it fires.
+    shutdown_token: CancellationToken,
+    aggregate_metrics: Option<Arc<crate::PageserverClientAggregateMetrics>>,
+}
+
+struct Inner<T> {
+    entries: HashMap<uuid::Uuid, ConnectionEntry<T>>, // every pooled connection, keyed by id
+    pq: PriorityQueue<uuid::Uuid, usize>, // ids of connections with spare capacity; priority = active consumers
+    // This is updated when a connection is dropped, or we fail
+    // to create a new connection.
+    last_connect_failure: Option<Instant>,
+    waiters: usize, // presumably callers waiting for a connection — confirm
+    in_progress: usize, // presumably connection attempts in flight — confirm
+}
+struct ConnectionEntry<T> {
+    channel: T, // cloned into every PooledClient handed out for this connection
+    active_consumers: usize, // incremented on checkout in get_conn_with_permit
+    consecutive_errors: usize, // presumably compared against error_threshold — confirm
+    last_used: Instant, // refreshed on checkout; read by the idle sweeper
+}
+
+/// A client borrowed from the pool.
+pub struct PooledClient<T> {
+    pub channel: T, // clone of the pooled connection's channel
+    pool: Arc<ConnectionPool<T>>, // the pool this client was taken from
+    is_ok: bool, // set to true when the client is handed out
+    id: uuid::Uuid, // key of the connection in the pool's entry map
+    permit: OwnedSemaphorePermit, // semaphore permit that was acquired for this checkout
+}
+
+impl<T: Clone + Send + 'static> ConnectionPool<T> {
+    /// Build a pool with no connections and start the background idle-connection sweeper.
+    #[allow(clippy::too_many_arguments)]
+    pub fn new(
+        fact: Arc<dyn PooledItemFactory<T> + Send + Sync>,
+        connect_timeout: Duration,
+        connect_backoff: Duration,
+        max_consumers: usize,
+        error_threshold: usize,
+        max_idle_duration: Duration,
+        max_total_connections: usize,
+        aggregate_metrics: Option<Arc<crate::PageserverClientAggregateMetrics>>,
+    ) -> Arc<Self> {
+        let empty_state = Inner::<T> {
+            entries: HashMap::new(),
+            pq: PriorityQueue::new(),
+            last_connect_failure: None,
+            waiters: 0,
+            in_progress: 0,
+        };
+        let pool = Arc::new(Self {
+            inner: Mutex::new(empty_state),
+            fact,
+            connect_timeout,
+            connect_backoff,
+            max_consumers,
+            error_threshold,
+            max_idle_duration,
+            max_total_connections,
+            channel_semaphore: Arc::new(Semaphore::new(0)),
+            shutdown_token: CancellationToken::new(),
+            aggregate_metrics,
+        });
+
+        // The sweeper owns a clone of the pool and runs until `shutdown` cancels the token.
+        let sweeper_pool = Arc::clone(&pool);
+        tokio::spawn(async move {
+            loop {
+                tokio::select! {
+                    _ = sweeper_pool.shutdown_token.cancelled() => break,
+                    _ = async {
+                        sweeper_pool.sweep_idle_connections().await;
+                        sleep(Duration::from_secs(5)).await;
+                    } => {}
+                }
+            }
+        });
+
+        pool
+    }
+
+    /// Cancel the background sweeper, wait until no connection has active consumers,
+    /// then drop every pooled connection.
+    pub async fn shutdown(self: Arc<Self>) {
+        self.shutdown_token.cancel();
+        loop {
+            {
+                let mut inner = self.inner.lock().await;
+                if inner.entries.values().all(|e| e.active_consumers == 0) {
+                    // Clear under the same lock as the idle check, and clear `pq` too: an id
+                    // left there without an entry would trip the expect in get_conn_with_permit.
+                    inner.entries.clear();
+                    inner.pq.clear();
+                    return;
+                }
+            }
+            sleep(Duration::from_millis(100)).await;
+        }
+    }
+
+    /// Remove every connection that has no active consumers and has been unused for longer
+    /// than `max_idle_duration`, from both the entry map and the priority queue.
+    ///
+    /// Semaphore permits are not touched here; a permit left without a connection is
+    /// forgotten later by `get_conn_with_permit` when it finds the queue empty.
+    async fn sweep_idle_connections(self: &Arc<Self>) {
+        let now = Instant::now();
+        let mut inner = self.inner.lock().await;
+
+        // Pick the victims first, so the map is not mutated while it is being scanned.
+        let expired: Vec<uuid::Uuid> = inner
+            .entries
+            .iter()
+            .filter(|(_, entry)| {
+                entry.active_consumers == 0
+                    && now.duration_since(entry.last_used) > self.max_idle_duration
+            })
+            .map(|(id, _)| *id)
+            .collect();
+
+        for id in expired {
+            if let Some(ref metrics) = self.aggregate_metrics {
+                metrics
+                    .retry_counters
+                    .with_label_values(&["connection_swept"])
+                    .inc();
+            }
+            inner.entries.remove(&id);
+            inner.pq.remove(&id);
+        }
+    }
+
+    /// Hand out a connection to a caller that already holds a semaphore permit.
+    ///
+    /// Pops the head of the priority queue, bumps that entry's consumer count
+    /// and `last_used`, and re-queues it if it still has spare capacity.
+    /// Returns `None` (after forgetting `permit`) when the queue is empty.
+    async fn get_conn_with_permit(
+        self: Arc<Self>,
+        permit: OwnedSemaphorePermit,
+    ) -> Option<PooledClient<T>> {
+        let mut inner = self.inner.lock().await;
+
+        // The queue only ever holds connections with fewer than
+        // `max_consumers` active consumers, so whatever pops is usable.
+        let Some((id, _priority)) = inner.pq.pop() else {
+            // Nothing to take: this permit belongs to a connection that has
+            // been removed from the heap/hash table (e.g. after too many
+            // errors) and whose permits are still draining. Discard the
+            // permit; the caller retries.
+            permit.forget();
+            return None;
+        };
+
+        let entry = inner
+            .entries
+            .get_mut(&id)
+            .expect("pq and entries got out of sync");
+        entry.active_consumers += 1;
+        entry.last_used = Instant::now();
+        let consumers_now = entry.active_consumers;
+        let channel = entry.channel.clone();
+
+        let client = PooledClient::<T> {
+            channel,
+            pool: Arc::clone(&self),
+            is_ok: true,
+            id,
+            permit,
+        };
+
+        // Put the connection back with its new priority unless it is now full.
+        if consumers_now < self.max_consumers {
+            inner.pq.push(id, consumers_now as usize);
+        }
+        Some(client)
+    }
+
+    /// Acquire a pooled client, waiting for capacity if necessary.
+    ///
+    /// Fast path: take a semaphore permit without waiting and pop a connection
+    /// from the queue. Slow path: register as a waiter, possibly spawn a
+    /// background `create_connection` task, then wait for a permit.
+    ///
+    /// # Errors
+    /// Returns `tonic::Status::unavailable` when the pool is shutting down
+    /// (checked on entry, on every retry, and while waiting for a permit) or
+    /// when the semaphore has been closed.
+    pub async fn get_client(self: Arc<Self>) -> Result<PooledClient<T>, tonic::Status> {
+        // A loop is necessary because a permit may belong to a connection that
+        // is draining; `get_conn_with_permit` then forgets it and we retry.
+        loop {
+            // The pool is shutting down. Don't accept new connections. This is
+            // re-checked per iteration so retrying callers stop as well.
+            if self.shutdown_token.is_cancelled() {
+                return Err(tonic::Status::unavailable("Pool is shutting down"));
+            }
+
+            let permit = match Arc::clone(&self.channel_semaphore).try_acquire_owned() {
+                Ok(permit) => permit,
+                Err(_) => {
+                    // NOTE(review): this counter is bumped when the non-blocking
+                    // acquire FAILED, yet the label says "success" — confirm the
+                    // intended label before renaming it.
+                    if let Some(ref metrics) = self.aggregate_metrics {
+                        metrics
+                            .retry_counters
+                            .with_label_values(&["sema_acquire_success"])
+                            .inc();
+                    }
+
+                    {
+                        //
+                        // This is going to generate enough connections to handle a burst,
+                        // but it may generate up to twice the number of connections needed
+                        // in the worst case. Extra connections will go idle and be cleaned
+                        // up.
+                        //
+                        let mut inner = self.inner.lock().await;
+                        inner.waiters += 1;
+                        if inner.waiters > (inner.in_progress * self.max_consumers)
+                            && (inner.entries.len() + inner.in_progress)
+                                < self.max_total_connections
+                        {
+                            let pool_for_task = Arc::clone(&self);
+                            tokio::task::spawn(async move {
+                                pool_for_task.create_connection().await;
+                            });
+                            inner.in_progress += 1;
+                        }
+                    }
+
+                    // Wait for a permit to become available, either because a
+                    // connection was created or because another consumer
+                    // returned one. Also wake up on shutdown so that waiters
+                    // do not hang forever once the pool is being torn down.
+                    let acquired = tokio::select! {
+                        _ = self.shutdown_token.cancelled() => None,
+                        res = Arc::clone(&self.channel_semaphore).acquire_owned() => Some(res),
+                    };
+
+                    // Undo the waiter registration on every outcome.
+                    // NOTE(review): if this future is dropped while waiting
+                    // above, `waiters` is still not decremented.
+                    {
+                        let mut inner = self.inner.lock().await;
+                        inner.waiters -= 1;
+                    }
+
+                    match acquired {
+                        Some(Ok(permit)) => permit,
+                        Some(Err(_)) => {
+                            // Closed semaphore: report it instead of panicking.
+                            return Err(tonic::Status::unavailable(
+                                "Connection pool semaphore is closed",
+                            ));
+                        }
+                        None => {
+                            return Err(tonic::Status::unavailable("Pool is shutting down"));
+                        }
+                    }
+                }
+            };
+
+            // We got a permit, so check the heap for a connection we can use.
+            if let Some(client) = Arc::clone(&self).get_conn_with_permit(permit).await {
+                return Ok(client);
+            }
+            // No connection available; the permit was forgotten. Retry.
+        }
+    }
+
+    /// Establish one new connection and add it to the pool, retrying with a
+    /// jittered, growing backoff until it succeeds or the pool shuts down.
+    ///
+    /// The caller has already incremented `in_progress` for this attempt;
+    /// this function decrements it again when it finishes, both on success and
+    /// when it gives up because of shutdown. On success the new entry is
+    /// queued with zero consumers and `max_consumers` permits are added to the
+    /// semaphore.
+    async fn create_connection(&self) {
+        // Generate a random backoff to add some jitter so that connections
+        // don't all retry at the same time.
+        let mut backoff_delay = Duration::from_millis(
+            rand::thread_rng().gen_range(0..=self.connect_backoff.as_millis() as u64),
+        );
+
+        loop {
+            if self.shutdown_token.is_cancelled() {
+                // Giving up: keep the `in_progress` bookkeeping balanced.
+                let mut inner = self.inner.lock().await;
+                inner.in_progress -= 1;
+                return;
+            }
+
+            // Back off.
+            // Loop because failure can occur while we are sleeping, so wait
+            // until the failure stopped for at least one backoff period.
+            while let Some(delay) = {
+                let inner = self.inner.lock().await;
+                // `checked_sub` with a single `elapsed()` reading: subtracting
+                // a second, later reading could exceed `backoff_delay` and
+                // panic on Duration underflow.
+                inner
+                    .last_connect_failure
+                    .and_then(|at| backoff_delay.checked_sub(at.elapsed()))
+                    .filter(|remaining| !remaining.is_zero())
+            } {
+                sleep(delay).await;
+            }
+
+            //
+            // Create a new connection.
+            //
+            // The connect timeout is also the timeout for an individual gRPC request
+            // on this connection. (Requests made later on this channel will time out
+            // with the same timeout.)
+            //
+            if let Some(ref metrics) = self.aggregate_metrics {
+                metrics
+                    .retry_counters
+                    .with_label_values(&["connection_attempt"])
+                    .inc();
+            }
+
+            match self.fact.create(self.connect_timeout).await {
+                // Connection succeeded
+                Ok(Ok(channel)) => {
+                    if let Some(ref metrics) = self.aggregate_metrics {
+                        metrics
+                            .retry_counters
+                            .with_label_values(&["connection_success"])
+                            .inc();
+                    }
+                    let mut inner = self.inner.lock().await;
+                    let id = uuid::Uuid::new_v4();
+                    inner.entries.insert(
+                        id,
+                        ConnectionEntry::<T> {
+                            channel,
+                            active_consumers: 0,
+                            consecutive_errors: 0,
+                            last_used: Instant::now(),
+                        },
+                    );
+                    inner.pq.push(id, 0);
+                    inner.in_progress -= 1;
+                    // Make the new connection's capacity available to waiters.
+                    self.channel_semaphore.add_permits(self.max_consumers);
+                    return;
+                }
+                // Connection failed, back off and retry
+                Ok(Err(_)) | Err(_) => {
+                    if let Some(ref metrics) = self.aggregate_metrics {
+                        metrics
+                            .retry_counters
+                            .with_label_values(&["connect_failed"])
+                            .inc();
+                    }
+                    {
+                        let mut inner = self.inner.lock().await;
+                        inner.last_connect_failure = Some(Instant::now());
+                    }
+
+                    // Grow the delay by a random amount in [0, current delay]
+                    // so that every connection doesn't retry at once. The
+                    // 1 ms floor matters: a delay of 0 would otherwise stay 0
+                    // forever and turn this loop into a busy retry.
+                    let base_ms = (backoff_delay.as_millis() as u64).max(1);
+                    let jitter = rand::thread_rng().gen_range(0..=base_ms);
+
+                    // Do not backoff longer than one minute
+                    backoff_delay =
+                        Duration::from_millis(base_ms + jitter).min(Duration::from_secs(60));
+                    // continue the loop to retry
+                }
+            }
+        }
+    }
+
+    /// Hand a client back to the pool and record whether its request succeeded.
+    ///
+    /// Updates the entry's consumer count, `last_used`, and consecutive-error
+    /// counter. Once the error counter reaches `error_threshold` the entry is
+    /// removed from the pool and `permit` is forgotten; otherwise the entry's
+    /// queue priority is refreshed. If `id` is no longer in the pool nothing
+    /// is updated and `permit` is simply dropped.
+    ///
+    /// # Panics
+    /// Panics if the entry exists but has no active consumers.
+    pub async fn return_client(&self, id: uuid::Uuid, success: bool, permit: OwnedSemaphorePermit) {
+        let mut inner = self.inner.lock().await;
+        let Some(entry) = inner.entries.get_mut(&id) else {
+            return;
+        };
+
+        entry.last_used = Instant::now();
+        if entry.active_consumers == 0 {
+            panic!("A consumer completed when active_consumers was zero!")
+        }
+        entry.active_consumers -= 1;
+
+        if !success {
+            entry.consecutive_errors += 1;
+            // Count the drop exactly once, at the moment the threshold is hit.
+            if entry.consecutive_errors == self.error_threshold {
+                if let Some(ref metrics) = self.aggregate_metrics {
+                    metrics
+                        .retry_counters
+                        .with_label_values(&["connection_dropped"])
+                        .inc();
+                }
+            }
+        } else if entry.consecutive_errors < self.error_threshold {
+            entry.consecutive_errors = 0;
+        }
+
+        let remaining_consumers = entry.active_consumers;
+        let over_threshold = entry.consecutive_errors >= self.error_threshold;
+
+        if over_threshold {
+            // Too many errors: take the connection out of the queue and the
+            // entry map right away, so it cannot be selected for new
+            // consumers. Removing an id that is already absent is a no-op.
+            inner.pq.remove(&id);
+            inner.entries.remove(&id);
+            inner.last_connect_failure = Some(Instant::now());
+
+            // The removed connection's permits would drain anyway (a lookup
+            // that finds no connection forgets its permit), but forgetting
+            // them as clients are returned gets there a little faster.
+            permit.forget();
+            return;
+        }
+
+        if inner.pq.get_priority(&id).is_some() {
+            // Still queued: refresh its priority.
+            inner.pq.change_priority(&id, remaining_consumers);
+        } else if remaining_consumers < self.max_consumers {
+            // It was full and therefore not in the heap; it has room again.
+            inner.pq.push(id, remaining_consumers);
+        }
+    }
+}
+
+impl<T: Clone + Send + 'static> PooledClient<T> {
+    /// Return a clone of the underlying channel.
+    pub fn channel(&self) -> T {
+        T::clone(&self.channel)
+    }
+
+    /// Consume the client and give it back to its pool, reporting `result`
+    /// as the outcome of the work done on it (only success/failure is used).
+    pub async fn finish(mut self, result: Result<(), tonic::Status>) {
+        let succeeded = result.is_ok();
+        self.is_ok = succeeded;
+        self.pool
+            .return_client(self.id, succeeded, self.permit)
+            .await;
+    }
+}
--- a/pageserver/client_grpc/src/lib.rs
+++ b/pageserver/client_grpc/src/lib.rs
@@ -0,0 +1,450 @@
+//! Pageserver Data API client
+//!
+//! - Manage connections to pageserver
+//! - Send requests to correct shards
+//!
+use std::collections::HashMap;
+use std::fmt::Debug;
+use std::sync::Arc;
+use std::sync::RwLock;
+use std::time::Duration;
+
+use bytes::Bytes;
+use futures::{Stream, StreamExt};
+use thiserror::Error;
+use tonic::metadata::AsciiMetadataValue;
+use tonic::transport::Channel;
+
+use pageserver_page_api::proto;
+use pageserver_page_api::proto::PageServiceClient;
+use pageserver_page_api::*;
+use utils::shard::ShardIndex;
+
+pub mod client_cache;
+pub mod pool;
+pub mod request_tracker;
+
+use metrics::{IntCounterVec, core::Collector};
+
+/// Errors returned by [`PageserverClient`] operations.
+#[derive(Error, Debug)]
+pub enum PageserverClientError {
+    /// Wraps a `tonic::transport::Error`.
+    #[error("could not connect to service: {0}")]
+    ConnectError(#[from] tonic::transport::Error),
+    /// Wraps the `tonic::Status` a request failed with.
+    #[error("could not perform request: {0}")]
+    RequestError(#[from] tonic::Status),
+    /// Wraps a `ProtocolError`.
+    #[error("protocol error: {0}")]
+    ProtocolError(#[from] ProtocolError),
+
+    /// Wraps an `http::uri::InvalidUri`.
+    #[error("invalid URI: {0}")]
+    InvalidUri(#[from] http::uri::InvalidUri),
+
+    /// Any other failure, described by a free-form message.
+    #[error("{0}")]
+    Other(String),
+}
+
+#[derive(Clone, Debug)]
+pub struct PageserverClientAggregateMetrics { // counters shared by a client and its pools
+    pub request_counters: IntCounterVec, // "backend_requests_total", label "request_kind"
+    pub retry_counters: IntCounterVec, // "backend_requests_retries_total"; also pool events
+}
+
+impl Default for PageserverClientAggregateMetrics {
+    /// Same as [`PageserverClientAggregateMetrics::new`].
+    fn default() -> Self {
+        PageserverClientAggregateMetrics::new()
+    }
+}
+
+impl PageserverClientAggregateMetrics {
+    /// Build both counter vectors, each with the single label `request_kind`.
+    ///
+    /// # Panics
+    /// Panics if either `IntCounterVec::new` call returns an error.
+    pub fn new() -> Self {
+        let request_opts = metrics::core::Opts::new(
+            "backend_requests_total",
+            "Number of requests from backends.",
+        );
+        let request_counters = IntCounterVec::new(request_opts, &["request_kind"]).unwrap();
+
+        let retry_opts = metrics::core::Opts::new(
+            "backend_requests_retries_total",
+            "Number of retried requests from backends.",
+        );
+        let retry_counters = IntCounterVec::new(retry_opts, &["request_kind"]).unwrap();
+
+        Self {
+            request_counters,
+            retry_counters,
+        }
+    }
+
+    /// Gather the metric families of both vectors: request counters first,
+    /// retry counters second.
+    pub fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
+        let mut families = self.request_counters.collect();
+        families.extend(self.retry_counters.collect());
+        families
+    }
+}
+
+pub struct PageserverClient { // gRPC client that lazily creates one connection pool per shard
+    _tenant_id: String, // stored by new_with_config; not read by the methods below
+    _timeline_id: String, // stored by new_with_config; not read by the methods below
+
+    _auth_token: Option<String>, // stored by new_with_config; not read by the methods below
+
+    shard_map: HashMap<ShardIndex, String>, // shard -> string handed to ChannelFactory::new
+
+    channels: RwLock<HashMap<ShardIndex, Arc<client_cache::ConnectionPool<Channel>>>>, // per shard
+
+    auth_interceptor: AuthInterceptor, // template; `for_shard` derives a per-request copy
+
+    client_cache_options: ClientCacheOptions, // settings for every pool built in get_client
+
+    aggregate_metrics: Option<Arc<PageserverClientAggregateMetrics>>, // optional counters
+}
+#[derive(Clone)]
+pub struct ClientCacheOptions { // knobs forwarded to ConnectionPool::new / ChannelFactory::new
+    pub max_consumers: usize, // presumably max concurrent users per connection — confirm
+    pub error_threshold: usize, // presumably consecutive errors before a drop — confirm
+    pub connect_timeout: Duration, // passed to ConnectionPool::new
+    pub connect_backoff: Duration, // passed to ConnectionPool::new
+    pub max_idle_duration: Duration, // presumably idle time before a sweep — confirm
+    pub max_total_connections: usize, // presumably cap on connections per pool — confirm
+    pub max_delay_ms: u64, // passed to ChannelFactory::new; looks like fault injection — confirm
+    pub drop_rate: f64, // passed to ChannelFactory::new; looks like fault injection — confirm
+    pub hang_rate: f64, // passed to ChannelFactory::new; looks like fault injection — confirm
+}
+
+impl PageserverClient {
+    /// Create a client with the built-in default pool options and no metrics.
+    ///
+    /// TODO: this doesn't currently react to changes in the shard map.
+    pub fn new(
+        tenant_id: &str,
+        timeline_id: &str,
+        auth_token: &Option<String>,
+        shard_map: HashMap<ShardIndex, String>,
+    ) -> Self {
+        Self::new_with_config(
+            tenant_id,
+            timeline_id,
+            auth_token,
+            shard_map,
+            ClientCacheOptions {
+                max_consumers: 5000,
+                error_threshold: 5,
+                connect_timeout: Duration::from_secs(5),
+                connect_backoff: Duration::from_secs(1),
+                max_idle_duration: Duration::from_secs(60),
+                max_total_connections: 100000,
+                max_delay_ms: 0,
+                drop_rate: 0.0,
+                hang_rate: 0.0,
+            },
+            None,
+        )
+    }
+    /// Create a client with explicit pool options and optional metrics.
+    ///
+    /// No connection is opened here; the pool map starts out empty.
+    pub fn new_with_config(
+        tenant_id: &str,
+        timeline_id: &str,
+        auth_token: &Option<String>,
+        shard_map: HashMap<ShardIndex, String>,
+        options: ClientCacheOptions,
+        metrics: Option<Arc<PageserverClientAggregateMetrics>>,
+    ) -> Self {
+        let owned_tenant = tenant_id.to_string();
+        let owned_timeline = timeline_id.to_string();
+        let owned_token = auth_token.clone();
+        let no_pools_yet = RwLock::new(HashMap::new());
+        let interceptor = AuthInterceptor::new(tenant_id, timeline_id, auth_token.as_deref());
+
+        Self {
+            _tenant_id: owned_tenant,
+            _timeline_id: owned_timeline,
+            _auth_token: owned_token,
+            shard_map,
+            channels: no_pools_yet,
+            auth_interceptor: interceptor,
+            client_cache_options: options,
+            aggregate_metrics: metrics,
+        }
+    }
+    /// Send a `CheckRelExists` request and return the `exists` flag of the reply.
+    ///
+    /// The pooled client is returned to its pool with the request's outcome
+    /// before this function returns.
+    pub async fn process_check_rel_exists_request(
+        &self,
+        request: CheckRelExistsRequest,
+    ) -> Result<bool, PageserverClientError> {
+        // Current sharding model assumes that all metadata is present only at shard 0.
+        let shard = ShardIndex::unsharded();
+        let pooled_client = self.get_client(shard).await;
+        let chan = pooled_client.channel();
+
+        let mut client =
+            PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
+
+        let proto_request = proto::CheckRelExistsRequest::from(request);
+        let outcome = client
+            .check_rel_exists(tonic::Request::new(proto_request))
+            .await;
+
+        let resp = match outcome {
+            Ok(resp) => resp,
+            Err(status) => {
+                // Report the failure to the pool, then surface it to the caller.
+                pooled_client.finish(Err(status.clone())).await;
+                return Err(PageserverClientError::RequestError(status));
+            }
+        };
+
+        pooled_client.finish(Ok(())).await;
+        Ok(resp.get_ref().exists)
+    }
+
+    /// Send a `GetRelSize` request and return `num_blocks` from the reply.
+    ///
+    /// The pooled client is returned to its pool with the request's outcome
+    /// before this function returns.
+    pub async fn process_get_rel_size_request(
+        &self,
+        request: GetRelSizeRequest,
+    ) -> Result<u32, PageserverClientError> {
+        // Current sharding model assumes that all metadata is present only at shard 0.
+        let shard = ShardIndex::unsharded();
+        let pooled_client = self.get_client(shard).await;
+        let chan = pooled_client.channel();
+
+        let mut client =
+            PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
+
+        let proto_request = proto::GetRelSizeRequest::from(request);
+        let outcome = client.get_rel_size(tonic::Request::new(proto_request)).await;
+
+        if let Err(status) = outcome {
+            pooled_client.finish(Err(status.clone())).await; // tell the pool it failed
+            return Err(PageserverClientError::RequestError(status));
+        }
+
+        let resp = outcome.expect("error case returned above");
+        pooled_client.finish(Ok(())).await; // tell the pool it succeeded
+        Ok(resp.get_ref().num_blocks)
+    }
+
+    /// Request a single batch of pages.
+    ///
+    /// Opens a `get_pages` stream carrying exactly one request and reads the
+    /// first response from it. The pooled client is handed back to its pool
+    /// via `finish` on every exit path, including early failures.
+    ///
+    /// # Errors
+    /// * `RequestError` if opening the stream or the response itself fails.
+    /// * `Other` if the stream ends without producing a response.
+    //
+    // TODO: This opens a new gRPC stream for every request, which is extremely inefficient
+    pub async fn get_page(
+        &self,
+        request: GetPageRequest,
+    ) -> Result<Vec<Bytes>, PageserverClientError> {
+        // FIXME: calculate the shard number correctly
+        let shard = ShardIndex::unsharded();
+        let pooled_client = self.get_client(shard).await;
+        let chan = pooled_client.channel();
+
+        let mut client =
+            PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
+
+        let request = proto::GetPageRequest::from(request);
+
+        let request_stream = futures::stream::once(std::future::ready(request));
+
+        // Do not use `?` here: dropping `pooled_client` without `finish`
+        // would leave the pool's consumer count for this connection too high.
+        let opened = client.get_pages(tonic::Request::new(request_stream)).await;
+        let mut response_stream = match opened {
+            Ok(resp) => resp.into_inner(),
+            Err(status) => {
+                pooled_client.finish(Err(status.clone())).await;
+                return Err(PageserverClientError::RequestError(status));
+            }
+        };
+
+        let Some(response) = response_stream.next().await else {
+            let message = "no response received for getpage request";
+            // Only the success/failure bit matters to the pool.
+            pooled_client
+                .finish(Err(tonic::Status::internal(message)))
+                .await;
+            return Err(PageserverClientError::Other(message.to_string()));
+        };
+
+        if let Some(ref metrics) = self.aggregate_metrics {
+            metrics
+                .request_counters
+                .with_label_values(&["get_page"])
+                .inc();
+        }
+
+        match response {
+            Err(status) => {
+                pooled_client.finish(Err(status.clone())).await; // Pass error to finish
+                Err(PageserverClientError::RequestError(status))
+            }
+            Ok(resp) => {
+                pooled_client.finish(Ok(())).await; // Pass success to finish
+                let response: GetPageResponse = resp.into();
+                Ok(response.page_images.to_vec())
+            }
+        }
+    }
+
+    /// Open a stream for requesting pages.
+    ///
+    /// The pooled client is handed back to its pool as soon as the call that
+    /// opens the stream has completed, whether it succeeded or failed. The
+    /// returned stream keeps working on its own clone of the channel.
+    ///
+    /// # Errors
+    /// Returns `RequestError` if the stream could not be opened.
+    //
+    // TODO: This is a pretty low level interface, the caller should not need to be concerned
+    // with streams. But 'get_page' is currently very naive and inefficient.
+    pub async fn get_pages(
+        &self,
+        requests: impl Stream<Item = proto::GetPageRequest> + Send + 'static,
+    ) -> std::result::Result<
+        tonic::Response<tonic::codec::Streaming<proto::GetPageResponse>>,
+        PageserverClientError,
+    > {
+        // FIXME: calculate the shard number correctly
+        let shard = ShardIndex::unsharded();
+        let pooled_client = self.get_client(shard).await;
+        let chan = pooled_client.channel();
+
+        let mut client =
+            PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
+
+        let response = client.get_pages(tonic::Request::new(requests)).await;
+
+        match response {
+            Err(status) => {
+                pooled_client.finish(Err(status.clone())).await; // Pass error to finish
+                Err(PageserverClientError::RequestError(status))
+            }
+            Ok(resp) => {
+                // Return the client on success too, as `get_base_backup` does.
+                // Dropping it without `finish` would never decrement the
+                // pool's consumer count for this connection.
+                pooled_client.finish(Ok(())).await;
+                Ok(resp)
+            }
+        }
+    }
+
+    /// Send a `GetDbSize` request and return `num_bytes` from the reply.
+    ///
+    /// The pooled client is returned to its pool with the request's outcome
+    /// before this function returns.
+    pub async fn process_get_dbsize_request(
+        &self,
+        request: GetDbSizeRequest,
+    ) -> Result<u64, PageserverClientError> {
+        // Current sharding model assumes that all metadata is present only at shard 0.
+        let shard = ShardIndex::unsharded();
+        let pooled_client = self.get_client(shard).await;
+        let chan = pooled_client.channel();
+
+        let mut client =
+            PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
+
+        let proto_request = proto::GetDbSizeRequest::from(request);
+        let outcome = client.get_db_size(tonic::Request::new(proto_request)).await;
+
+        // Split the outcome into what the pool needs to hear and what the
+        // caller gets back, then report to the pool first.
+        let (pool_verdict, answer) = match outcome {
+            Ok(resp) => (Ok(()), Ok(resp.get_ref().num_bytes)),
+            Err(status) => (
+                Err(status.clone()),
+                Err(PageserverClientError::RequestError(status)),
+            ),
+        };
+        pooled_client.finish(pool_verdict).await;
+        answer
+    }
+    /// Start a base backup and return the response stream of backup chunks.
+    ///
+    /// When `gzip` is true the client is configured to accept gzip-compressed
+    /// responses. The pooled client is returned to its pool once the call
+    /// that opens the stream has completed.
+    pub async fn get_base_backup(
+        &self,
+        request: GetBaseBackupRequest,
+        gzip: bool,
+    ) -> std::result::Result<
+        tonic::Response<tonic::codec::Streaming<proto::GetBaseBackupResponseChunk>>,
+        PageserverClientError,
+    > {
+        // Current sharding model assumes that all metadata is present only at shard 0.
+        let shard = ShardIndex::unsharded();
+        let pooled_client = self.get_client(shard).await;
+        let chan = pooled_client.channel();
+
+        let plain_client =
+            PageServiceClient::with_interceptor(chan, self.auth_interceptor.for_shard(shard));
+        let mut client = if gzip {
+            plain_client.accept_compressed(tonic::codec::CompressionEncoding::Gzip)
+        } else {
+            plain_client
+        };
+
+        let proto_request = proto::GetBaseBackupRequest::from(request);
+        let outcome = client
+            .get_base_backup(tonic::Request::new(proto_request))
+            .await;
+
+        let resp = match outcome {
+            Ok(resp) => resp,
+            Err(status) => {
+                pooled_client.finish(Err(status.clone())).await; // report failure
+                return Err(PageserverClientError::RequestError(status));
+            }
+        };
+        pooled_client.finish(Ok(())).await; // report success
+        Ok(resp)
+    }
+    /// Get a client for given shard
+    ///
+    /// Get a client from the pool for this shard, also creating the pool if it doesn't exist.
+    /// Pool creation happens under the write lock with a re-check, so concurrent callers for
+    /// the same shard end up sharing a single pool.
+    ///
+    /// # Panics
+    /// Panics if `shard` is missing from the shard map, if the lock is poisoned, or if the
+    /// pool refuses to hand out a client.
+    async fn get_client(&self, shard: ShardIndex) -> client_cache::PooledClient<Channel> {
+        // Fast path: the pool already exists, a read lock is enough. The guard is a
+        // temporary and is released at the end of this statement.
+        let existing: Option<Arc<client_cache::ConnectionPool<Channel>>> =
+            self.channels.read().unwrap().get(&shard).cloned();
+
+        let pool = match existing {
+            Some(pool) => pool,
+            None => {
+                // Slow path. `entry` re-checks under the write lock: another caller may
+                // have created the pool since the read above, and inserting a second one
+                // would orphan the first.
+                let mut channels = self.channels.write().unwrap();
+                let pool = channels.entry(shard).or_insert_with(|| {
+                    let address = self
+                        .shard_map
+                        .get(&shard)
+                        .unwrap_or_else(|| panic!("no pageserver address for shard {shard}"))
+                        .clone();
+                    let channel_fact = Arc::new(client_cache::ChannelFactory::new(
+                        address,
+                        self.client_cache_options.max_delay_ms,
+                        self.client_cache_options.drop_rate,
+                        self.client_cache_options.hang_rate,
+                    ));
+                    client_cache::ConnectionPool::new(
+                        channel_fact,
+                        self.client_cache_options.connect_timeout,
+                        self.client_cache_options.connect_backoff,
+                        self.client_cache_options.max_consumers,
+                        self.client_cache_options.error_threshold,
+                        self.client_cache_options.max_idle_duration,
+                        self.client_cache_options.max_total_connections,
+                        self.aggregate_metrics.clone(),
+                    )
+                });
+                Arc::clone(pool)
+                // The write guard is dropped here, before the `.await` below.
+            }
+        };
+
+        pool.get_client().await.unwrap()
+    }
+}
+
+/// Inject tenant_id, timeline_id and authentication token to all pageserver requests.
+#[derive(Clone)]
+pub struct AuthInterceptor {
+    tenant_id: AsciiMetadataValue, // sent as the "neon-tenant-id" metadata entry
+    shard_id: Option<AsciiMetadataValue>, // sent as "neon-shard-id"; None until `for_shard`
+    timeline_id: AsciiMetadataValue, // sent as the "neon-timeline-id" metadata entry
+
+    auth_header: Option<AsciiMetadataValue>, // sent as "authorization"; includes "Bearer " prefix
+}
+
+impl AuthInterceptor {
+    /// Build an interceptor with no shard id set.
+    ///
+    /// # Panics
+    /// Panics if the tenant id, the timeline id, or `Bearer <token>` cannot
+    /// be parsed into a metadata value.
+    pub fn new(tenant_id: &str, timeline_id: &str, auth_token: Option<&str>) -> Self {
+        let tenant_id: AsciiMetadataValue = tenant_id.parse().expect("could not parse tenant id");
+        let timeline_id: AsciiMetadataValue =
+            timeline_id.parse().expect("could not parse timeline id");
+        let auth_header: Option<AsciiMetadataValue> = auth_token.map(|token| {
+            format!("Bearer {token}")
+                .parse()
+                .expect("could not parse auth token")
+        });
+
+        Self {
+            tenant_id,
+            shard_id: None,
+            timeline_id,
+            auth_header,
+        }
+    }
+
+    /// Return a copy of this interceptor with the shard id set to `shard_id`.
+    ///
+    /// # Panics
+    /// Panics if the shard's string form cannot be parsed into a metadata value.
+    fn for_shard(&self, shard_id: ShardIndex) -> Self {
+        let mut copy = Self::clone(self);
+        let shard_value: AsciiMetadataValue = shard_id
+            .to_string()
+            .parse()
+            .expect("could not parse shard id");
+        copy.shard_id = Some(shard_value);
+        copy
+    }
+}
+
+impl tonic::service::Interceptor for AuthInterceptor {
+    /// Add the tenant, shard (if set), timeline and authorization (if set)
+    /// metadata entries to `req`. Never returns an error.
+    fn call(&mut self, mut req: tonic::Request<()>) -> Result<tonic::Request<()>, tonic::Status> {
+        let metadata = req.metadata_mut();
+
+        metadata.insert("neon-tenant-id", self.tenant_id.clone());
+        if let Some(shard_id) = self.shard_id.as_ref() {
+            metadata.insert("neon-shard-id", shard_id.clone());
+        }
+        metadata.insert("neon-timeline-id", self.timeline_id.clone());
+        if let Some(auth_header) = self.auth_header.as_ref() {
+            metadata.insert("authorization", auth_header.clone());
+        }
+
+        Ok(req)
+    }
+}
--- a/pageserver/client_grpc/src/pool.rs
+++ b/pageserver/client_grpc/src/pool.rs
@@ -0,0 +1,548 @@
+//! This module provides various Pageserver gRPC client resource pools.
+//!
+//! These pools are designed to reuse gRPC resources (connections, clients, and streams) across
+//! multiple callers (i.e. Postgres backends). This avoids the resource cost and latency of creating
+//! a dedicated TCP connection and server task for every Postgres backend.
+//!
+//! Each resource has its own, nested pool. The pools are custom-built for the properties of each
+//! resource -- these are different enough that a generic pool isn't suitable.
+//!
+//! * ChannelPool: manages gRPC channels (TCP connections) to a single Pageserver. Multiple clients
+//!   can acquire and use the same channel concurrently (via HTTP/2 stream multiplexing), up to a
+//!   per-channel limit. Channels may be closed when they are no longer used by any clients.
+//!
+//! * ClientPool: manages gRPC clients for a single tenant shard. Each client acquires a (shared)
+//!   channel from the ChannelPool for the client's lifetime. A client can only be acquired by a
+//!   single caller at a time, and is returned to the pool when dropped. Idle clients may be
+//!   removed from the pool after some time, to free up the channel.
+//!
+//! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from
+//!   the ClientPool for the stream's lifetime. Internal streams are not exposed to callers;
+//!   instead, callers submit individual GetPage requests to the pool and await a response.
+//!   Internally, the pool will reuse or spin up a suitable stream for the request, possibly
+//!   pipelining multiple requests from multiple callers on the same stream (up to some queue
+//!   depth), and route the response back to the original caller. Idle streams may be removed from
+//!   the pool after some time, to free up the client.
+
+use std::collections::{BTreeMap, HashMap};
+use std::ops::{Deref, DerefMut};
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::{Arc, Mutex, Weak};
+
+use futures::StreamExt as _;
+use scopeguard::defer;
+use tokio::sync::mpsc::{Receiver, Sender};
+use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot};
+use tonic::transport::{Channel, Endpoint};
+use tracing::warn;
+
+use pageserver_page_api::{self as page_api, GetPageRequest, GetPageResponse};
+use utils::id::{TenantId, TimelineId};
+use utils::shard::ShardIndex;
+
+// TODO: tune these constants, and consider making them configurable.
+
+/// Max number of concurrent clients per channel.
+///
+/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels
+/// with only streams.
+const CLIENTS_PER_CHANNEL: usize = 16;
+
+/// Maximum number of concurrent clients per `ClientPool`. This bounds the number of channels as
+/// CLIENT_LIMIT / CLIENTS_PER_CHANNEL.
+///
+/// This is the size of the `ClientPool` semaphore. Together with `STREAM_QUEUE_DEPTH` it also
+/// sizes the `StreamPool` request limiter.
+const CLIENT_LIMIT: usize = 64;
+
+/// Max number of pipelined requests per gRPC GetPage stream.
+///
+/// Also used as the capacity of the per-stream request channels.
+const STREAM_QUEUE_DEPTH: usize = 2;
+
+/// A gRPC channel pool, for a single Pageserver. A channel is shared by many clients (via HTTP/2
+/// stream multiplexing), up to `CLIENTS_PER_CHANNEL`. The pool does not limit the number of
+/// channels, and instead relies on `ClientPool` to limit the number of concurrent clients.
+///
+/// The pool is always wrapped in an outer `Arc`, to allow long-lived references from guards.
+///
+/// Tonic will automatically retry the underlying connection if it fails, so there is no need
+/// to re-establish connections on errors.
+///
+/// TODO: reap idle channels.
+/// TODO: consider adding a circuit breaker for errors and fail fast.
+pub struct ChannelPool {
+    /// Pageserver endpoint to connect to.
+    endpoint: Endpoint,
+    /// Open channels, keyed by channel ID. `get()` scans these in ascending ID order and hands
+    /// out the first channel that has spare client capacity.
+    channels: Mutex<BTreeMap<ChannelID, ChannelEntry>>,
+}
+
+/// Identifies a channel within a `ChannelPool`; the key of `ChannelPool::channels`.
+type ChannelID = usize;
+
+/// A pooled channel together with its usage count.
+struct ChannelEntry {
+    /// The gRPC channel (i.e. TCP connection). Shared by multiple clients.
+    channel: Channel,
+    /// Number of clients using this channel. Incremented by `ChannelPool::get()` and decremented
+    /// when the corresponding `ChannelGuard` is dropped.
+    clients: usize,
+}
+
+impl ChannelPool {
+    /// Creates a new channel pool for the given Pageserver endpoint.
+    ///
+    /// Returns an error if `endpoint` cannot be converted into an `Endpoint`. No connection is
+    /// attempted here; channels are created by `get()`.
+    pub fn new<E>(endpoint: E) -> anyhow::Result<Arc<Self>>
+    where
+        E: TryInto<Endpoint> + Send + Sync + 'static,
+        <E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
+    {
+        Ok(Arc::new(Self {
+            endpoint: endpoint.try_into()?,
+            channels: Default::default(),
+        }))
+    }
+
+    /// Acquires a gRPC channel for a client. Multiple clients may acquire the same channel.
+    ///
+    /// This never blocks (except for sync mutex acquisition). The channel is connected lazily on
+    /// first use, and the `ChannelPool` does not have a channel limit.
+    ///
+    /// Callers should not clone the returned channel, and must hold onto the returned guard as long
+    /// as the channel is in use. It is unfortunately not possible to enforce this: the Protobuf
+    /// client requires an owned `Channel` and we don't have access to the channel's internal
+    /// refcount.
+    ///
+    /// NB: this is not very performance-sensitive. It is only called when creating a new client,
+    /// and clients are cached and reused by ClientPool. The total number of channels will also be
+    /// small. O(n) performance is therefore okay.
+    pub fn get(self: &Arc<Self>) -> anyhow::Result<ChannelGuard> {
+        let mut channels = self.channels.lock().unwrap();
+
+        // Try to find an existing channel with available capacity. We check entries in BTreeMap
+        // order, to fill up the lower-ordered channels first. The ClientPool also uses clients with
+        // lower-ordered channel IDs first. This will cluster clients in lower-ordered channels, and
+        // free up higher-ordered channels such that they can be reaped.
+        for (&id, entry) in channels.iter_mut() {
+            assert!(entry.clients <= CLIENTS_PER_CHANNEL, "channel overflow");
+            if entry.clients < CLIENTS_PER_CHANNEL {
+                entry.clients += 1;
+                return Ok(ChannelGuard {
+                    pool: Arc::downgrade(self),
+                    id,
+                    channel: Some(entry.channel.clone()),
+                });
+            }
+        }
+
+        // Create a new channel. We connect lazily on the first use, such that we don't block here
+        // and other clients can join onto the same channel while it's connecting.
+        let channel = self.endpoint.connect_lazy();
+
+        // Allocate the ID following the highest existing one (or 0 for the first channel). Reusing
+        // the highest ID itself would overwrite that (full) channel's entry, losing its channel and
+        // its client count while guards for it are still outstanding.
+        let id = channels.last_key_value().map_or(0, |(&last, _)| last + 1);
+        let entry = ChannelEntry {
+            channel: channel.clone(),
+            clients: 1, // we're returning the guard below
+        };
+        channels.insert(id, entry);
+
+        Ok(ChannelGuard {
+            pool: Arc::downgrade(self),
+            id,
+            channel: Some(channel),
+        })
+    }
+}
+
+/// Tracks a channel acquired from the pool. The owned inner channel can be obtained with `take()`.
+/// However, the caller must hold onto the guard as long as it's using the channel, and should not
+/// clone it.
+pub struct ChannelGuard {
+    /// The pool the channel was acquired from. Held weakly, so a guard does not keep the pool
+    /// alive.
+    pool: Weak<ChannelPool>,
+    /// ID of the acquired channel within the pool.
+    id: ChannelID,
+    /// The acquired channel. `None` once `take()` has been called.
+    channel: Option<Channel>,
+}
+
+impl ChannelGuard {
+    /// Moves the inner channel out of the guard, leaving the guard empty. Panics on a second
+    /// call. The guard must still be kept alive for as long as the channel is used, and the
+    /// channel should not be cloned.
+    pub fn take(&mut self) -> Channel {
+        let taken: Option<Channel> = std::mem::take(&mut self.channel);
+        taken.expect("channel already taken")
+    }
+}
+
+/// Releases this guard's client slot on its channel when dropped.
+impl Drop for ChannelGuard {
+    fn drop(&mut self) {
+        // If the pool is already gone there is no client count left to update.
+        if let Some(pool) = self.pool.upgrade() {
+            let mut channels = pool.channels.lock().unwrap();
+            let entry = channels.get_mut(&self.id).expect("unknown channel");
+            assert!(entry.clients > 0, "channel underflow");
+            entry.clients -= 1;
+        }
+    }
+}
+
+/// A pool of gRPC clients for a single tenant shard. Each client acquires a channel from the inner
+/// `ChannelPool`. A client is only acquired by a single caller at a time. The pool limits the total
+/// number of concurrent clients to `CLIENT_LIMIT` via semaphore.
+///
+/// The pool is always wrapped in an outer `Arc`, to allow long-lived references from guards.
+///
+/// TODO: reap idle clients.
+/// TODO: error handling (but channel will be reconnected automatically).
+/// TODO: rate limiting.
+pub struct ClientPool {
+    /// Tenant ID.
+    tenant_id: TenantId,
+    /// Timeline ID.
+    timeline_id: TimelineId,
+    /// Shard ID.
+    shard_id: ShardIndex,
+    /// Authentication token, if any.
+    auth_token: Option<String>,
+    /// Channel pool to acquire channels from.
+    channel_pool: Arc<ChannelPool>,
+    /// Limits the max number of concurrent clients for this pool. Created with `CLIENT_LIMIT`
+    /// permits; each `ClientGuard` owns one permit until it is dropped.
+    limiter: Arc<Semaphore>,
+    /// Idle pooled clients. Acquired clients are removed from here and returned on drop.
+    ///
+    /// The first client in the map will be acquired next. The map is sorted by client ID, which in
+    /// turn is sorted by the channel ID, such that we prefer acquiring idle clients from
+    /// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle
+    /// clients are reaped.
+    idle: Mutex<BTreeMap<ClientID, ClientEntry>>,
+    /// Unique client ID generator. Supplies the second component of each new `ClientID`.
+    next_client_id: AtomicUsize,
+}
+
+/// Identifies a pooled client: the ID of the channel it was built on, followed by a pool-unique
+/// counter value. Ordering by this tuple therefore orders clients by channel ID first.
+type ClientID = (ChannelID, usize);
+
+/// An idle client held by the pool.
+struct ClientEntry {
+    /// The gRPC client.
+    client: page_api::Client,
+    /// Guard for the channel the client was built on. Kept alongside the client so the channel's
+    /// client count stays accurate for as long as the client exists.
+    channel_guard: ChannelGuard,
+}
+
+impl ClientPool {
+    /// Builds a client pool for one tenant shard. Clients created by the pool take their channels
+    /// from `channel_pool`, which must point to a Pageserver that hosts the tenant shard.
+    pub fn new(
+        channel_pool: Arc<ChannelPool>,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_id: ShardIndex,
+        auth_token: Option<String>,
+    ) -> Arc<Self> {
+        let pool = Self {
+            tenant_id,
+            timeline_id,
+            shard_id,
+            auth_token,
+            channel_pool,
+            limiter: Arc::new(Semaphore::new(CLIENT_LIMIT)),
+            idle: Mutex::new(BTreeMap::new()),
+            next_client_id: AtomicUsize::new(0),
+        };
+        Arc::new(pool)
+    }
+
+    /// Hands out a client, reusing an idle one when available and constructing a new one
+    /// otherwise. Waits while `CLIENT_LIMIT` clients are already checked out. Dropping the
+    /// returned guard puts the client back into the pool.
+    ///
+    /// Returns an error if a channel cannot be acquired or the client cannot be constructed.
+    ///
+    /// This is moderately performance-sensitive: every unary request goes through here, although
+    /// each of those already pays for a new gRPC stream. GetPage requests go through the
+    /// `StreamPool` instead.
+    pub async fn get(self: &Arc<Self>) -> anyhow::Result<ClientGuard> {
+        let permit = Arc::clone(&self.limiter)
+            .acquire_owned()
+            .await
+            .expect("never closed");
+
+        // Fast path: reuse the lowest-ordered idle client, if there is one.
+        let reusable = self.idle.lock().unwrap().pop_first();
+        if let Some((id, ClientEntry { client, channel_guard })) = reusable {
+            return Ok(ClientGuard {
+                pool: Arc::downgrade(self),
+                id,
+                client: Some(client),
+                channel_guard: Some(channel_guard),
+                permit,
+            });
+        }
+
+        // Slow path: build a new client on a channel from the channel pool.
+        let mut channel_guard = self.channel_pool.get()?;
+        let channel = channel_guard.take();
+        let client = page_api::Client::new(
+            channel,
+            self.tenant_id,
+            self.timeline_id,
+            self.shard_id,
+            self.auth_token.clone(),
+            None,
+        )?;
+
+        // The client ID leads with the channel ID, so idle clients sort by channel.
+        let sequence = self.next_client_id.fetch_add(1, Ordering::Relaxed);
+        let id: ClientID = (channel_guard.id, sequence);
+
+        Ok(ClientGuard {
+            pool: Arc::downgrade(self),
+            id,
+            client: Some(client),
+            channel_guard: Some(channel_guard),
+            permit,
+        })
+    }
+}
+
+/// A client acquired from the pool. The inner client can be accessed via derefs. The client is
+/// returned to the pool when dropped.
+pub struct ClientGuard {
+    /// The pool the client was acquired from. Held weakly, so a guard does not keep the pool
+    /// alive; if the pool is gone on drop, the client is not returned to it.
+    pool: Weak<ClientPool>,
+    /// The client's ID, used as its key when it is returned to the pool's idle map.
+    id: ClientID,
+    client: Option<page_api::Client>,    // Some until dropped
+    channel_guard: Option<ChannelGuard>, // Some until dropped
+    /// Permit from the pool's client limiter, released when the guard is dropped.
+    permit: OwnedSemaphorePermit,
+}
+
+impl Deref for ClientGuard {
+    type Target = page_api::Client;
+
+    /// Borrows the pooled client. The client is present for the guard's whole lifetime; it is
+    /// only taken out by the drop handler.
+    fn deref(&self) -> &Self::Target {
+        let client: Option<&page_api::Client> = self.client.as_ref();
+        client.expect("not dropped")
+    }
+}
+
+impl DerefMut for ClientGuard {
+    /// Mutably borrows the pooled client. The client is present for the guard's whole lifetime;
+    /// it is only taken out by the drop handler.
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        let client: Option<&mut page_api::Client> = self.client.as_mut();
+        client.expect("not dropped")
+    }
+}
+
+// Hands the client and its channel guard back to the pool's idle set on drop.
+impl Drop for ClientGuard {
+    fn drop(&mut self) {
+        // If the pool is already gone, the client and channel guard are simply dropped with
+        // the guard.
+        if let Some(pool) = self.pool.upgrade() {
+            let client = self.client.take().expect("dropped once");
+            let channel_guard = self.channel_guard.take().expect("dropped once");
+            let entry = ClientEntry {
+                client,
+                channel_guard,
+            };
+            pool.idle.lock().unwrap().insert(self.id, entry);
+
+            // Nothing to do for the permit: its own drop handler releases it once this guard's
+            // fields are dropped. It is named here purely for visibility.
+            _ = self.permit;
+        }
+    }
+}
+
+/// A pool of bidirectional gRPC streams. Currently only used for GetPage streams. Each stream
+/// acquires a client from the inner `ClientPool` for the stream's lifetime.
+///
+/// Individual streams are not exposed to callers -- instead, callers submit individual requests to
+/// the pool and await a response. Internally, requests are multiplexed across streams and channels.
+///
+/// TODO: reap idle streams.
+/// TODO: error handling (but channel will be reconnected automatically).
+/// TODO: rate limiting.
+/// TODO: consider making this generic over request and response types; not currently needed.
+pub struct StreamPool {
+    /// The client pool to acquire clients from.
+    client_pool: Arc<ClientPool>,
+    /// All pooled streams.
+    ///
+    /// Incoming requests will be sent over an existing stream with available capacity, or a new
+    /// stream is spun up and added to the pool. Each stream has an associated Tokio task that
+    /// processes requests and responses.
+    streams: Arc<Mutex<HashMap<StreamID, StreamEntry>>>,
+    /// Limits the max number of concurrent requests (not streams). Created with
+    /// `CLIENT_LIMIT * STREAM_QUEUE_DEPTH` permits.
+    limiter: Semaphore,
+    /// Stream ID generator.
+    next_stream_id: AtomicUsize,
+}
+
+/// Identifies a stream within a `StreamPool`.
+type StreamID = usize;
+/// Caller-side handle for submitting a request, paired with the channel its response should be
+/// delivered on, to a stream task.
+type RequestSender = Sender<(GetPageRequest, ResponseSender)>;
+/// Stream-task side of a `RequestSender`.
+type RequestReceiver = Receiver<(GetPageRequest, ResponseSender)>;
+/// One-shot channel on which a stream task delivers the response (or error) for one request.
+type ResponseSender = oneshot::Sender<tonic::Result<GetPageResponse>>;
+
+/// A pooled stream, as seen by callers.
+struct StreamEntry {
+    /// Sends caller requests to the stream task. The stream task exits when this is dropped.
+    sender: RequestSender,
+    /// Number of in-flight requests on this stream. This is an atomic to allow decrementing it on
+    /// completion without acquiring the `StreamPool::streams` lock.
+    queue_depth: Arc<AtomicUsize>,
+}
+
+impl StreamPool {
+    /// Creates a new stream pool, using the given client pool.
+    pub fn new(client_pool: Arc<ClientPool>) -> Self {
+        Self {
+            client_pool,
+            streams: Arc::default(),
+            limiter: Semaphore::new(CLIENT_LIMIT * STREAM_QUEUE_DEPTH),
+            next_stream_id: AtomicUsize::default(),
+        }
+    }
+
+    /// Sends a request via the stream pool and awaits the response. Blocks if the pool is at
+    /// capacity (i.e. `CLIENT_LIMIT * STREAM_QUEUE_DEPTH` requests in flight). The
+    /// `GetPageRequest::request_id` must be unique across in-flight requests.
+    ///
+    /// Returns `Status::unavailable` if the stream task went away before the request could be
+    /// submitted or answered.
+    ///
+    /// NB: errors are often returned as `GetPageResponse::status_code` instead of `tonic::Status`
+    /// to avoid tearing down the stream for per-request errors. Callers must check this.
+    ///
+    /// This is very performance-sensitive, as it is on the GetPage hot path.
+    ///
+    /// TODO: this must do something more sophisticated for performance. We want:
+    /// * Cheap, concurrent access in the common case where we can use a pooled stream.
+    /// * Quick acquisition of pooled streams with available capacity.
+    /// * Prefer streams that belong to lower-numbered channels, to reap idle channels.
+    /// * Prefer filling up existing streams' queue depth before spinning up new streams.
+    /// * Don't hold a lock while spinning up new streams.
+    /// * Allow concurrent clients to join onto streams while they're spun up.
+    /// * Allow spinning up multiple streams concurrently, but don't overshoot limits.
+    ///
+    /// For now, we just do something simple and functional, but very inefficient (linear scan).
+    pub async fn send(&self, req: GetPageRequest) -> tonic::Result<GetPageResponse> {
+        // Acquire a permit. For simplicity, we drop it when this method returns. This may exceed
+        // the queue depth if a caller goes away while a request is in flight, but that's okay. We
+        // do the same for queue depth tracking.
+        let _permit = self.limiter.acquire().await.expect("never closed");
+
+        // Acquire a stream sender. We increment and decrement the queue depth here instead of in
+        // the stream task to ensure we don't exceed the queue depth limit.
+        #[allow(clippy::await_holding_lock)] // TODO: Clippy doesn't understand drop()
+        let (req_tx, queue_depth) = async {
+            let mut streams = self.streams.lock().unwrap();
+
+            // Try to find an existing stream with available capacity.
+            for entry in streams.values() {
+                assert!(
+                    entry.queue_depth.load(Ordering::Relaxed) <= STREAM_QUEUE_DEPTH,
+                    "stream overflow"
+                );
+                if entry
+                    .queue_depth
+                    .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |queue_depth| {
+                        // Increment the queue depth via compare-and-swap.
+                        // TODO: review ordering.
+                        (queue_depth < STREAM_QUEUE_DEPTH).then_some(queue_depth + 1)
+                    })
+                    .is_ok()
+                {
+                    return anyhow::Ok((entry.sender.clone(), entry.queue_depth.clone()));
+                }
+            }
+
+            // No available stream, spin up a new one. We install the stream entry first and release
+            // the lock, to allow other callers to join onto this stream and also create additional
+            // streams concurrently when this fills up.
+            let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed);
+            let queue_depth = Arc::new(AtomicUsize::new(1)); // account for this request
+            let (req_tx, req_rx) = mpsc::channel(STREAM_QUEUE_DEPTH);
+            let entry = StreamEntry {
+                sender: req_tx.clone(),
+                queue_depth: queue_depth.clone(),
+            };
+            streams.insert(id, entry);
+
+            drop(streams); // drop lock before spinning up stream
+
+            let client_pool = self.client_pool.clone();
+            // The task only holds a weak reference to the stream map. The map owns the sender for
+            // the task's own request channel, so a strong reference here would keep that sender
+            // (and thus the task, its client and its channel) alive after the pool is dropped.
+            let streams = Arc::downgrade(&self.streams);
+
+            tokio::spawn(async move {
+                if let Err(err) = Self::run_stream(client_pool, req_rx).await {
+                    warn!("stream failed: {err}");
+                }
+                // Remove stream from pool on exit. If the pool itself is gone, there is nothing
+                // left to remove the stream from.
+                if let Some(streams) = streams.upgrade() {
+                    let entry = streams.lock().unwrap().remove(&id);
+                    assert!(entry.is_some(), "unknown stream ID: {id}");
+                }
+            });
+
+            anyhow::Ok((req_tx, queue_depth))
+        }
+        .await
+        .map_err(|err| tonic::Status::internal(err.to_string()))?;
+
+        // Decrement the queue depth on return. This may prematurely decrement it if the caller goes
+        // away while the request is in flight, but that's okay.
+        defer!(
+            let prev_queue_depth = queue_depth.fetch_sub(1, Ordering::SeqCst);
+            assert!(prev_queue_depth > 0, "stream underflow");
+        );
+
+        // Send the request and wait for the response.
+        let (resp_tx, resp_rx) = oneshot::channel();
+
+        req_tx
+            .send((req, resp_tx))
+            .await
+            .map_err(|_| tonic::Status::unavailable("stream closed"))?;
+
+        resp_rx
+            .await
+            .map_err(|_| tonic::Status::unavailable("stream closed"))?
+    }
+
+    /// Runs a stream task. This acquires a client from the `ClientPool` and establishes a
+    /// bidirectional GetPage stream, then forwards requests and responses between callers and the
+    /// stream. It does not track or enforce queue depths, see `send()`.
+    ///
+    /// The task exits when the request channel is closed, or on a stream error. The caller is
+    /// responsible for removing the stream from the pool on exit.
+    async fn run_stream(
+        client_pool: Arc<ClientPool>,
+        mut caller_rx: RequestReceiver,
+    ) -> anyhow::Result<()> {
+        // Acquire a client from the pool and create a stream.
+        let mut client = client_pool.get().await?;
+
+        let (req_tx, req_rx) = mpsc::channel(STREAM_QUEUE_DEPTH);
+        let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx);
+        let mut resp_stream = client.get_pages(req_stream).await?;
+
+        // Track caller response channels by request ID. If the task returns early, these response
+        // channels will be dropped and the callers will receive an error.
+        let mut callers = HashMap::with_capacity(STREAM_QUEUE_DEPTH);
+
+        // Process requests and responses.
+        loop {
+            // NB: this can trip if the server doesn't respond to a request, so only debug_assert.
+            debug_assert!(callers.len() <= STREAM_QUEUE_DEPTH, "stream overflow");
+
+            tokio::select! {
+                // Receive requests from callers and send them to the stream.
+                req = caller_rx.recv() => {
+                    // Shut down if request channel is closed.
+                    let Some((req, resp_tx)) = req else {
+                        return Ok(());
+                    };
+
+                    // Store the response channel by request ID.
+                    if callers.contains_key(&req.request_id) {
+                        // Error on request ID duplicates. Ignore callers that went away.
+                        _ = resp_tx.send(Err(tonic::Status::invalid_argument(
+                            format!("duplicate request ID: {}", req.request_id),
+                        )));
+                        continue;
+                    }
+                    callers.insert(req.request_id, resp_tx);
+
+                    // Send the request on the stream. Bail out if the send fails.
+                    req_tx.send(req).await.map_err(|_| {
+                        tonic::Status::unavailable("stream closed")
+                    })?;
+                }
+
+                // Receive responses from the stream and send them to callers.
+                resp = resp_stream.next() => {
+                    // Shut down if the stream is closed, and bail out on stream errors.
+                    let Some(resp) = resp.transpose()? else {
+                        return Ok(())
+                    };
+
+                    // Send the response to the caller. Ignore errors if the caller went away.
+                    let Some(resp_tx) = callers.remove(&resp.request_id) else {
+                        warn!("received response for unknown request ID: {}", resp.request_id);
+                        continue;
+                    };
+                    _ = resp_tx.send(Ok(resp));
+                }
+            }
+        }
+    }
+}
--- a/pageserver/client_grpc/src/request_tracker.rs
+++ b/pageserver/client_grpc/src/request_tracker.rs
@@ -0,0 +1,577 @@
+//! The request tracker dispatches GetPage- and other requests to pageservers, managing a pool of
+//! connections and gRPC streams.
+//!
+//! There is usually one global instance of ShardedRequestTracker in an application, in particular
+//! in the neon extension's communicator process. The application calls the async functions in
+//! ShardedRequestTracker, which routes them to the correct pageservers, taking sharding into
+//! account. In the future, there can be multiple pageservers per shard, and RequestTracker manages
+//! load balancing between them, but that's not implemented yet.
+
+use crate::AuthInterceptor;
+use crate::ClientCacheOptions;
+use crate::PageserverClientAggregateMetrics;
+use crate::client_cache;
+use crate::client_cache::ChannelFactory;
+use crate::client_cache::ConnectionPool;
+use pageserver_page_api::GetPageRequest;
+use pageserver_page_api::GetPageResponse;
+use pageserver_page_api::proto;
+use pageserver_page_api::*;
+use std::sync::Arc;
+use std::sync::atomic::AtomicU64;
+use tonic::{Request, transport::Channel};
+
+use utils::shard::ShardIndex;
+
+use pageserver_page_api::proto::PageServiceClient;
+use tokio_stream::wrappers::ReceiverStream;
+
+use tonic::{Code, Status};
+
+use async_trait::async_trait;
+use std::time::Duration;
+
+use client_cache::PooledItemFactory;
+
+/// StreamReturner represents a gRPC stream to a pageserver.
+///
+/// To send a request:
+/// 1. insert the request's ID, along with a channel to receive the response
+/// 2. send the request to 'sender'
+#[derive(Clone)]
+pub struct StreamReturner {
+    /// Feeds requests into the outbound side of the GetPage stream.
+    sender: tokio::sync::mpsc::Sender<proto::GetPageRequest>,
+    /// Response channels of in-flight requests, keyed by request ID. The stream's receiver task
+    /// (spawned in `StreamFactory::create`) sets this to `None` once the stream has ended.
+    #[allow(clippy::type_complexity)]
+    sender_hashmap: Arc<
+        tokio::sync::Mutex<
+            Option<
+                std::collections::HashMap<
+                    u64,
+                    tokio::sync::mpsc::Sender<Result<proto::GetPageResponse, Status>>,
+                >,
+            >,
+        >,
+    >,
+}
+
+/// Creates `StreamReturner`s (GetPage streams) for a connection pool, via its
+/// `PooledItemFactory` implementation.
+pub struct StreamFactory {
+    /// Pool of channels on which new streams are opened.
+    connection_pool: Arc<client_cache::ConnectionPool<Channel>>,
+    /// Base interceptor; a per-shard copy is attached to every stream's client.
+    auth_interceptor: AuthInterceptor,
+    /// Shard that created streams are tagged with.
+    shard: ShardIndex,
+}
+
+impl StreamFactory {
+    /// Builds a factory that opens streams on channels from `connection_pool`, tagging requests
+    /// with `auth_interceptor`'s metadata plus the given `shard`.
+    pub fn new(
+        connection_pool: Arc<ConnectionPool<Channel>>,
+        auth_interceptor: AuthInterceptor,
+        shard: ShardIndex,
+    ) -> Self {
+        Self {
+            shard,
+            auth_interceptor,
+            connection_pool,
+        }
+    }
+}
+
+#[async_trait]
+impl PooledItemFactory<StreamReturner> for StreamFactory {
+    /// Opens a new GetPage stream on a channel from the connection pool and spawns a task that
+    /// routes the stream's responses back to the callers registered in the returned
+    /// `StreamReturner`'s map.
+    ///
+    /// Returns `Ok(Err(status))` if the stream cannot be established. `_connect_timeout` is
+    /// currently unused, so `Elapsed` is never returned from here.
+    ///
+    /// NOTE(review): panics if no channel can be obtained from the connection pool (`unwrap()`);
+    /// the pool's error type is not visible here, so it is not converted to a `Status`.
+    async fn create(
+        &self,
+        _connect_timeout: Duration,
+    ) -> Result<Result<StreamReturner, tonic::Status>, tokio::time::error::Elapsed> {
+        let pool_clone: Arc<ConnectionPool<Channel>> = Arc::clone(&self.connection_pool);
+        let pooled_client = pool_clone.get_client().await;
+        let channel = pooled_client.unwrap().channel();
+        let mut client = PageServiceClient::with_interceptor(
+            channel,
+            self.auth_interceptor.for_shard(self.shard),
+        );
+
+        let (sender, receiver) = tokio::sync::mpsc::channel::<proto::GetPageRequest>(1000);
+        let outbound = ReceiverStream::new(receiver);
+
+        let client_resp = client.get_pages(Request::new(outbound)).await;
+
+        match client_resp {
+            Err(status) => {
+                // TODO: Convert this error correctly
+                Ok(Err(tonic::Status::new(
+                    status.code(),
+                    format!("Failed to connect to pageserver: {}", status.message()),
+                )))
+            }
+            Ok(resp) => {
+                let stream_returner = StreamReturner {
+                    sender: sender.clone(),
+                    sender_hashmap: Arc::new(tokio::sync::Mutex::new(Some(
+                        std::collections::HashMap::new(),
+                    ))),
+                };
+                let map = Arc::clone(&stream_returner.sender_hashmap);
+
+                tokio::spawn(async move {
+                    let mut inner = resp.into_inner();
+                    loop {
+                        match inner.message().await {
+                            Err(e) => {
+                                tracing::info!("error received on getpage stream: {e}");
+                                break; // The stream failed
+                            }
+                            Ok(None) => {
+                                break; // The stream ended, no more messages
+                            }
+                            Ok(Some(response)) => {
+                                let request_id = response.request_id;
+
+                                // Take the caller's sender out of the map. The lock is released
+                                // at the end of this statement, so it is not held while the
+                                // response is being delivered.
+                                let sender = map
+                                    .lock()
+                                    .await
+                                    .as_mut()
+                                    .expect("no other task clears the hashmap")
+                                    .remove(&request_id);
+
+                                match sender {
+                                    Some(sender) => {
+                                        // Send the response to the original request sender. The
+                                        // response is moved, not cloned.
+                                        if let Err(e) = sender.send(Ok(response)).await {
+                                            tracing::warn!("Failed to send response: {e}");
+                                        }
+                                    }
+                                    None => {
+                                        tracing::warn!(
+                                            "No sender found for request ID: {}",
+                                            request_id
+                                        );
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    // Don't accept any more requests
+
+                    // Notify every remaining caller that the stream is gone. The lock is held
+                    // until the map has been cleared, so nothing can be registered in between.
+                    let mut hashmap_opt = map.lock().await;
+                    let hashmap = hashmap_opt
+                        .as_mut()
+                        .expect("no other task clears the hashmap");
+                    for sender in hashmap.values() {
+                        let error = Status::new(Code::Unknown, "Stream closed");
+                        if let Err(e) = sender.send(Err(error)).await {
+                            tracing::warn!("Failed to send close response: {e}");
+                        }
+                    }
+                    *hashmap_opt = None;
+                });
+
+                Ok(Ok(stream_returner))
+            }
+        }
+    }
+}
+
+/// Sends requests for one shard to a pageserver, using a pool of GetPage streams and a pool of
+/// channels for unary requests.
+#[derive(Clone)]
+pub struct RequestTracker {
+    // Counter initialised to 0 in `new()`. NOTE(review): not read in the visible code;
+    // presumably reserved for request ID generation -- confirm.
+    _cur_id: Arc<AtomicU64>,
+    // Pool of GetPage streams.
+    stream_pool: Arc<ConnectionPool<StreamReturner>>,
+    // Pool of channels used for unary requests.
+    unary_pool: Arc<ConnectionPool<Channel>>,
+    // Base interceptor; a per-shard copy is attached to every unary client.
+    auth_interceptor: AuthInterceptor,
+    // Shard that requests are tagged with.
+    shard: ShardIndex,
+}
+
+impl RequestTracker {
+    /// Builds a tracker for `shard` on top of the given stream and unary pools. The internal
+    /// counter starts at zero.
+    pub fn new(
+        stream_pool: Arc<ConnectionPool<StreamReturner>>,
+        unary_pool: Arc<ConnectionPool<Channel>>,
+        auth_interceptor: AuthInterceptor,
+        shard: ShardIndex,
+    ) -> Self {
+        Self {
+            _cur_id: Arc::new(AtomicU64::new(0)),
+            stream_pool,
+            unary_pool,
+            auth_interceptor,
+            shard,
+        }
+    }
+
+    /// Issues a unary CheckRelExists request on a pooled channel and returns the `exists` flag
+    /// of the response.
+    ///
+    /// On a gRPC error the error is reported to the pool and the request is retried on a freshly
+    /// acquired client, immediately and without limit; as written, this never returns `Err`.
+    /// Panics if no client can be obtained from the pool.
+    pub async fn send_process_check_rel_exists_request(
+        &self,
+        req: CheckRelExistsRequest,
+    ) -> Result<bool, tonic::Status> {
+        loop {
+            let pool = Arc::clone(&self.unary_pool);
+            let pooled = pool.get_client().await.unwrap();
+
+            let channel = pooled.channel();
+            let interceptor = self.auth_interceptor.for_shard(self.shard);
+            let mut grpc = PageServiceClient::with_interceptor(channel, interceptor);
+
+            let proto_req = proto::CheckRelExistsRequest::from(req);
+            let outcome = grpc.check_rel_exists(tonic::Request::new(proto_req)).await;
+
+            let resp = match outcome {
+                Ok(resp) => resp,
+                Err(status) => {
+                    // Report the failure to the pool, then retry with a new client.
+                    pooled.finish(Err(status)).await;
+                    continue;
+                }
+            };
+
+            // Report success to the pool before handing back the result.
+            pooled.finish(Ok(())).await;
+            return Ok(resp.get_ref().exists);
+        }
+    }
+
+    /// Fetches the size of a relation, using a channel from the unary pool.
+    ///
+    /// Every failure is retried: if the pool cannot hand out a channel, or the RPC returns an
+    /// error status, the loop starts over with a fresh `get_client` call. The function therefore
+    /// only returns once a request has succeeded, yielding the `num_blocks` field of the response.
+    pub async fn send_process_get_rel_size_request(
+        &self,
+        req: GetRelSizeRequest,
+    ) -> Result<u32, tonic::Status> {
+        loop {
+            // Current sharding model assumes that all metadata is present only at shard 0.
+            let unary_pool = Arc::clone(&self.unary_pool);
+            // A pool failure must not panic the caller; retry instead, exactly as
+            // `send_getpage_request` does when the stream pool cannot hand out a stream.
+            let pooled_client = match unary_pool.get_client().await {
+                Ok(client) => client,
+                Err(_e) => {
+                    tracing::warn!("failed to get a unary client from the pool, retrying");
+                    continue;
+                }
+            };
+            let mut ps_client = PageServiceClient::with_interceptor(
+                pooled_client.channel(),
+                self.auth_interceptor.for_shard(self.shard),
+            );
+
+            let request = proto::GetRelSizeRequest::from(req);
+            let response = ps_client.get_rel_size(tonic::Request::new(request)).await;
+
+            match response {
+                Err(status) => {
+                    // Hand the error status back to the pool via `finish`, then retry.
+                    pooled_client.finish(Err(status)).await;
+                }
+                Ok(resp) => {
+                    pooled_client.finish(Ok(())).await;
+                    return Ok(resp.get_ref().num_blocks);
+                }
+            }
+        }
+    }
+
+    /// Fetches the total size of a database, using a channel from the unary pool.
+    ///
+    /// Every failure is retried: if the pool cannot hand out a channel, or the RPC returns an
+    /// error status, the loop starts over with a fresh `get_client` call. The function therefore
+    /// only returns once a request has succeeded, yielding the `num_bytes` field of the response.
+    pub async fn send_process_get_dbsize_request(
+        &self,
+        req: GetDbSizeRequest,
+    ) -> Result<u64, tonic::Status> {
+        loop {
+            // Current sharding model assumes that all metadata is present only at shard 0.
+            let unary_pool = Arc::clone(&self.unary_pool);
+            // A pool failure must not panic the caller; retry instead, exactly as
+            // `send_getpage_request` does when the stream pool cannot hand out a stream.
+            let pooled_client = match unary_pool.get_client().await {
+                Ok(client) => client,
+                Err(_e) => {
+                    tracing::warn!("failed to get a unary client from the pool, retrying");
+                    continue;
+                }
+            };
+            let mut ps_client = PageServiceClient::with_interceptor(
+                pooled_client.channel(),
+                self.auth_interceptor.for_shard(self.shard),
+            );
+
+            let request = proto::GetDbSizeRequest::from(req);
+            let response = ps_client.get_db_size(tonic::Request::new(request)).await;
+
+            match response {
+                Err(status) => {
+                    // Hand the error status back to the pool via `finish`, then retry.
+                    pooled_client.finish(Err(status)).await;
+                }
+                Ok(resp) => {
+                    pooled_client.finish(Ok(())).await;
+                    return Ok(resp.get_ref().num_bytes);
+                }
+            }
+        }
+    }
+
+    /// Sends one GetPage request over a pooled stream and waits for its response.
+    ///
+    /// A fresh single-slot channel is registered in the stream's `sender_hashmap` under the
+    /// request's `request_id`, the request is pushed onto the stream's sender, and the response
+    /// is then awaited on that channel. Any failure (no stream available, stream closed, send
+    /// failure, error response, response channel closed) is retried from the top, so the
+    /// function only returns once a response has been received successfully.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a request with the same `request_id` is already registered on the chosen stream.
+    pub async fn send_getpage_request(
+        &mut self,
+        req: GetPageRequest,
+    ) -> Result<GetPageResponse, tonic::Status> {
+        loop {
+            // Each attempt consumes a request, so work on a copy and keep `req` for retries.
+            let request = req.clone();
+            // The caller-supplied request ID is used as-is; no ID is allocated here.
+            let request_id = request.request_id;
+            let (response_sender, mut response_receiver) =
+                tokio::sync::mpsc::channel::<Result<proto::GetPageResponse, Status>>(1);
+
+            // Get a stream from the stream pool
+            let pool_clone = Arc::clone(&self.stream_pool);
+            let stream_returner = match pool_clone.get_client().await {
+                Ok(stream_ret) => stream_ret,
+                Err(_e) => {
+                    // retry
+                    continue;
+                }
+            };
+            let returner = stream_returner.channel();
+            let map = returner.sender_hashmap.clone();
+
+            // Insert the response sender into the hashmap. The lock guard is confined to this
+            // block so that it is released before the pooled stream is handed back below.
+            let registered = {
+                let mut map_guard = map.lock().await;
+                match map_guard.as_mut() {
+                    Some(map_inner) => {
+                        // request IDs must be unique
+                        if map_inner.insert(request_id, response_sender).is_some() {
+                            panic!("request with ID {request_id} is already in-flight");
+                        }
+                        true
+                    }
+                    None => false,
+                }
+            };
+            if !registered {
+                // The stream was closed. Hand it back to the pool like every other exit path
+                // does (otherwise it would stay checked out), then try a different one.
+                tracing::info!("stream was concurrently closed");
+                stream_returner
+                    .finish(Err(Status::new(Code::Unknown, "Stream closed")))
+                    .await;
+                continue;
+            }
+
+            let sent = returner
+                .sender
+                .send(proto::GetPageRequest::from(request))
+                .await;
+
+            if sent.is_err() {
+                // Remove the request from the map if sending failed
+                {
+                    if let Some(map_inner) = map.lock().await.as_mut() {
+                        map_inner.remove(&request_id);
+                    }
+                }
+                stream_returner
+                    .finish(Err(Status::new(Code::Unknown, "Failed to send request")))
+                    .await;
+                continue;
+            }
+
+            let response = response_receiver.recv().await;
+            match response {
+                Some(Ok(resp)) => {
+                    stream_returner.finish(Ok(())).await;
+                    return Ok(resp.into());
+                }
+                Some(Err(_status)) => {
+                    // An error status was delivered instead of a page: report and retry.
+                    stream_returner
+                        .finish(Err(Status::new(
+                            Code::Unknown,
+                            "Failed to receive response",
+                        )))
+                        .await;
+                }
+                None => {
+                    // The response channel was closed without delivering anything.
+                    stream_returner
+                        .finish(Err(Status::new(Code::Unknown, "Response channel closed")))
+                        .await;
+                }
+            }
+        }
+    }
+}
+
+struct ShardedRequestTrackerInner {
+    // Hashmap of shard index to RequestTracker
+    trackers: std::collections::HashMap<ShardIndex, RequestTracker>, // replaced wholesale by `update_shard_map`
+}
+pub struct ShardedRequestTracker {
+    inner: Arc<std::sync::Mutex<ShardedRequestTrackerInner>>, // per-shard trackers; locked for lookups and map updates
+    tcp_client_cache_options: ClientCacheOptions, // settings for the channel pools built in `update_shard_map`
+    stream_client_cache_options: ClientCacheOptions, // settings for the stream pools built in `update_shard_map`
+}
+
+//
+// TODO: Functions in the ShardedRequestTracker should be able to time out and
+// cancel a request. The request should return an error if it is cancelled.
+//
+
+/// The default tracker is exactly what `ShardedRequestTracker::new` produces.
+impl Default for ShardedRequestTracker {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ShardedRequestTracker {
+    /// Creates a tracker with an empty shard map and hard-coded default pool settings.
+    ///
+    /// Trackers for individual shards are installed later by `update_shard_map`.
+    pub fn new() -> Self {
+        // Defaults for the channel pools. These could be added to a config file.
+        let tcp_client_cache_options = ClientCacheOptions {
+            max_delay_ms: 0,
+            drop_rate: 0.0,
+            hang_rate: 0.0,
+            connect_timeout: Duration::from_secs(1),
+            connect_backoff: Duration::from_millis(100),
+            max_consumers: 8, // Streams per connection
+            error_threshold: 10,
+            max_idle_duration: Duration::from_secs(5),
+            max_total_connections: 8,
+        };
+        // Defaults for the stream pools. These could be added to a config file as well.
+        let stream_client_cache_options = ClientCacheOptions {
+            max_delay_ms: 0,
+            drop_rate: 0.0,
+            hang_rate: 0.0,
+            connect_timeout: Duration::from_secs(1),
+            connect_backoff: Duration::from_millis(100),
+            max_consumers: 64, // Requests per stream
+            error_threshold: 10,
+            max_idle_duration: Duration::from_secs(5),
+            max_total_connections: 64, // Total allowable number of streams
+        };
+
+        // No shards are known until the first shard map update.
+        let inner = ShardedRequestTrackerInner {
+            trackers: std::collections::HashMap::new(),
+        };
+        Self {
+            inner: Arc::new(std::sync::Mutex::new(inner)),
+            tcp_client_cache_options,
+            stream_client_cache_options,
+        }
+    }
+
+    /// Rebuilds the per-shard request trackers from `shard_urls` and swaps them in.
+    ///
+    /// For every shard this creates a channel pool that backs a stream pool (used for GetPage
+    /// requests) and a second channel pool for unary requests, all built from the shard's
+    /// endpoint URL. The previous tracker map is replaced in one step, under the lock, once
+    /// every new tracker has been built.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the tracker mutex is poisoned.
+    pub async fn update_shard_map(
+        &self,
+        shard_urls: std::collections::HashMap<ShardIndex, String>,
+        metrics: Option<Arc<PageserverClientAggregateMetrics>>,
+        tenant_id: String,
+        timeline_id: String,
+        auth_str: Option<&str>,
+    ) {
+        let mut trackers = std::collections::HashMap::new();
+        for (shard, endpoint_url) in shard_urls {
+            //
+            // Channel factory shared by both channel pools of this shard
+            //
+            let channel_fact: Arc<dyn PooledItemFactory<Channel> + Send + Sync> =
+                Arc::new(ChannelFactory::new(
+                    endpoint_url,
+                    self.tcp_client_cache_options.max_delay_ms,
+                    self.tcp_client_cache_options.drop_rate,
+                    self.tcp_client_cache_options.hang_rate,
+                ));
+
+            //
+            // Create a pool of streams for streaming get_page requests, backed by its own
+            // channel pool
+            //
+            let new_pool = ConnectionPool::new(
+                Arc::clone(&channel_fact),
+                self.tcp_client_cache_options.connect_timeout,
+                self.tcp_client_cache_options.connect_backoff,
+                self.tcp_client_cache_options.max_consumers,
+                self.tcp_client_cache_options.error_threshold,
+                self.tcp_client_cache_options.max_idle_duration,
+                self.tcp_client_cache_options.max_total_connections,
+                metrics.clone(),
+            );
+
+            let auth_interceptor =
+                AuthInterceptor::new(tenant_id.as_str(), timeline_id.as_str(), auth_str);
+
+            let stream_pool = ConnectionPool::<StreamReturner>::new(
+                Arc::new(StreamFactory::new(
+                    new_pool.clone(),
+                    auth_interceptor.clone(),
+                    // Streams must be opened for the shard this tracker serves, matching the
+                    // shard the unary requests of the same tracker use.
+                    shard,
+                )),
+                self.stream_client_cache_options.connect_timeout,
+                self.stream_client_cache_options.connect_backoff,
+                self.stream_client_cache_options.max_consumers,
+                self.stream_client_cache_options.error_threshold,
+                self.stream_client_cache_options.max_idle_duration,
+                self.stream_client_cache_options.max_total_connections,
+                metrics.clone(),
+            );
+
+            //
+            // Create a client pool for unary requests
+            //
+
+            let unary_pool = ConnectionPool::new(
+                Arc::clone(&channel_fact),
+                self.tcp_client_cache_options.connect_timeout,
+                self.tcp_client_cache_options.connect_backoff,
+                self.tcp_client_cache_options.max_consumers,
+                self.tcp_client_cache_options.error_threshold,
+                self.tcp_client_cache_options.max_idle_duration,
+                self.tcp_client_cache_options.max_total_connections,
+                metrics.clone(),
+            );
+            //
+            // Create a new RequestTracker for this shard
+            //
+            let new_tracker = RequestTracker::new(stream_pool, unary_pool, auth_interceptor, shard);
+            trackers.insert(shard, new_tracker);
+        }
+        // Swap in the complete new map in one step.
+        let mut inner = self.inner.lock().unwrap();
+        inner.trackers = trackers;
+    }
+
+    /// Fetches a page through the tracker registered for the unsharded shard index.
+    ///
+    /// Shard routing is not implemented yet (see the TODO below). A failed request is
+    /// reported as an `Unknown` status whose message embeds the underlying error.
+    pub async fn get_page(&self, req: GetPageRequest) -> Result<GetPageResponse, tonic::Status> {
+        let shard_index = ShardIndex::unsharded(); // TODO!
+        let mut tracker = self.lookup_tracker_for_shard(shard_index)?;
+
+        tracker
+            .send_getpage_request(req)
+            .await
+            .map_err(|e| tonic::Status::unknown(format!("Failed to get page: {e}")))
+    }
+
+    /// Forwards a database-size request to the tracker of the unsharded shard index and
+    /// returns its result unchanged.
+    pub async fn process_get_dbsize_request(
+        &self,
+        request: GetDbSizeRequest,
+    ) -> Result<u64, tonic::Status> {
+        // Current sharding model assumes that all metadata is present only at shard 0.
+        let tracker = self.lookup_tracker_for_shard(ShardIndex::unsharded())?;
+        tracker.send_process_get_dbsize_request(request).await
+    }
+
+    /// Forwards a relation-size request to the tracker of the unsharded shard index and
+    /// returns its result unchanged.
+    pub async fn process_get_rel_size_request(
+        &self,
+        request: GetRelSizeRequest,
+    ) -> Result<u32, tonic::Status> {
+        // Current sharding model assumes that all metadata is present only at shard 0.
+        let tracker = self.lookup_tracker_for_shard(ShardIndex::unsharded())?;
+        tracker.send_process_get_rel_size_request(request).await
+    }
+
+    /// Forwards a relation-existence check to the tracker of the unsharded shard index and
+    /// returns its result unchanged.
+    pub async fn process_check_rel_exists_request(
+        &self,
+        request: CheckRelExistsRequest,
+    ) -> Result<bool, tonic::Status> {
+        // Current sharding model assumes that all metadata is present only at shard 0.
+        let tracker = self.lookup_tracker_for_shard(ShardIndex::unsharded())?;
+        tracker.send_process_check_rel_exists_request(request).await
+    }
+
+    /// Returns a clone of the tracker registered for `shard_index`, or a `NotFound` status
+    /// when the shard map has no entry for it.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the tracker mutex is poisoned.
+    #[allow(clippy::result_large_err)]
+    fn lookup_tracker_for_shard(
+        &self,
+        shard_index: ShardIndex,
+    ) -> Result<RequestTracker, tonic::Status> {
+        self.inner
+            .lock()
+            .unwrap()
+            .trackers
+            .get(&shard_index)
+            .cloned()
+            .ok_or_else(|| tonic::Status::not_found(format!("Shard {shard_index} not found")))
+    }
+}
--- a/pageserver/page_api/src/client.rs
+++ b/pageserver/page_api/src/client.rs
@@ -1,23 +1,153 @@
-use anyhow::Result;
+use anyhow::anyhow;
 use futures::{Stream, StreamExt as _, TryStreamExt as _};
 use tokio::io::AsyncRead;
 use tokio_util::io::StreamReader;
+use tonic::codec::CompressionEncoding;
 use tonic::metadata::AsciiMetadataValue;
-use tonic::metadata::errors::InvalidMetadataValue;
-use tonic::transport::Channel;
-use tonic::{Request, Streaming};
+use tonic::service::Interceptor;
+use tonic::service::interceptor::InterceptedService;
+use tonic::transport::{Channel, Endpoint};

-use utils::id::TenantId;
-use utils::id::TimelineId;
+use utils::id::{TenantId, TimelineId};
 use utils::shard::ShardIndex;

-use crate::model;
+use crate::model::*;
 use crate::proto;

-///
-/// AuthInterceptor adds tenant, timeline, and auth header to the channel. These
-/// headers are required at the pageserver.
-///
+/// A basic Pageserver gRPC client, for a single tenant shard. This API uses native Rust domain
+/// types from `model` rather than generated Protobuf types.
+pub struct Client {
+    inner: proto::PageServiceClient<InterceptedService<Channel, AuthInterceptor>>, // generated client; every call passes through `AuthInterceptor`
+}
+
+impl Client {
+    /// Dials the given gRPC endpoint and builds a client on top of the resulting channel.
+    ///
+    /// Fails if `endpoint` cannot be converted into an `Endpoint`, if the connection cannot be
+    /// established, or if `Client::new` fails.
+    pub async fn connect<E>(
+        endpoint: E,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_id: ShardIndex,
+        auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
+    ) -> anyhow::Result<Self>
+    where
+        E: TryInto<Endpoint> + Send + Sync + 'static,
+        <E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
+    {
+        let channel = TryInto::<Endpoint>::try_into(endpoint)
+            .map_err(|err| anyhow!("invalid endpoint: {err}"))?
+            .connect()
+            .await?;
+        Self::new(
+            channel,
+            tenant_id,
+            timeline_id,
+            shard_id,
+            auth_token,
+            compression,
+        )
+    }
+
+    /// Wraps an existing gRPC channel in a client.
+    ///
+    /// Requests carry the metadata built by `AuthInterceptor::new` from the given IDs and
+    /// token. When `compression` is set, the encoding is both accepted and used for sending.
+    pub fn new(
+        channel: Channel,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_id: ShardIndex,
+        auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
+    ) -> anyhow::Result<Self> {
+        let auth = AuthInterceptor::new(tenant_id, timeline_id, shard_id, auth_token)?;
+        let client = proto::PageServiceClient::with_interceptor(channel, auth);
+
+        // TODO: benchmark this (including network latency).
+        let inner = match compression {
+            Some(encoding) => client
+                .accept_compressed(encoding)
+                .send_compressed(encoding),
+            None => client,
+        };
+
+        Ok(Self { inner })
+    }
+
+    /// Returns whether a relation exists.
+    pub async fn check_rel_exists(
+        &mut self,
+        req: CheckRelExistsRequest,
+    ) -> tonic::Result<CheckRelExistsResponse> {
+        let proto_req = proto::CheckRelExistsRequest::from(req);
+        let response = self.inner.check_rel_exists(proto_req).await?;
+        Ok(response.into_inner().into())
+    }
+
+    /// Fetches a base backup, exposed as an `AsyncRead` over the streamed chunks.
+    pub async fn get_base_backup(
+        &mut self,
+        req: GetBaseBackupRequest,
+    ) -> tonic::Result<impl AsyncRead + use<>> {
+        let proto_req = proto::GetBaseBackupRequest::from(req);
+        let chunks = self.inner.get_base_backup(proto_req).await?.into_inner();
+        // Stream errors are surfaced to the reader as I/O errors.
+        let byte_stream = chunks
+            .map_ok(|resp| resp.chunk)
+            .map_err(std::io::Error::other);
+        Ok(StreamReader::new(byte_stream))
+    }
+
+    /// Returns the total size of a database, as # of bytes.
+    pub async fn get_db_size(&mut self, req: GetDbSizeRequest) -> tonic::Result<GetDbSizeResponse> {
+        let proto_req = proto::GetDbSizeRequest::from(req);
+        let response = self.inner.get_db_size(proto_req).await?;
+        Ok(response.into_inner().into())
+    }
+
+    /// Fetches pages.
+    ///
+    /// This is a bidirectional streaming RPC, chosen for performance. Per-request errors are
+    /// typically reported through the response's status_code rather than as a tonic::Status
+    /// error, because the latter would tear down the entire stream.
+    pub async fn get_pages(
+        &mut self,
+        reqs: impl Stream<Item = GetPageRequest> + Send + 'static,
+    ) -> tonic::Result<impl Stream<Item = tonic::Result<GetPageResponse>> + Send + 'static> {
+        let proto_reqs = reqs.map(proto::GetPageRequest::from);
+        let response = self.inner.get_pages(proto_reqs).await?;
+        Ok(response.into_inner().map_ok(GetPageResponse::from))
+    }
+
+    /// Returns the size of a relation, as # of blocks.
+    pub async fn get_rel_size(
+        &mut self,
+        req: GetRelSizeRequest,
+    ) -> tonic::Result<GetRelSizeResponse> {
+        let proto_req = proto::GetRelSizeRequest::from(req);
+        let response = self.inner.get_rel_size(proto_req).await?;
+        Ok(response.into_inner().into())
+    }
+
+    /// Fetches an SLRU segment.
+    pub async fn get_slru_segment(
+        &mut self,
+        req: GetSlruSegmentRequest,
+    ) -> tonic::Result<GetSlruSegmentResponse> {
+        let proto_req = proto::GetSlruSegmentRequest::from(req);
+        let response = self.inner.get_slru_segment(proto_req).await?;
+        Ok(response.into_inner().try_into()?)
+    }
+
+    /// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't
+    /// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards.
+    ///
+    /// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be
+    /// acquired because the LSN has already been garbage collected.
+    pub async fn lease_lsn(&mut self, req: LeaseLsnRequest) -> tonic::Result<LeaseLsnResponse> {
+        let proto_req = proto::LeaseLsnRequest::from(req);
+        let response = self.inner.lease_lsn(proto_req).await?;
+        Ok(response.into_inner().try_into()?)
+    }
+}
+
+/// Adds authentication metadata to gRPC requests.
 #[derive(Clone)]
 struct AuthInterceptor {
    tenant_id: AsciiMetadataValue,
@@ -30,174 +160,29 @@ impl AuthInterceptor {
    fn new(
        tenant_id: TenantId,
        timeline_id: TimelineId,
-        auth_token: Option<String>,
        shard_id: ShardIndex,
-    ) -> Result<Self, InvalidMetadataValue> {
-        let tenant_ascii: AsciiMetadataValue = tenant_id.to_string().try_into()?;
-        let timeline_ascii: AsciiMetadataValue = timeline_id.to_string().try_into()?;
-        let shard_ascii: AsciiMetadataValue = shard_id.to_string().try_into()?;
-
-        let auth_header: Option<AsciiMetadataValue> = match auth_token {
-            Some(token) => Some(format!("Bearer {token}").try_into()?),
-            None => None,
-        };
-
+        auth_token: Option<String>,
+    ) -> anyhow::Result<Self> {
        Ok(Self {
-            tenant_id: tenant_ascii,
-            shard_id: shard_ascii,
-            timeline_id: timeline_ascii,
-            auth_header,
+            tenant_id: tenant_id.to_string().try_into()?,
+            timeline_id: timeline_id.to_string().try_into()?,
+            shard_id: shard_id.to_string().try_into()?,
+            auth_header: auth_token
+                .map(|token| format!("Bearer {token}").try_into())
+                .transpose()?,
        })
    }
 }

-impl tonic::service::Interceptor for AuthInterceptor {
-    fn call(&mut self, mut req: tonic::Request<()>) -> Result<tonic::Request<()>, tonic::Status> {
-        req.metadata_mut()
-            .insert("neon-tenant-id", self.tenant_id.clone());
-        req.metadata_mut()
-            .insert("neon-shard-id", self.shard_id.clone());
-        req.metadata_mut()
-            .insert("neon-timeline-id", self.timeline_id.clone());
-        if let Some(auth_header) = &self.auth_header {
-            req.metadata_mut()
-                .insert("authorization", auth_header.clone());
+impl Interceptor for AuthInterceptor {
+    fn call(&mut self, mut req: tonic::Request<()>) -> tonic::Result<tonic::Request<()>> {
+        let metadata = req.metadata_mut();
+        metadata.insert("neon-tenant-id", self.tenant_id.clone());
+        metadata.insert("neon-timeline-id", self.timeline_id.clone());
+        metadata.insert("neon-shard-id", self.shard_id.clone());
+        if let Some(ref auth_header) = self.auth_header {
+            metadata.insert("authorization", auth_header.clone());
        }
        Ok(req)
    }
 }
-
-#[derive(Clone)]
-pub struct Client {
-    client: proto::PageServiceClient<
-        tonic::service::interceptor::InterceptedService<Channel, AuthInterceptor>,
-    >,
-}
-
-impl Client {
-    pub async fn new<T: TryInto<tonic::transport::Endpoint> + Send + Sync + 'static>(
-        into_endpoint: T,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        shard_id: ShardIndex,
-        auth_header: Option<String>,
-        compression: Option<tonic::codec::CompressionEncoding>,
-    ) -> anyhow::Result<Self> {
-        let endpoint: tonic::transport::Endpoint = into_endpoint
-            .try_into()
-            .map_err(|_e| anyhow::anyhow!("failed to convert endpoint"))?;
-        let channel = endpoint.connect().await?;
-        let auth = AuthInterceptor::new(tenant_id, timeline_id, auth_header, shard_id)
-            .map_err(|e| anyhow::anyhow!(e.to_string()))?;
-        let mut client = proto::PageServiceClient::with_interceptor(channel, auth);
-
-        if let Some(compression) = compression {
-            // TODO: benchmark this (including network latency).
-            client = client
-                .accept_compressed(compression)
-                .send_compressed(compression);
-        }
-
-        Ok(Self { client })
-    }
-
-    /// Returns whether a relation exists.
-    pub async fn check_rel_exists(
-        &mut self,
-        req: model::CheckRelExistsRequest,
-    ) -> Result<model::CheckRelExistsResponse, tonic::Status> {
-        let proto_req = proto::CheckRelExistsRequest::from(req);
-
-        let response = self.client.check_rel_exists(proto_req).await?;
-
-        let proto_resp = response.into_inner();
-        Ok(proto_resp.into())
-    }
-
-    /// Fetches a base backup.
-    pub async fn get_base_backup(
-        &mut self,
-        req: model::GetBaseBackupRequest,
-    ) -> Result<impl AsyncRead + use<>, tonic::Status> {
-        let req = proto::GetBaseBackupRequest::from(req);
-        let chunks = self.client.get_base_backup(req).await?.into_inner();
-        let reader = StreamReader::new(
-            chunks
-                .map_ok(|resp| resp.chunk)
-                .map_err(std::io::Error::other),
-        );
-        Ok(reader)
-    }
-
-    /// Returns the total size of a database, as # of bytes.
-    pub async fn get_db_size(
-        &mut self,
-        req: model::GetDbSizeRequest,
-    ) -> Result<u64, tonic::Status> {
-        let proto_req = proto::GetDbSizeRequest::from(req);
-
-        let response = self.client.get_db_size(proto_req).await?;
-        Ok(response.into_inner().into())
-    }
-
-    /// Fetches pages.
-    ///
-    /// This is implemented as a bidirectional streaming RPC for performance.
-    /// Per-request errors are often returned as status_code instead of errors,
-    /// to avoid tearing down the entire stream via tonic::Status.
-    pub async fn get_pages<ReqSt>(
-        &mut self,
-        inbound: ReqSt,
-    ) -> Result<
-        impl Stream<Item = Result<model::GetPageResponse, tonic::Status>> + Send + 'static,
-        tonic::Status,
-    >
-    where
-        ReqSt: Stream<Item = model::GetPageRequest> + Send + 'static,
-    {
-        let outbound_proto = inbound.map(|domain_req| domain_req.into());
-
-        let req_new = Request::new(outbound_proto);
-
-        let response_stream: Streaming<proto::GetPageResponse> =
-            self.client.get_pages(req_new).await?.into_inner();
-
-        let domain_stream = response_stream.map_ok(model::GetPageResponse::from);
-
-        Ok(domain_stream)
-    }
-
-    /// Returns the size of a relation, as # of blocks.
-    pub async fn get_rel_size(
-        &mut self,
-        req: model::GetRelSizeRequest,
-    ) -> Result<model::GetRelSizeResponse, tonic::Status> {
-        let proto_req = proto::GetRelSizeRequest::from(req);
-        let response = self.client.get_rel_size(proto_req).await?;
-        let proto_resp = response.into_inner();
-        Ok(proto_resp.into())
-    }
-
-    /// Fetches an SLRU segment.
-    pub async fn get_slru_segment(
-        &mut self,
-        req: model::GetSlruSegmentRequest,
-    ) -> Result<model::GetSlruSegmentResponse, tonic::Status> {
-        let proto_req = proto::GetSlruSegmentRequest::from(req);
-        let response = self.client.get_slru_segment(proto_req).await?;
-        Ok(response.into_inner().try_into()?)
-    }
-
-    /// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't
-    /// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards.
-    ///
-    /// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be
-    /// acquired because the LSN has already been garbage collected.
-    pub async fn lease_lsn(
-        &mut self,
-        req: model::LeaseLsnRequest,
-    ) -> Result<model::LeaseLsnResponse, tonic::Status> {
-        let req = proto::LeaseLsnRequest::from(req);
-        Ok(self.client.lease_lsn(req).await?.into_inner().try_into()?)
-    }
-}
--- a/pageserver/pagebench/Cargo.toml
+++ b/pageserver/pagebench/Cargo.toml
@@ -24,10 +24,14 @@ tracing.workspace = true
 tokio.workspace = true
 tokio-stream.workspace = true
 tokio-util.workspace = true
+axum.workspace = true
+http.workspace = true
+metrics.workspace = true
 tonic.workspace = true
 url.workspace = true

 pageserver_client.workspace = true
+pageserver_client_grpc.workspace = true
 pageserver_api.workspace = true
 pageserver_page_api.workspace = true
 utils = { path = "../../libs/utils/" }
--- a/pageserver/pagebench/src/cmd/basebackup.rs
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -326,7 +326,7 @@ impl GrpcClient {
        ttid: TenantTimelineId,
        compression: bool,
    ) -> anyhow::Result<Self> {
-        let inner = page_api::Client::new(
+        let inner = page_api::Client::connect(
            connstring.to_string(),
            ttid.tenant_id,
            ttid.timeline_id,
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -26,12 +26,27 @@ use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;
 use utils::shard::ShardIndex;

+use axum::Router;
+use axum::body::Body;
+use axum::extract::State;
+use axum::response::Response;
+
+use http::StatusCode;
+use http::header::CONTENT_TYPE;
+
+use metrics::proto::MetricFamily;
+use metrics::{Encoder, TextEncoder};
+
 use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
 use crate::util::{request_stats, tokio_thread_local_stats};

 /// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
 #[derive(clap::Parser)]
 pub(crate) struct Args {
+    #[clap(long, default_value = "false")]
+    grpc: bool,
+    #[clap(long, default_value = "false")]
+    grpc_stream: bool,
    #[clap(long, default_value = "http://localhost:9898")]
    mgmt_api_endpoint: String,
    /// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
@@ -72,6 +87,9 @@ pub(crate) struct Args {
    #[clap(long)]
    set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,

+    #[clap(long)]
+    only_relnode: Option<u32>,
+
    /// Queue depth generated in each client.
    #[clap(long, default_value = "1")]
    queue_depth: NonZeroUsize,
@@ -86,10 +104,31 @@ pub(crate) struct Args {
    #[clap(long, default_value = "1")]
    batch_size: NonZeroUsize,

-    #[clap(long)]
-    only_relnode: Option<u32>,
-
    targets: Option<Vec<TenantTimelineId>>,
+
+    #[clap(long, default_value = "100")]
+    pool_max_consumers: NonZeroUsize,
+
+    #[clap(long, default_value = "5")]
+    pool_error_threshold: NonZeroUsize,
+
+    #[clap(long, default_value = "5000")]
+    pool_connect_timeout: NonZeroUsize,
+
+    #[clap(long, default_value = "1000")]
+    pool_connect_backoff: NonZeroUsize,
+
+    #[clap(long, default_value = "60000")]
+    pool_max_idle_duration: NonZeroUsize,
+
+    #[clap(long, default_value = "0")]
+    max_delay_ms: usize,
+
+    #[clap(long, default_value = "0")]
+    percent_drops: usize,
+
+    #[clap(long, default_value = "0")]
+    percent_hangs: usize,
 }

 /// State shared by all clients
@@ -146,6 +185,37 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
        main_impl(args, thread_local_stats)
    })
 }
+/// Axum handler for the `/metrics` route.
+///
+/// Collects the metric families from the shared
+/// `PageserverClientAggregateMetrics` state, drops families that contain no
+/// metrics, and renders the rest with `TextEncoder`.
+///
+/// Responds with `200 OK` and the encoder's own content type on success. If
+/// encoding fails, responds with `500 Internal Server Error` and the error
+/// message as a plain-text body.
+async fn get_metrics(
+    State(state): State<Arc<pageserver_client_grpc::PageserverClientAggregateMetrics>>,
+) -> Response {
+    let metrics = state.collect();
+
+    info!("metrics: {metrics:?}");
+    // When we call TextEncoder::encode() below, it will immediately return an
+    // error if a metric family has no metrics, so we need to preemptively
+    // filter out metric families with no metrics.
+    let metrics = metrics
+        .into_iter()
+        .filter(|m| !m.get_metric().is_empty())
+        .collect::<Vec<MetricFamily>>();
+
+    let encoder = TextEncoder::new();
+    let mut buffer = vec![];
+
+    if let Err(e) = encoder.encode(&metrics, &mut buffer) {
+        // The body is just the error's Display output, so label it as plain
+        // text. ("application/text" is not a registered media type.)
+        Response::builder()
+            .status(StatusCode::INTERNAL_SERVER_ERROR)
+            .header(CONTENT_TYPE, "text/plain; charset=utf-8")
+            .body(Body::from(e.to_string()))
+            .unwrap()
+    } else {
+        Response::builder()
+            .status(StatusCode::OK)
+            .header(CONTENT_TYPE, encoder.format_type())
+            .body(Body::from(buffer))
+            .unwrap()
+    }
+}

 async fn main_impl(
    args: Args,
@@ -153,6 +223,24 @@ async fn main_impl(
 ) -> anyhow::Result<()> {
    let args: &'static Args = Box::leak(Box::new(args));

+    // Aggregate metrics for the pageserver gRPC clients, served by the /metrics endpoint below
+    let client_metrics = Arc::new(pageserver_client_grpc::PageserverClientAggregateMetrics::new());
+
+    use axum::routing::get;
+    let app = Router::new()
+        .route("/metrics", get(get_metrics))
+        .with_state(client_metrics.clone());
+
+    // TODO: make configurable. Or listen on unix domain socket?
+    let listener = tokio::net::TcpListener::bind("127.0.0.1:9090")
+        .await
+        .unwrap();
+
+    tokio::spawn(async {
+        tracing::info!("metrics listener spawned");
+        axum::serve(listener, app).await.unwrap()
+    });
+
    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
        reqwest::Client::new(), // TODO: support ssl_ca_file for https APIs in pagebench.
        args.mgmt_api_endpoint.clone(),
@@ -311,6 +399,7 @@ async fn main_impl(
    let rps_period = args
        .per_client_rate
        .map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
+
    let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
        let ss = shared_state.clone();
        let cancel = cancel.clone();
@@ -625,7 +714,7 @@ impl GrpcClient {
        ttid: TenantTimelineId,
        compression: bool,
    ) -> anyhow::Result<Self> {
-        let mut client = page_api::Client::new(
+        let mut client = page_api::Client::connect(
            connstring.to_string(),
            ttid.tenant_id,
            ttid.timeline_id,
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -5,6 +5,7 @@ MODULE_big = neon
 OBJS = \
 	$(WIN32RES) \
 	communicator.o \
+	communicator_new.o \
 	extension_server.o \
 	file_cache.o \
 	hll.o \
@@ -22,12 +23,18 @@ OBJS = \
 	walproposer.o \
 	walproposer_pg.o \
 	neon_ddl_handler.o \
-	walsender_hooks.o
+	walsender_hooks.o \
+	$(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a

 PG_CPPFLAGS = -I$(libpq_srcdir)
 SHLIB_LINK_INTERNAL = $(libpq)
 SHLIB_LINK = -lcurl

+UNAME_S := $(shell uname -s)
+ifeq ($(UNAME_S), Darwin)
+    SHLIB_LINK += -framework Security -framework CoreFoundation -framework SystemConfiguration
+endif
+
 EXTENSION = neon
 DATA = \
 	neon--1.0.sql \
@@ -54,6 +61,17 @@ WALPROP_OBJS = \
 	neon_utils.o \
 	walproposer_compat.o

+# libcommunicator.a is built by cargo from the Rust sources under communicator/
+# subdirectory. `cargo build` also generates communicator_bindings.h.
+neon.o: communicator/communicator_bindings.h
+
+$(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h &:
+	(cd $(srcdir)/communicator && cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE))
+
+# Force `cargo build` every time. Some of the Rust sources might have
+# changed.
+.PHONY: $(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h
+
 .PHONY: walproposer-lib
 walproposer-lib: CPPFLAGS += -DWALPROPOSER_LIB
 walproposer-lib: libwalproposer.a;
--- a/pgxn/neon/communicator/Cargo.lock
+++ b/pgxn/neon/communicator/Cargo.lock
@@ -0,0 +1,372 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "addr2line"
+version = "0.24.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1"
+dependencies = [
+ "gimli",
+]
+
+[[package]]
+name = "adler2"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
+
+[[package]]
+name = "backtrace"
+version = "0.3.74"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a"
+dependencies = [
+ "addr2line",
+ "cfg-if",
+ "libc",
+ "miniz_oxide",
+ "object",
+ "rustc-demangle",
+ "windows-targets",
+]
+
+[[package]]
+name = "base64"
+version = "0.22.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
+
+[[package]]
+name = "bytes"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "communicator"
+version = "0.1.0"
+dependencies = [
+ "tonic",
+]
+
+[[package]]
+name = "fnv"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+
+[[package]]
+name = "futures-core"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
+
+[[package]]
+name = "gimli"
+version = "0.31.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
+
+[[package]]
+name = "http"
+version = "1.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565"
+dependencies = [
+ "bytes",
+ "fnv",
+ "itoa",
+]
+
+[[package]]
+name = "http-body"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
+dependencies = [
+ "bytes",
+ "http",
+]
+
+[[package]]
+name = "http-body-util"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "http",
+ "http-body",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
+
+[[package]]
+name = "libc"
+version = "0.2.171"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6"
+
+[[package]]
+name = "memchr"
+version = "2.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
+
+[[package]]
+name = "miniz_oxide"
+version = "0.8.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff70ce3e48ae43fa075863cef62e8b43b71a4f2382229920e0df362592919430"
+dependencies = [
+ "adler2",
+]
+
+[[package]]
+name = "object"
+version = "0.36.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.21.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
+
+[[package]]
+name = "percent-encoding"
+version = "2.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
+
+[[package]]
+name = "pin-project"
+version = "1.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a"
+dependencies = [
+ "pin-project-internal",
+]
+
+[[package]]
+name = "pin-project-internal"
+version = "1.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "pin-project-lite"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.94"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "rustc-demangle"
+version = "0.1.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
+
+[[package]]
+name = "syn"
+version = "2.0.100"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "tokio"
+version = "1.44.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6b88822cbe49de4185e3a4cbf8321dd487cf5fe0c5c65695fef6346371e9c48"
+dependencies = [
+ "backtrace",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "tokio-stream"
+version = "0.1.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047"
+dependencies = [
+ "futures-core",
+ "pin-project-lite",
+ "tokio",
+]
+
+[[package]]
+name = "tonic"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85839f0b32fd242bb3209262371d07feda6d780d16ee9d2bc88581b89da1549b"
+dependencies = [
+ "base64",
+ "bytes",
+ "http",
+ "http-body",
+ "http-body-util",
+ "percent-encoding",
+ "pin-project",
+ "tokio-stream",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "tower-layer"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
+
+[[package]]
+name = "tower-service"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
+
+[[package]]
+name = "tracing"
+version = "0.1.41"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
+dependencies = [
+ "pin-project-lite",
+ "tracing-attributes",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-attributes"
+version = "0.1.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tracing-core"
+version = "0.1.33"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
+dependencies = [
+ "once_cell",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
--- a/pgxn/neon/communicator/Cargo.toml
+++ b/pgxn/neon/communicator/Cargo.toml
@@ -0,0 +1,39 @@
+[package]
+name = "communicator"
+version = "0.1.0"
+edition = "2024"
+
+[features]
+testing = []
+
+[lib]
+crate-type = ["staticlib"]
+
+[dependencies]
+axum.workspace = true
+bytes.workspace = true
+clashmap.workspace = true
+http.workspace = true
+libc.workspace = true
+nix.workspace = true
+atomic_enum = "0.3.0"
+prometheus.workspace = true
+prost.workspace = true
+tonic = { version = "0.12.0", default-features = false, features=["codegen", "prost", "transport"] }
+tokio = { version = "1.43.1", features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] }
+tokio-pipe = { version = "0.2.12" }
+thiserror.workspace = true
+tracing.workspace = true
+tracing-subscriber.workspace = true
+
+metrics.workspace = true
+uring-common = { workspace = true, features = ["bytes"] }
+
+pageserver_client_grpc.workspace = true
+pageserver_page_api.workspace = true
+
+neon-shmem.workspace = true
+utils.workspace = true
+
+[build-dependencies]
+cbindgen.workspace = true
--- a/pgxn/neon/communicator/README.md
+++ b/pgxn/neon/communicator/README.md
@@ -0,0 +1,123 @@
+# Communicator
+
+This package provides the so-called "compute-pageserver communicator",
+or just "communicator" in short. It runs in a PostgreSQL server, as
+part of the neon extension, and handles the communication with the
+pageservers. On the PostgreSQL side, the glue code in pgxn/neon/ uses
+the communicator to implement the PostgreSQL Storage Manager (SMGR)
+interface.
+
+## Design criteria
+
+- Low latency
+- Saturate a 10 Gbit / s network interface without becoming a bottleneck
+
+## Source code view
+
+pgxn/neon/communicator_new.c
+	Contains the glue that interacts with PostgreSQL code and the Rust
+	communicator code.
+
+pgxn/neon/communicator/src/backend_interface.rs
+	The entry point for calls from each backend.
+
+pgxn/neon/communicator/src/init.rs
+	Initialization at server startup
+
+pgxn/neon/communicator/src/worker_process/
+    Worker process main loop and glue code
+
+At compilation time, pgxn/neon/communicator/ produces a static
+library, libcommunicator.a. It is linked to the neon.so extension
+library.
+
+The real networking code, which is independent of PostgreSQL, is in
+the pageserver/client_grpc crate.
+
+## Process view
+
+The communicator runs in a dedicated background worker process, the
+"communicator process". The communicator uses a multi-threaded Tokio
+runtime to execute the IO requests. So the communicator process has
+multiple threads running. That's unusual for Postgres processes and
+care must be taken to make that work.
+
+### Backend <-> worker communication
+
+Each backend has a number of I/O request slots in shared memory. The
+slots are statically allocated for each backend, and must not be
+accessed by other backends. The worker process reads requests from the
+shared memory slots, and writes responses back to the slots.
+
+To submit an IO request, first pick one of your backend's free slots,
+and write the details of the IO request in the slot. Finally, update
+the 'state' field of the slot to Submitted. That informs the worker
+process that it can start processing the request. Once the state has
+been set to Submitted, the backend *must not* access the slot anymore,
+until the worker process sets its state to 'Completed'. In other
+words, each slot is owned by either the backend or the worker process
+at all times, and the 'state' field indicates who has ownership at the
+moment.
+
+To inform the worker process that a request slot has a pending IO
+request, there's a pipe shared by the worker process and all backend
+processes. After you have changed the slot's state to Submitted, write
+the index of the request slot to the pipe. This wakes up the worker
+process.
+
+(Note that the pipe is just used for wakeups, but the worker process
+is free to pick up Submitted IO requests even without receiving the
+wakeup. As of this writing, it doesn't do that, but it might be useful
+in the future to reduce latency even further, for example.)
+
+When the worker process has completed processing the request, it
+writes the result back in the request slot. A GetPage request can also
+contain a pointer to a buffer in the shared buffer cache. In that case,
+the worker process writes the resulting page contents directly to the
+buffer, and just a result code in the request slot. It then updates
+the 'state' field to Completed, which passes the ownership back to
+the originating backend. Finally, it signals the process Latch of the
+originating backend, waking it up.
+
+### Differences between PostgreSQL v16, v17 and v18
+
+PostgreSQL v18 introduced the new AIO mechanism. The PostgreSQL AIO
+mechanism is very similar to the one described in the previous
+section, for the communication between AIO worker processes and
+backends. With our communicator, the AIO worker processes are not
+used, but we use the same PgAioHandle request slots as in upstream.
+For Neon-specific IO requests like GetDbSize, a neon request slot is
+used. But for the actual IO requests, the request slot merely contains
+a pointer to the PgAioHandle slot. The worker process updates the
+status of that, calls the IO callbacks upon completion, etc., just like
+the upstream AIO worker processes do.
+
+## Sequence diagram
+
+                      neon
+    PostgreSQL     extension       backend_interface.rs  worker_process.rs    processor    tonic
+       |               .                    .                   .                 .
+	   | smgr_read()   .                    .                   .                 .
+	   +-------------> +                    .                   .                 .
+	   .               |                    .                   .                 .
+	   .               |  rcommunicator_    .                   .                 .
+	   .               | get_page_at_lsn    .                   .                 .
+	   .               +------------------> +                   .                 .
+                                            |                   .                 .
+                                            | write request to  .                 .                 .
+                                            | slot              .                 .
+                                            |                   .                 .
+                                            |                   .                 .
+											| submit_request()  .                 .
+											+-----------------> +                 .
+											|                   |                 .
+											|					| db_size_request .               .
+																+---------------->.
+																                  . TODO
+
+
+
+### Compute <-> pageserver protocol
+
+The protocol between Compute and the pageserver is based on gRPC. See `protos/`.
+
--- a/pgxn/neon/communicator/build.rs
+++ b/pgxn/neon/communicator/build.rs
@@ -0,0 +1,22 @@
+use std::env;
+
+/// Build script entry point: runs cbindgen over this crate (located through
+/// `CARGO_MANIFEST_DIR`) and writes the generated C bindings to
+/// `communicator_bindings.h`.
+///
+/// A Rust syntax error reported by cbindgen is only logged, so that the build
+/// carries on and rustc reports the error itself. Any other cbindgen failure
+/// panics.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
+
+    match cbindgen::generate(manifest_dir) {
+        Ok(header) => {
+            header.write_to_file("communicator_bindings.h");
+        }
+        Err(cbindgen::Error::ParseSyntaxError { .. }) => {
+            // There was a syntax error in the Rust sources. Don't panic here: let the
+            // build continue so that the Rust compiler hits the same error, because
+            // its error message is better than cbindgen's.
+            eprintln!("Generating C bindings failed because of a Rust syntax error");
+        }
+        Err(other) => panic!("Unable to generate C bindings: {other:?}"),
+    }
+
+    Ok(())
+}
--- a/pgxn/neon/communicator/cbindgen.toml
+++ b/pgxn/neon/communicator/cbindgen.toml
@@ -0,0 +1,4 @@
+language = "C"
+
+[enum]
+prefix_with_name = true
--- a/pgxn/neon/communicator/src/backend_comms.rs
+++ b/pgxn/neon/communicator/src/backend_comms.rs
@@ -0,0 +1,204 @@
+//! This module implements a request/response "slot" for submitting requests from backends
+//! to the communicator process.
+//!
+//! NB: The "backend" side of this code runs in Postgres backend processes,
+//! which means that it is not safe to use the 'tracing' crate for logging, nor
+//! to launch threads or use tokio tasks.
+use std::cell::UnsafeCell;
+use std::sync::atomic::fence;
+use std::sync::atomic::{AtomicI32, Ordering};
+
+use crate::neon_request::{NeonIORequest, NeonIOResult};
+
+use atomic_enum::atomic_enum;
+
+/// One request/response slot. Each backend has its own set of slots that it uses.
+///
+/// This is the moral equivalent of PgAioHandle for Postgres AIO requests.
+/// Like PgAioHandle, try to keep this small.
+///
+/// There is an array of these in shared memory. Therefore, this must be Sized.
+///
+/// ## Lifecycle of a request
+///
+/// The slot is always owned by either the backend process or the communicator
+/// process, depending on the 'state'. Only the owning process is allowed to
+/// read or modify the slot, except for reading the 'state' itself to check who
+/// owns it.
+///
+/// A slot begins in the Idle state, where it is owned by the backend process.
+/// To submit a request, the backend process fills the slot with the request
+/// data, and changes it to the Submitted state. After changing the state, the
+/// slot is owned by the communicator process, and the backend is not allowed
+/// to access it until the communicator process marks it as Completed.
+///
+/// When the communicator process sees that the slot is in Submitted state, it
+/// starts to process the request. After processing the request, it stores the
+/// result in the slot, and changes the state to Completed. It is now owned by
+/// the backend process again, which may now read the result, and reuse the
+/// slot for a new request.
+///
+/// For correctness of the above protocol, we really only need two states:
+/// "owned by backend" and "owned by communicator process". But to help with
+/// debugging, there are a few more states. When the backend starts to fill in
+/// the request details in the slot, it first sets the state from Idle to
+/// Filling, and when it's done with that, from Filling to Submitted. In the
+/// Filling state, the slot is still owned by the backend. Similarly, when the
+/// communicator process starts to process a request, it sets it to Processing
+/// state first, but the slot is still owned by the communicator process.
+///
+/// This struct doesn't handle waking up the communicator process when a request
+/// has been submitted or when a response is ready. We only store the 'owner_procno'
+/// which can be used for waking up the backend on completion, but the wakeups are
+/// performed elsewhere.
+pub struct NeonIOHandle {
+    /// Current lifecycle state of the slot; determines which process owns it.
+    /// Similar to PgAioHandleState.
+    state: AtomicNeonIOHandleState,
+
+    /// The owning process's ProcNumber. The worker process uses this to set the process's
+    /// latch on completion.
+    ///
+    /// (This could be calculated from num_neon_request_slots_per_backend and the index of
+    /// this slot in the overall 'neon_request_slots' array)
+    owner_procno: AtomicI32,
+
+    /// The request payload.
+    ///
+    /// SAFETY: This is modified by fill_request(), after it has established ownership
+    /// of the slot by setting state from Idle to Filling
+    request: UnsafeCell<NeonIORequest>,
+
+    /// The result payload; valid when state is Completed.
+    ///
+    /// SAFETY: This is modified by RequestProcessingGuard::completed(). There can be
+    /// only one RequestProcessingGuard outstanding for a slot at a time, because
+    /// it is returned by start_processing_request() which checks the state, so
+    /// RequestProcessingGuard has exclusive access to the slot.
+    result: UnsafeCell<NeonIOResult>,
+}
+
+// SAFETY: NeonIOHandle contains UnsafeCell fields, so it is not automatically
+// Sync. The protocol described in the "Lifecycle of a request" section above
+// ensures the safe access to the fields: at any time only the process that
+// currently owns the slot (as indicated by 'state') reads or writes 'request'
+// and 'result'.
+unsafe impl Send for NeonIOHandle {}
+unsafe impl Sync for NeonIOHandle {}
+
+/// A default slot is in the Idle state, has `owner_procno` set to -1, and holds
+/// the `Empty` request and result variants.
+impl Default for NeonIOHandle {
+    fn default() -> Self {
+        Self {
+            state: AtomicNeonIOHandleState::new(NeonIOHandleState::Idle),
+            owner_procno: AtomicI32::new(-1),
+            request: UnsafeCell::new(NeonIORequest::Empty),
+            result: UnsafeCell::new(NeonIOResult::Empty),
+        }
+    }
+}
+
+/// Lifecycle state of a `NeonIOHandle` slot. The state determines whether the
+/// backend or the communicator process currently owns the slot; see the
+/// "Lifecycle of a request" section on `NeonIOHandle`.
+///
+/// The atomic counterpart used for the `state` field is
+/// `AtomicNeonIOHandleState`.
+#[atomic_enum]
+#[derive(Eq, PartialEq)]
+pub enum NeonIOHandleState {
+    /// No request in the slot. The slot is owned by the backend, which may
+    /// start filling in a new request.
+    Idle,
+
+    /// backend is filling in the request
+    Filling,
+
+    /// Backend has submitted the request to the communicator, but the
+    /// communicator process has not yet started processing it.
+    Submitted,
+
+    /// Communicator is processing the request
+    Processing,
+
+    /// Communicator has completed the request, and the 'result' field is now
+    /// valid, but the backend has not read the result yet.
+    Completed,
+}
+
+/// Handle to a slot that is in the Processing state.
+///
+/// Returned by `NeonIOHandle::start_processing_request()` once it has moved the
+/// slot from Submitted to Processing. It gives access to the request and is
+/// consumed by `completed()`, which stores the result and marks the slot
+/// Completed.
+pub struct RequestProcessingGuard<'a>(&'a NeonIOHandle);
+
+// The guard only wraps a shared reference to a NeonIOHandle, which is itself
+// declared Send + Sync.
+// NOTE(review): these impls look redundant with the auto traits that
+// `&NeonIOHandle` already provides; confirm whether they are needed.
+unsafe impl<'a> Send for RequestProcessingGuard<'a> {}
+unsafe impl<'a> Sync for RequestProcessingGuard<'a> {}
+
+impl<'a> RequestProcessingGuard<'a> {
+    /// Returns a reference to the request stored in the slot.
+    pub fn get_request(&self) -> &NeonIORequest {
+        // SAFETY: relies on the slot ownership protocol: while this guard exists
+        // the slot is in Processing state, and the backend only writes 'request'
+        // in fill_request() after moving the slot from Idle to Filling.
+        unsafe { &*self.0.request.get() }
+    }
+
+    /// Returns the ProcNumber that was stored in the slot by fill_request().
+    pub fn get_owner_procno(&self) -> i32 {
+        self.0.owner_procno.load(Ordering::Relaxed)
+    }
+
+    /// Stores `result` in the slot and marks the slot as Completed, consuming
+    /// the guard.
+    ///
+    /// Panics if the slot was not in Processing state.
+    pub fn completed(self, result: NeonIOResult) {
+        // The result must be written before the state change below, because the
+        // Release ordering on the swap is what publishes it.
+        unsafe {
+            *self.0.result.get() = result;
+        };
+
+        // Ok, we have completed the IO. Mark the request as completed. After that,
+        // we no longer have ownership of the slot, and must not modify it.
+        let old_state = self
+            .0
+            .state
+            .swap(NeonIOHandleState::Completed, Ordering::Release);
+        assert!(old_state == NeonIOHandleState::Processing);
+    }
+}
+
+impl NeonIOHandle {
+    /// Fills the slot with a copy of `request`, records `proc_number` as the
+    /// owner, and marks the slot as Submitted.
+    ///
+    /// Panics if the slot is not in Idle state when called.
+    pub fn fill_request(&self, request: &NeonIORequest, proc_number: i32) {
+        // Verify that the slot is in Idle state previously, and start filling it.
+        //
+        // XXX: This step isn't strictly necessary. Assuming the caller didn't screw up
+        // and try to use a slot that's already in use, we could fill the slot and
+        // switch it directly from Idle to Submitted state.
+        if let Err(s) = self.state.compare_exchange(
+            NeonIOHandleState::Idle,
+            NeonIOHandleState::Filling,
+            Ordering::Relaxed,
+            Ordering::Relaxed,
+        ) {
+            panic!("unexpected state in request slot: {s:?}");
+        }
+
+        // This fence synchronizes-with store/swap in `communicator_process_main_loop`.
+        // NOTE(review): the only transition to Idle visible in this file is the
+        // Relaxed store in try_get_result(); confirm which store this fence is
+        // meant to pair with.
+        fence(Ordering::Acquire);
+
+        self.owner_procno.store(proc_number, Ordering::Relaxed);
+        unsafe { *self.request.get() = *request }
+        // The Release store publishes the owner_procno and request writes above
+        // together with the state change; start_processing_request() issues an
+        // Acquire fence after observing Submitted.
+        self.state
+            .store(NeonIOHandleState::Submitted, Ordering::Release);
+    }
+
+    /// Checks whether the request in this slot has completed.
+    ///
+    /// If the slot is in Completed state, returns a copy of the result and
+    /// resets the slot to Idle. Otherwise returns None and leaves the slot
+    /// untouched.
+    pub fn try_get_result(&self) -> Option<NeonIOResult> {
+        // FIXME: ordering?
+        // NOTE(review): the Relaxed load is followed by an Acquire fence when the
+        // state is Completed, which looks sufficient to pair with the Release
+        // swap in RequestProcessingGuard::completed(); confirm.
+        let state = self.state.load(Ordering::Relaxed);
+        if state == NeonIOHandleState::Completed {
+            // This fence synchronizes-with store/swap in `communicator_process_main_loop`.
+            fence(Ordering::Acquire);
+            let result = unsafe { *self.result.get() };
+            self.state.store(NeonIOHandleState::Idle, Ordering::Relaxed);
+            Some(result)
+        } else {
+            None
+        }
+    }
+
+    /// Moves the slot from Submitted to Processing and returns a guard for
+    /// processing the request.
+    ///
+    /// As written, this never returns None: if the slot is in any state other
+    /// than Submitted, it panics.
+    pub fn start_processing_request<'a>(&'a self) -> Option<RequestProcessingGuard<'a>> {
+        // Read the IO request from the slot indicated in the wakeup
+        //
+        // XXX: using compare_exchange for this is not strictly necessary, as long as
+        // the communicator process has _some_ means of tracking which requests it's
+        // already processing. That could be a flag somewhere in communicator's private
+        // memory, for example.
+        if let Err(s) = self.state.compare_exchange(
+            NeonIOHandleState::Submitted,
+            NeonIOHandleState::Processing,
+            Ordering::Relaxed,
+            Ordering::Relaxed,
+        ) {
+            // FIXME surprising state. This is unexpected at the moment, but if we
+            // started to process requests more aggressively, without waiting for the
+            // read from the pipe, then this could happen
+            panic!("unexpected state in request slot: {s:?}");
+        }
+        // Pairs with the Release store of Submitted in fill_request(), so that the
+        // request contents written by the backend are visible from here on.
+        fence(Ordering::Acquire);
+
+        Some(RequestProcessingGuard(self))
+    }
+}
--- a/pgxn/neon/communicator/src/backend_interface.rs
+++ b/pgxn/neon/communicator/src/backend_interface.rs
@@ -0,0 +1,222 @@
+//! This code runs in each backend process. That means that launching Rust threads, panicking
+//! etc. is forbidden!
+
+use std::os::fd::OwnedFd;
+
+use crate::backend_comms::NeonIOHandle;
+use crate::init::CommunicatorInitStruct;
+use crate::integrated_cache::{BackendCacheReadOp, IntegratedCacheReadAccess};
+use crate::neon_request::{CCachedGetPageVResult, COid};
+use crate::neon_request::{NeonIORequest, NeonIOResult};
+
+pub struct CommunicatorBackendStruct<'t> {
+    my_proc_number: i32,
+
+    next_neon_request_idx: u32,
+
+    my_start_idx: u32, // First request slot that belongs to this backend
+    my_end_idx: u32,   // end + 1 request slot that belongs to this backend
+
+    neon_request_slots: &'t [NeonIOHandle],
+
+    submission_pipe_write_fd: OwnedFd,
+
+    pending_cache_read_op: Option<BackendCacheReadOp<'t>>,
+
+    integrated_cache: &'t IntegratedCacheReadAccess<'t>,
+}
+
+#[unsafe(no_mangle)]
+pub extern "C" fn rcommunicator_backend_init(
+    cis: Box<CommunicatorInitStruct>,
+    my_proc_number: i32,
+) -> &'static mut CommunicatorBackendStruct<'static> {
+    let start_idx = my_proc_number as u32 * cis.num_neon_request_slots_per_backend;
+    let end_idx = start_idx + cis.num_neon_request_slots_per_backend;
+
+    let integrated_cache = Box::leak(Box::new(cis.integrated_cache_init_struct.backend_init()));
+
+    let bs: &'static mut CommunicatorBackendStruct =
+        Box::leak(Box::new(CommunicatorBackendStruct {
+            my_proc_number,
+            next_neon_request_idx: start_idx,
+            my_start_idx: start_idx,
+            my_end_idx: end_idx,
+            neon_request_slots: cis.neon_request_slots,
+
+            submission_pipe_write_fd: cis.submission_pipe_write_fd,
+            pending_cache_read_op: None,
+
+            integrated_cache,
+        }));
+    bs
+}
+
+/// Start a request. Returns -1 if the result was available immediately (it
+/// is then stored in *immediate_result_ptr); otherwise returns the request
+/// slot index, to be polled with bcomm_poll_request_completion(). The
+/// communicator will wake us up by setting our process latch, so wait on the
+/// latch and poll again every time the latch is set.
+///
+/// Safety: The C caller must ensure that the references are valid.
+#[unsafe(no_mangle)]
+pub extern "C" fn bcomm_start_io_request(
+    bs: &'_ mut CommunicatorBackendStruct,
+    request: &NeonIORequest,
+    immediate_result_ptr: &mut NeonIOResult,
+) -> i32 {
+    assert!(bs.pending_cache_read_op.is_none());
+
+    // Check if the request can be satisfied from the cache first (only RelSize is looked up here)
+    if let NeonIORequest::RelSize(req) = request {
+        if let Some(nblocks) = bs.integrated_cache.get_rel_size(&req.reltag()) {
+            *immediate_result_ptr = NeonIOResult::RelSize(nblocks);
+            return -1;
+        }
+    }
+
+    // Copy the request into this backend's next request slot
+    let request_idx = bs.start_neon_request(request);
+
+    // Tell the communicator about it, by writing the slot index to the submission pipe
+    bs.submit_request(request_idx);
+
+    request_idx
+}
+
+#[unsafe(no_mangle)]
+pub extern "C" fn bcomm_start_get_page_v_request(
+    bs: &mut CommunicatorBackendStruct,
+    request: &NeonIORequest,
+    immediate_result_ptr: &mut CCachedGetPageVResult,
+) -> i32 {
+    let NeonIORequest::GetPageV(get_pagev_request) = request else {
+        panic!("invalid request passed to bcomm_start_get_page_v_request()");
+    };
+    assert!(matches!(request, NeonIORequest::GetPageV(_)));
+    assert!(bs.pending_cache_read_op.is_none());
+
+    // Check if the request can be satisfied from the cache first
+    let mut all_cached = true;
+    let mut read_op = bs.integrated_cache.start_read_op();
+    for i in 0..get_pagev_request.nblocks {
+        if let Some(cache_block) = read_op.get_page(
+            &get_pagev_request.reltag(),
+            get_pagev_request.block_number + i as u32,
+        ) {
+            immediate_result_ptr.cache_block_numbers[i as usize] = cache_block;
+        } else {
+            // not found in cache
+            all_cached = false;
+            break;
+        }
+    }
+    if all_cached {
+        bs.pending_cache_read_op = Some(read_op);
+        return -1;
+    }
+
+    // Create neon request and submit it
+    let request_idx = bs.start_neon_request(request);
+
+    // Tell the communicator about it
+    bs.submit_request(request_idx);
+
+    request_idx
+}
+
+/// Check if a request has completed. Returns:
+///
+/// -1 if the request is still being processed
+/// 0 on success
+#[unsafe(no_mangle)]
+pub extern "C" fn bcomm_poll_request_completion(
+    bs: &mut CommunicatorBackendStruct,
+    request_idx: u32,
+    result_p: &mut NeonIOResult,
+) -> i32 {
+    match bs.neon_request_slots[request_idx as usize].try_get_result() {
+        None => -1, // still processing
+        Some(result) => {
+            *result_p = result;
+            0
+        }
+    }
+}
+
+// LFC functions
+
+/// Finish a local file cache read started by bcomm_start_get_page_v_request().
+///
+/// Returns the result of the pending read op's finish(). Panics if no cache read is pending.
+#[unsafe(no_mangle)]
+pub extern "C" fn bcomm_finish_cache_read(bs: &mut CommunicatorBackendStruct) -> bool {
+    if let Some(op) = bs.pending_cache_read_op.take() {
+        op.finish()
+    } else {
+        panic!("bcomm_finish_cache_read() called with no cached read pending");
+    }
+}
+
+
+/// Check if the local file cache contains the given block
+#[unsafe(no_mangle)]
+pub extern "C" fn bcomm_cache_contains(
+    bs: &mut CommunicatorBackendStruct,
+    spc_oid: COid,
+    db_oid: COid,
+    rel_number: u32,
+    fork_number: u8,
+    block_number: u32,
+) -> bool {
+    bs.integrated_cache.cache_contains_page(
+        &pageserver_page_api::RelTag {
+            spcnode: spc_oid,
+            dbnode: db_oid,
+            relnode: rel_number,
+            forknum: fork_number,
+        },
+        block_number
+    )
+}
+
+
+impl<'t> CommunicatorBackendStruct<'t> {
+    /// Send a wakeup to the communicator process
+    fn submit_request(self: &CommunicatorBackendStruct<'t>, request_idx: i32) {
+        // wake up communicator by writing the idx to the submission pipe
+        //
+        // This can block, if the pipe is full. That should be very rare,
+        // because the communicator tries hard to drain the pipe to prevent
+        // that. Also, there's a natural upper bound on how many wakeups can be
+        // queued up: there is only a limited number of request slots for each
+        // backend.
+        //
+        // If it does block very briefly, that's not too serious.
+        let idxbuf = request_idx.to_ne_bytes();
+
+        let _res = nix::unistd::write(&self.submission_pipe_write_fd, &idxbuf);
+        // FIXME: check result, return any errors
+    }
+
+    /// Note: there's no guarantee on when the communicator might pick it up. You should ring
+    /// the doorbell. But it might pick it up immediately.
+    pub(crate) fn start_neon_request(&mut self, request: &NeonIORequest) -> i32 {
+        let my_proc_number = self.my_proc_number;
+
+        // Grab next free slot
+        // FIXME: any guarantee that there will be any?
+        let idx = self.next_neon_request_idx;
+
+        let next_idx = idx + 1;
+        self.next_neon_request_idx = if next_idx == self.my_end_idx {
+            self.my_start_idx
+        } else {
+            next_idx
+        };
+
+        self.neon_request_slots[idx as usize].fill_request(request, my_proc_number);
+
+        idx as i32
+    }
+}
--- a/pgxn/neon/communicator/src/file_cache.rs
+++ b/pgxn/neon/communicator/src/file_cache.rs
@@ -0,0 +1,160 @@
+//! Implement the "low-level" parts of the file cache.
+//!
+//! This module just deals with reading and writing the file, and keeping track
+//! which blocks in the cache file are in use and which are free. The "high
+//! level" parts of tracking which block in the cache file corresponds to which
+//! relation block is handled in 'integrated_cache' instead.
+//!
+//! This module is only used to access the file from the communicator
+//! process. The backend processes *also* read the file (and sometimes also
+//! write it? ), but the backends use direct C library calls for that.
+use std::fs::File;
+use std::os::unix::fs::FileExt;
+use std::path::Path;
+use std::sync::Arc;
+use std::sync::Mutex;
+
+use crate::BLCKSZ;
+
+use tokio::task::spawn_blocking;
+
+pub type CacheBlock = u64;
+
+pub const INVALID_CACHE_BLOCK: CacheBlock = u64::MAX;
+
+pub struct FileCache {
+    file: Arc<File>,
+
+    free_list: Mutex<FreeList>,
+
+    // metrics
+    max_blocks_gauge: metrics::IntGauge,
+    num_free_blocks_gauge: metrics::IntGauge,
+}
+
+// TODO: We keep track of all free blocks in this vec. That doesn't really scale.
+// Idea: when free_blocks fills up with more than 1024 entries, write them all to
+// one block on disk.
+struct FreeList {
+    next_free_block: CacheBlock,
+    max_blocks: u64,
+
+    free_blocks: Vec<CacheBlock>,
+}
+
+impl FileCache {
+    pub fn new(file_cache_path: &Path, mut initial_size: u64) -> Result<FileCache, std::io::Error> {
+        if initial_size < 100 {
+            tracing::warn!(
+                "min size for file cache is 100 blocks, {} requested",
+                initial_size
+            );
+            initial_size = 100;
+        }
+
+        let file = std::fs::OpenOptions::new()
+            .read(true)
+            .write(true)
+            .truncate(true)
+            .create(true)
+            .open(file_cache_path)?;
+
+        let max_blocks_gauge = metrics::IntGauge::new(
+            "file_cache_max_blocks",
+            "Local File Cache size in 8KiB blocks",
+        )
+        .unwrap();
+        let num_free_blocks_gauge = metrics::IntGauge::new(
+            "file_cache_num_free_blocks",
+            "Number of free 8KiB blocks in Local File Cache",
+        )
+        .unwrap();
+
+        tracing::info!("initialized file cache with {} blocks", initial_size);
+
+        Ok(FileCache {
+            file: Arc::new(file),
+            free_list: Mutex::new(FreeList {
+                next_free_block: 0,
+                max_blocks: initial_size,
+                free_blocks: Vec::new(),
+            }),
+            max_blocks_gauge,
+            num_free_blocks_gauge,
+        })
+    }
+
+    // File cache management
+
+    pub async fn read_block(
+        &self,
+        cache_block: CacheBlock,
+        mut dst: impl uring_common::buf::IoBufMut + Send + Sync,
+    ) -> Result<(), std::io::Error> {
+        assert!(dst.bytes_total() == BLCKSZ);
+        let file = self.file.clone();
+
+        let dst_ref = unsafe { std::slice::from_raw_parts_mut(dst.stable_mut_ptr(), BLCKSZ) };
+
+        spawn_blocking(move || file.read_exact_at(dst_ref, cache_block * BLCKSZ as u64)).await??;
+        Ok(())
+    }
+
+    pub async fn write_block(
+        &self,
+        cache_block: CacheBlock,
+        src: impl uring_common::buf::IoBuf + Send + Sync,
+    ) -> Result<(), std::io::Error> {
+        assert!(src.bytes_init() == BLCKSZ);
+        let file = self.file.clone();
+
+        let src_ref = unsafe { std::slice::from_raw_parts(src.stable_ptr(), BLCKSZ) };
+
+        spawn_blocking(move || file.write_all_at(src_ref, cache_block * BLCKSZ as u64)).await??;
+
+        Ok(())
+    }
+
+    pub fn alloc_block(&self) -> Option<CacheBlock> {
+        let mut free_list = self.free_list.lock().unwrap();
+        if let Some(x) = free_list.free_blocks.pop() {
+            return Some(x);
+        }
+        if free_list.next_free_block < free_list.max_blocks {
+            let result = free_list.next_free_block;
+            free_list.next_free_block += 1;
+            return Some(result);
+        }
+        None
+    }
+
+    pub fn dealloc_block(&self, cache_block: CacheBlock) {
+        let mut free_list = self.free_list.lock().unwrap();
+        free_list.free_blocks.push(cache_block);
+    }
+}
+
+impl metrics::core::Collector for FileCache {
+    fn desc(&self) -> Vec<&metrics::core::Desc> {
+        let mut descs = Vec::new();
+        descs.append(&mut self.max_blocks_gauge.desc());
+        descs.append(&mut self.num_free_blocks_gauge.desc());
+        descs
+    }
+    fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
+        // Update the gauges with fresh values first
+        {
+            let free_list = self.free_list.lock().unwrap();
+            self.max_blocks_gauge.set(free_list.max_blocks as i64);
+
+            let total_free_blocks: i64 = free_list.free_blocks.len() as i64
+                + (free_list.max_blocks as i64 - free_list.next_free_block as i64);
+            self.num_free_blocks_gauge.set(total_free_blocks);
+        }
+
+        let mut values = Vec::new();
+        values.append(&mut self.max_blocks_gauge.collect());
+        values.append(&mut self.num_free_blocks_gauge.collect());
+        values
+    }
+}
--- a/pgxn/neon/communicator/src/global_allocator.rs
+++ b/pgxn/neon/communicator/src/global_allocator.rs
@@ -0,0 +1,109 @@
+//! Global allocator, for tracking memory usage of the Rust parts
+//!
+//! Postgres is designed to handle allocation failure (ie. malloc() returning NULL) gracefully.  It
+//! rolls back the transaction and gives the user an "ERROR: out of memory" error. Rust code
+//! however panics if an allocation fails. We don't want that to ever happen, because an unhandled
+//! panic leads to Postgres crash and restart. Our strategy is to pre-allocate a large enough chunk
+//! of memory for use by the Rust code, so that the allocations never fail.
+//!
+//! To pick the size for the pre-allocated chunk, we have a metric to track the high watermark
+//! memory usage of all the Rust allocations in total.
+//!
+//! TODO:
+//!
+//! - Currently we just export the metrics. Actual allocations are still just passed through to
+//!   the system allocator.
+//! - Take padding etc. overhead into account
+
+use std::alloc::{GlobalAlloc, Layout, System};
+use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
+
+use metrics::IntGauge;
+
+struct MyAllocator {
+    allocations: AtomicU64,
+    deallocations: AtomicU64,
+
+    allocated: AtomicUsize,
+    high: AtomicUsize,
+}
+
+unsafe impl GlobalAlloc for MyAllocator {
+    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
+        self.allocations.fetch_add(1, Ordering::Relaxed);
+        let mut allocated = self.allocated.fetch_add(layout.size(), Ordering::Relaxed);
+        allocated += layout.size();
+        self.high.fetch_max(allocated, Ordering::Relaxed);
+        unsafe { System.alloc(layout) }
+    }
+
+    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
+        self.deallocations.fetch_add(1, Ordering::Relaxed);
+        self.allocated.fetch_sub(layout.size(), Ordering::Relaxed);
+        unsafe { System.dealloc(ptr, layout) }
+    }
+}
+
+#[global_allocator]
+static GLOBAL: MyAllocator = MyAllocator {
+    allocations: AtomicU64::new(0),
+    deallocations: AtomicU64::new(0),
+    allocated: AtomicUsize::new(0),
+    high: AtomicUsize::new(0),
+};
+
+pub struct MyAllocatorCollector {
+    allocations: IntGauge,
+    deallocations: IntGauge,
+    allocated: IntGauge,
+    high: IntGauge,
+}
+
+impl MyAllocatorCollector {
+    pub fn new() -> MyAllocatorCollector {
+        MyAllocatorCollector {
+            allocations: IntGauge::new("allocations_total", "Number of allocations in Rust code")
+                .unwrap(),
+            deallocations: IntGauge::new(
+                "deallocations_total",
+                "Number of deallocations in Rust code",
+            )
+            .unwrap(),
+            allocated: IntGauge::new("allocated_total", "Bytes currently allocated").unwrap(),
+            high: IntGauge::new("allocated_high", "High watermark of allocated bytes").unwrap(),
+        }
+    }
+}
+
+impl metrics::core::Collector for MyAllocatorCollector {
+    fn desc(&self) -> Vec<&metrics::core::Desc> {
+        let mut descs = Vec::new();
+
+        descs.append(&mut self.allocations.desc());
+        descs.append(&mut self.deallocations.desc());
+        descs.append(&mut self.allocated.desc());
+        descs.append(&mut self.high.desc());
+
+        descs
+    }
+
+    fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
+        let mut values = Vec::new();
+
+        // Refresh each gauge from its matching counter in GLOBAL before collecting
+        self.allocations
+            .set(GLOBAL.allocations.load(Ordering::Relaxed) as i64);
+        self.deallocations
+            .set(GLOBAL.deallocations.load(Ordering::Relaxed) as i64);
+        self.allocated
+            .set(GLOBAL.allocated.load(Ordering::Relaxed) as i64);
+        self.high.set(GLOBAL.high.load(Ordering::Relaxed) as i64);
+
+        values.append(&mut self.allocations.collect());
+        values.append(&mut self.deallocations.collect());
+        values.append(&mut self.allocated.collect());
+        values.append(&mut self.high.collect());
+
+        values
+    }
+}
--- a/pgxn/neon/communicator/src/init.rs
+++ b/pgxn/neon/communicator/src/init.rs
@@ -0,0 +1,184 @@
+//! Initialization functions. These are executed in the postmaster process,
+//! at different stages of server startup.
+//!
+//!
+//! Communicator initialization steps:
+//!
+//! 1. At postmaster startup, before shared memory is allocated,
+//!    rcommunicator_shmem_size() is called to get the amount of
+//!    shared memory that this module needs.
+//!
+//! 2. Later, after the shared memory has been allocated,
+//!    rcommunicator_shmem_init() is called to initialize the shmem
+//!    area.
+//!
+//! Per process initialization:
+//!
+//! When a backend process starts up, it calls rcommunicator_backend_init().
+//! In the communicator worker process, other functions are called, see
+//! `worker_process` module.
+
+use std::ffi::c_int;
+use std::mem;
+use std::mem::MaybeUninit;
+use std::os::fd::OwnedFd;
+
+use crate::backend_comms::NeonIOHandle;
+use crate::integrated_cache::IntegratedCacheInitStruct;
+
+const NUM_NEON_REQUEST_SLOTS_PER_BACKEND: u32 = 5;
+
+/// This struct is created in the postmaster process, and inherited to
+/// the communicator process and all backend processes through fork()
+#[repr(C)]
+pub struct CommunicatorInitStruct {
+    #[allow(dead_code)]
+    pub max_procs: u32,
+
+    pub submission_pipe_read_fd: OwnedFd,
+    pub submission_pipe_write_fd: OwnedFd,
+
+    // Shared memory data structures
+    pub num_neon_request_slots_per_backend: u32,
+
+    pub neon_request_slots: &'static [NeonIOHandle],
+
+    pub integrated_cache_init_struct: IntegratedCacheInitStruct<'static>,
+}
+
+impl std::fmt::Debug for CommunicatorInitStruct {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        fmt.debug_struct("CommunicatorInitStruct")
+            .field("max_procs", &self.max_procs)
+            .field("submission_pipe_read_fd", &self.submission_pipe_read_fd)
+            .field("submission_pipe_write_fd", &self.submission_pipe_write_fd)
+            .field(
+                "num_neon_request_slots_per_backend",
+                &self.num_neon_request_slots_per_backend,
+            )
+            .field("neon_request_slots length", &self.neon_request_slots.len())
+            .finish()
+    }
+}
+
+#[unsafe(no_mangle)]
+pub extern "C" fn rcommunicator_shmem_size(max_procs: u32) -> u64 {
+    let mut size = 0;
+
+    let num_neon_request_slots = max_procs * NUM_NEON_REQUEST_SLOTS_PER_BACKEND;
+    size += mem::size_of::<NeonIOHandle>() * num_neon_request_slots as usize;
+
+    // For integrated_cache's Allocator. TODO: make this adjustable
+    size += IntegratedCacheInitStruct::shmem_size(max_procs);
+
+    size as u64
+}
+
+/// Initialize the shared memory segment. Returns a backend-private
+/// struct, which will be inherited by backend processes through fork
+#[unsafe(no_mangle)]
+pub extern "C" fn rcommunicator_shmem_init(
+    submission_pipe_read_fd: c_int,
+    submission_pipe_write_fd: c_int,
+    max_procs: u32,
+    shmem_area_ptr: *mut MaybeUninit<u8>,
+    shmem_area_len: u64,
+    initial_file_cache_size: u64,
+    max_file_cache_size: u64,
+) -> &'static mut CommunicatorInitStruct {
+    let shmem_area: &'static mut [MaybeUninit<u8>] =
+        unsafe { std::slice::from_raw_parts_mut(shmem_area_ptr, shmem_area_len as usize) };
+
+    // Carve out the request slots from the shmem area and initialize them
+    let num_neon_request_slots_per_backend = NUM_NEON_REQUEST_SLOTS_PER_BACKEND as usize;
+    let num_neon_request_slots = max_procs as usize * num_neon_request_slots_per_backend;
+
+    let (neon_request_slots, remaining_area) =
+        alloc_array_from_slice::<NeonIOHandle>(shmem_area, num_neon_request_slots);
+
+    for slot in neon_request_slots.iter_mut() {
+        slot.write(NeonIOHandle::default());
+    }
+
+    // 'neon_request_slots' is initialized now. (MaybeUninit::slice_assume_init_mut() is nightly-only
+    // as of this writing.)
+    let neon_request_slots = unsafe {
+        std::mem::transmute::<&mut [MaybeUninit<NeonIOHandle>], &mut [NeonIOHandle]>(
+            neon_request_slots,
+        )
+    };
+
+    // Give the rest of the area to the integrated cache
+    let integrated_cache_init_struct = IntegratedCacheInitStruct::shmem_init(
+        max_procs,
+        remaining_area,
+        initial_file_cache_size,
+        max_file_cache_size,
+    );
+
+    let (submission_pipe_read_fd, submission_pipe_write_fd) = unsafe {
+        use std::os::fd::FromRawFd;
+        (
+            OwnedFd::from_raw_fd(submission_pipe_read_fd),
+            OwnedFd::from_raw_fd(submission_pipe_write_fd),
+        )
+    };
+
+    let cis: &'static mut CommunicatorInitStruct = Box::leak(Box::new(CommunicatorInitStruct {
+        max_procs,
+        submission_pipe_read_fd,
+        submission_pipe_write_fd,
+
+        num_neon_request_slots_per_backend: NUM_NEON_REQUEST_SLOTS_PER_BACKEND,
+        neon_request_slots,
+
+        integrated_cache_init_struct,
+    }));
+
+    cis
+}
+
+// fixme: currently unused
+#[allow(dead_code)]
+pub fn alloc_from_slice<T>(
+    area: &mut [MaybeUninit<u8>],
+) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
+    let layout = std::alloc::Layout::new::<T>();
+
+    let area_start = area.as_mut_ptr();
+
+    // pad to satisfy alignment requirements
+    let padding = area_start.align_offset(layout.align());
+    if padding + layout.size() > area.len() {
+        panic!("out of memory");
+    }
+    let area = &mut area[padding..];
+    let (result_area, remain) = area.split_at_mut(layout.size());
+
+    let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
+    let result = unsafe { result_ptr.as_mut().unwrap() };
+
+    (result, remain)
+}
+
+/// Carve an aligned, uninitialized array of 'len' Ts off the front of 'area'; returns (array, rest).
+pub fn alloc_array_from_slice<T>(
+    area: &mut [MaybeUninit<u8>],
+    len: usize,
+) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
+    let layout = std::alloc::Layout::new::<T>();
+    let area_start = area.as_mut_ptr();
+
+    // pad to satisfy alignment requirements; panic if the padded array does not fit in 'area'
+    let padding = area_start.align_offset(layout.align());
+    if padding + layout.size() * len > area.len() {
+        panic!("out of memory");
+    }
+    let area = &mut area[padding..];
+    let (result_area, remain) = area.split_at_mut(layout.size() * len);
+
+    // Build the slice from the raw pointer itself, so that it is valid for all 'len' elements
+    let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
+    let result = unsafe { std::slice::from_raw_parts_mut(result_ptr, len) };
+
+    (result, remain)
+}
--- a/pgxn/neon/communicator/src/integrated_cache.rs
+++ b/pgxn/neon/communicator/src/integrated_cache.rs
@@ -0,0 +1,809 @@
+//! Integrated communicator cache
+//!
+//! It tracks:
+//! - Relation sizes and existence
+//! - Last-written LSN
+//! - Block cache (also known as LFC)
+//!
+//! TODO: limit the size
+//! TODO: concurrency
+//!
+//! Note: This deals with "relations" which is really just one "relation fork" in Postgres
+//! terms. RelFileLocator + ForkNumber is the key.
+
+//
+// TODO: Thoughts on eviction:
+//
+// There are two things we need to track, and evict if we run out of space:
+// - blocks in the file cache's file. If the file grows too large, need to evict something.
+//   Also if the cache is resized
+//
+// - entries in the cache map. If we run out of memory in the shmem area, need to evict
+//   something
+//
+
+use std::mem::MaybeUninit;
+use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
+
+use utils::lsn::{AtomicLsn, Lsn};
+
+use crate::file_cache::INVALID_CACHE_BLOCK;
+use crate::file_cache::{CacheBlock, FileCache};
+use pageserver_page_api::RelTag;
+
+use metrics::{IntCounter, IntGauge};
+
+use neon_shmem::hash::HashMapInit;
+use neon_shmem::hash::UpdateAction;
+use neon_shmem::shmem::ShmemHandle;
+
+// in # of entries
+const RELSIZE_CACHE_SIZE: u32 = 64 * 1024;
+
+/// This struct is initialized at postmaster startup, and passed to all the processes via fork().
+pub struct IntegratedCacheInitStruct<'t> {
+    relsize_cache_handle: HashMapInit<'t, RelKey, RelEntry>,
+    block_map_handle: HashMapInit<'t, BlockKey, BlockEntry>,
+}
+
+/// Represents write-access to the integrated cache. This is used by the communicator process.
+pub struct IntegratedCacheWriteAccess<'t> {
+    relsize_cache: neon_shmem::hash::HashMapAccess<'t, RelKey, RelEntry>,
+    block_map: neon_shmem::hash::HashMapAccess<'t, BlockKey, BlockEntry>,
+
+    global_lw_lsn: AtomicU64,
+
+    pub(crate) file_cache: Option<FileCache>,
+
+    // Fields for eviction
+    clock_hand: std::sync::Mutex<usize>,
+
+    // Metrics
+    page_evictions_counter: IntCounter,
+    clock_iterations_counter: IntCounter,
+
+    // metrics from the hash map
+    block_map_num_buckets: IntGauge,
+    block_map_num_buckets_in_use: IntGauge,
+
+    relsize_cache_num_buckets: IntGauge,
+    relsize_cache_num_buckets_in_use: IntGauge,
+}
+
+/// Represents read-only access to the integrated cache. Backend processes have this.
+pub struct IntegratedCacheReadAccess<'t> {
+    relsize_cache: neon_shmem::hash::HashMapAccess<'t, RelKey, RelEntry>,
+    block_map: neon_shmem::hash::HashMapAccess<'t, BlockKey, BlockEntry>,
+}
+
+impl<'t> IntegratedCacheInitStruct<'t> {
+    /// Return the desired size in bytes of the fixed-size shared memory area to reserve for the
+    /// integrated cache.
+    pub fn shmem_size(_max_procs: u32) -> usize {
+        // The relsize cache is fixed-size. The block map is allocated in a separate resizable
+        // area.
+        HashMapInit::<RelKey, RelEntry>::estimate_size(RELSIZE_CACHE_SIZE)
+    }
+
+    /// Initialize the shared memory segment. This runs once in postmaster. Returns a struct which
+    /// will be inherited by all processes through fork.
+    pub fn shmem_init(
+        _max_procs: u32,
+        shmem_area: &'t mut [MaybeUninit<u8>],
+        initial_file_cache_size: u64,
+        max_file_cache_size: u64,
+    ) -> IntegratedCacheInitStruct<'t> {
+        // Initialize the relsize cache in the fixed-size area
+        let relsize_cache_handle =
+            neon_shmem::hash::HashMapInit::init_in_fixed_area(RELSIZE_CACHE_SIZE, shmem_area);
+
+        let max_bytes =
+            HashMapInit::<BlockKey, BlockEntry>::estimate_size(max_file_cache_size as u32);
+
+        // Initialize the block map in a separate resizable shared memory area
+        let shmem_handle = ShmemHandle::new("block mapping", 0, max_bytes).unwrap();
+
+        let block_map_handle = neon_shmem::hash::HashMapInit::init_in_shmem(
+            initial_file_cache_size as u32,
+            shmem_handle,
+        );
+        IntegratedCacheInitStruct {
+            relsize_cache_handle,
+            block_map_handle,
+        }
+    }
+
+    /// Initialize access to the integrated cache for the communicator worker process
+    pub fn worker_process_init(
+        self,
+        lsn: Lsn,
+        file_cache: Option<FileCache>,
+    ) -> IntegratedCacheWriteAccess<'t> {
+        let IntegratedCacheInitStruct {
+            relsize_cache_handle,
+            block_map_handle,
+        } = self;
+        IntegratedCacheWriteAccess {
+            relsize_cache: relsize_cache_handle.attach_writer(),
+            block_map: block_map_handle.attach_writer(),
+            global_lw_lsn: AtomicU64::new(lsn.0),
+            file_cache,
+            clock_hand: std::sync::Mutex::new(0),
+
+            page_evictions_counter: metrics::IntCounter::new(
+                "integrated_cache_evictions",
+                "Page evictions from the Local File Cache",
+            )
+            .unwrap(),
+
+            clock_iterations_counter: metrics::IntCounter::new(
+                "clock_iterations",
+                "Number of times the clock hand has moved",
+            )
+            .unwrap(),
+
+            block_map_num_buckets: metrics::IntGauge::new(
+                "block_map_num_buckets",
+                "Allocated size of the block cache hash map",
+            )
+            .unwrap(),
+            block_map_num_buckets_in_use: metrics::IntGauge::new(
+                "block_map_num_buckets_in_use",
+                "Number of buckets in use in the block cache hash map",
+            )
+            .unwrap(),
+
+            relsize_cache_num_buckets: metrics::IntGauge::new(
+                "relsize_cache_num_buckets",
+                "Allocated size of the relsize cache hash map",
+            )
+            .unwrap(),
+            relsize_cache_num_buckets_in_use: metrics::IntGauge::new(
+                "relsize_cache_num_buckets_in_use",
+                "Number of buckets in use in the relsize cache hash map",
+            )
+            .unwrap(),
+        }
+    }
+
+    /// Initialize access to the integrated cache for a backend process
+    pub fn backend_init(self) -> IntegratedCacheReadAccess<'t> {
+        let IntegratedCacheInitStruct {
+            relsize_cache_handle,
+            block_map_handle,
+        } = self;
+
+        IntegratedCacheReadAccess {
+            relsize_cache: relsize_cache_handle.attach_reader(),
+            block_map: block_map_handle.attach_reader(),
+        }
+    }
+}
+
+/// Value stored in the cache mapping hash table.
+struct BlockEntry {
+    lw_lsn: AtomicLsn,
+    cache_block: AtomicU64,
+
+    pinned: AtomicU64,
+
+    // 'referenced' bit for the clock algorithm
+    referenced: AtomicBool,
+}
+
+/// Value stored in the relsize cache hash table.
+struct RelEntry {
+    /// cached size of the relation
+    /// u32::MAX means 'not known' (that's InvalidBlockNumber in Postgres)
+    nblocks: AtomicU32,
+}
+
+impl std::fmt::Debug for RelEntry {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        fmt.debug_struct("Rel")
+            .field("nblocks", &self.nblocks.load(Ordering::Relaxed))
+            .finish()
+    }
+}
+impl std::fmt::Debug for BlockEntry {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        fmt.debug_struct("Block")
+            .field("lw_lsn", &self.lw_lsn.load())
+            .field("cache_block", &self.cache_block.load(Ordering::Relaxed))
+            .field("pinned", &self.pinned.load(Ordering::Relaxed))
+            .field("referenced", &self.referenced.load(Ordering::Relaxed))
+            .finish()
+    }
+}
+
+#[derive(Clone, Debug, PartialEq, PartialOrd, Eq, Hash, Ord)]
+struct RelKey(RelTag);
+
+impl From<&RelTag> for RelKey {
+    fn from(val: &RelTag) -> RelKey {
+        RelKey(*val)
+    }
+}
+
+#[derive(Clone, Debug, PartialEq, PartialOrd, Eq, Hash, Ord)]
+struct BlockKey {
+    rel: RelTag,
+    block_number: u32,
+}
+
+impl From<(&RelTag, u32)> for BlockKey {
+    fn from(val: (&RelTag, u32)) -> BlockKey {
+        BlockKey {
+            rel: *val.0,
+            block_number: val.1,
+        }
+    }
+}
+
+/// Return type used in the cache's get_*() functions. 'Found' means that the page, or other
+/// information that was queried, exists in the cache. 'NotFound' means that it does not.
+pub enum CacheResult<V> {
+    /// The queried page or other information existed in the cache.
+    Found(V),
+
+    /// The cache doesn't contain the page (or other queried information, like relation size). The
+    /// Lsn is the 'not_modified_since' LSN that should be used in the request to the pageserver to
+    /// read the page.
+    NotFound(Lsn),
+}
+
+impl<'t> IntegratedCacheWriteAccess<'t> {
+    /// Look up the cached size of `rel`. On a miss, NotFound carries the current value of
+    /// `global_lw_lsn`.
+    pub fn get_rel_size(&'t self, rel: &RelTag) -> CacheResult<u32> {
+        match get_rel_size(&self.relsize_cache, rel) {
+            Some(nblocks) => CacheResult::Found(nblocks),
+            None => CacheResult::NotFound(Lsn(self.global_lw_lsn.load(Ordering::Relaxed))),
+        }
+    }
+
+    pub async fn get_page(
+        &'t self,
+        rel: &RelTag,
+        block_number: u32,
+        dst: impl uring_common::buf::IoBufMut + Send + Sync,
+    ) -> Result<CacheResult<()>, std::io::Error> {
+        let x = if let Some(block_entry) = self.block_map.get(&BlockKey::from((rel, block_number)))
+        {
+            block_entry.referenced.store(true, Ordering::Relaxed);
+
+            let cache_block = block_entry.cache_block.load(Ordering::Relaxed);
+            if cache_block != INVALID_CACHE_BLOCK {
+                // pin it and release lock
+                block_entry.pinned.fetch_add(1, Ordering::Relaxed);
+
+                (cache_block, DeferredUnpin(block_entry.pinned.as_ptr()))
+            } else {
+                return Ok(CacheResult::NotFound(block_entry.lw_lsn.load()));
+            }
+        } else {
+            let lsn = Lsn(self.global_lw_lsn.load(Ordering::Relaxed));
+            return Ok(CacheResult::NotFound(lsn));
+        };
+
+        let (cache_block, _deferred_pin) = x;
+        self.file_cache
+            .as_ref()
+            .unwrap()
+            .read_block(cache_block, dst)
+            .await?;
+
+        // unpin the entry (by implicitly dropping deferred_pin)
+        Ok(CacheResult::Found(()))
+    }
+
+    pub async fn page_is_cached(
+        &'t self,
+        rel: &RelTag,
+        block_number: u32,
+    ) -> Result<CacheResult<()>, std::io::Error> {
+        let key = BlockKey::from((rel, block_number));
+        let outcome = match self.block_map.get(&key) {
+            Some(block_entry) => {
+                // Prefetch requests probe the cache through this function. Count the probe as
+                // an access, so that the page is kept in the cache.
+                block_entry.referenced.store(true, Ordering::Relaxed);
+                // An entry without a cache block only supplies its own LSN for the miss.
+                if block_entry.cache_block.load(Ordering::Relaxed) == INVALID_CACHE_BLOCK {
+                    CacheResult::NotFound(block_entry.lw_lsn.load())
+                } else {
+                    CacheResult::Found(())
+                }
+            }
+            None => CacheResult::NotFound(Lsn(self.global_lw_lsn.load(Ordering::Relaxed))),
+        };
+        Ok(outcome)
+    }
+
+    /// Does the relation exist? CacheResult::NotFound means that the cache doesn't contain that
+    /// information, i.e. we don't know if the relation exists or not.
+    pub fn get_rel_exists(&'t self, rel: &RelTag) -> CacheResult<bool> {
+        let key = RelKey::from(rel);
+        match self.relsize_cache.get(&key) {
+            // Negative entries are not cached at the moment, so the presence of an entry
+            // means that the relation exists.
+            Some(_rel_entry) => CacheResult::Found(true),
+            None => CacheResult::NotFound(Lsn(self.global_lw_lsn.load(Ordering::Relaxed))),
+        }
+    }
+
+    pub fn get_db_size(&'t self, _db_oid: u32) -> CacheResult<u64> {
+        // TODO: it would be nice to cache database sizes too. Getting the database size
+        // is not a very common operation, but when you do it, it's often interactive, with
+        // e.g. psql \l+ command, so the user will feel the latency.
+
+        // fixme: is this right lsn?
+        let lsn = Lsn(self.global_lw_lsn.load(Ordering::Relaxed));
+        CacheResult::NotFound(lsn)
+    }
+
+    pub fn remember_rel_size(&'t self, rel: &RelTag, nblocks: u32) {
+        let result =
+            self.relsize_cache
+                .update_with_fn(&RelKey::from(rel), |existing| match existing {
+                    None => {
+                        tracing::info!("inserting rel entry for {rel:?}, {nblocks} blocks");
+                        UpdateAction::Insert(RelEntry {
+                            nblocks: AtomicU32::new(nblocks),
+                        })
+                    }
+                    Some(e) => {
+                        tracing::info!("updating rel entry for {rel:?}, {nblocks} blocks");
+                        e.nblocks.store(nblocks, Ordering::Relaxed);
+                        UpdateAction::Nothing
+                    }
+                });
+
+        // FIXME: what to do if we run out of memory? Evict other relation entries?
+        result.expect("out of memory");
+    }
+
+    /// Remember the given page contents in the cache.
+    pub async fn remember_page(
+        &'t self,
+        rel: &RelTag,
+        block_number: u32,
+        src: impl uring_common::buf::IoBuf + Send + Sync,
+        lw_lsn: Lsn,
+        is_write: bool,
+    ) {
+        let key = BlockKey::from((rel, block_number));
+
+        // FIXME: make this work when file cache is disabled. Or make it mandatory
+        let file_cache = self.file_cache.as_ref().unwrap();
+
+        if is_write {
+            // there should be no concurrent IOs. If a backend tries to read the page
+            // at the same time, they may get a torn write. That's the same as with
+            // regular POSIX filesystem read() and write()
+
+            // First check if we have a block in cache already
+            let mut old_cache_block = None;
+            let mut found_existing = false;
+
+            let res = self.block_map.update_with_fn(&key, |existing| {
+                if let Some(block_entry) = existing {
+                    found_existing = true;
+
+                    // Prevent this entry from being evicted
+                    let pin_count = block_entry.pinned.fetch_add(1, Ordering::Relaxed);
+                    if pin_count > 0 {
+                        // this is unexpected, because the caller has obtained the io-in-progress lock,
+                        // so no one else should try to modify the page at the same time.
+                        // XXX: and I think a read should not be happening either, because the postgres
+                        // buffer is held locked. TODO: check these conditions and tidy this up a little. Seems fragile to just panic.
+                        panic!("block entry was unexpectedly pinned");
+                    }
+
+                    let cache_block = block_entry.cache_block.load(Ordering::Relaxed);
+                    old_cache_block = if cache_block != INVALID_CACHE_BLOCK {
+                        Some(cache_block)
+                    } else {
+                        None
+                    };
+                }
+                // if there was no existing entry, we will insert one, but not yet
+                UpdateAction::Nothing
+            });
+
+            // FIXME: what to do if we run out of memory? Evict other relation entries? Remove
+            // block entries first?
+            res.expect("out of memory");
+
+            // Allocate a new block if required
+            let cache_block = old_cache_block.unwrap_or_else(|| {
+                loop {
+                    if let Some(x) = file_cache.alloc_block() {
+                        break x;
+                    }
+                    if let Some(x) = self.try_evict_one_cache_block() {
+                        break x;
+                    }
+                }
+            });
+
+            // Write the page to the cache file
+            file_cache
+                .write_block(cache_block, src)
+                .await
+                .expect("error writing to cache");
+            // FIXME: handle errors gracefully.
+            // FIXME: unpin the block entry on error
+
+            // Update the block entry
+            let res = self.block_map.update_with_fn(&key, |existing| {
+                assert_eq!(found_existing, existing.is_some());
+                if let Some(block_entry) = existing {
+                    // Update the cache block
+                    let old_blk = block_entry.cache_block.compare_exchange(
+                        INVALID_CACHE_BLOCK,
+                        cache_block,
+                        Ordering::Relaxed,
+                        Ordering::Relaxed,
+                    );
+                    assert!(old_blk == Ok(INVALID_CACHE_BLOCK) || old_blk == Err(cache_block));
+
+                    block_entry.lw_lsn.store(lw_lsn);
+
+                    block_entry.referenced.store(true, Ordering::Relaxed);
+
+                    let pin_count = block_entry.pinned.fetch_sub(1, Ordering::Relaxed);
+                    assert!(pin_count > 0);
+                    UpdateAction::Nothing
+                } else {
+                    UpdateAction::Insert(BlockEntry {
+                        lw_lsn: AtomicLsn::new(lw_lsn.0),
+                        cache_block: AtomicU64::new(cache_block),
+                        pinned: AtomicU64::new(0),
+                        referenced: AtomicBool::new(true),
+                    })
+                }
+            });
+
+            // FIXME: what to do if we run out of memory? Evict other relation entries? Remove
+            // block entries first?
+            res.expect("out of memory");
+        } else {
+            // !is_write
+            //
+            // We can assume that it doesn't already exist, because the
+            // caller is assumed to have already checked it, and holds
+            // the io-in-progress lock. (The BlockEntry might exist, but no cache block)
+
+            // Allocate a new block first
+            let cache_block = {
+                loop {
+                    if let Some(x) = file_cache.alloc_block() {
+                        break x;
+                    }
+                    if let Some(x) = self.try_evict_one_cache_block() {
+                        break x;
+                    }
+                }
+            };
+
+            // Write the page to the cache file
+            file_cache
+                .write_block(cache_block, src)
+                .await
+                .expect("error writing to cache");
+            // FIXME: handle errors gracefully.
+
+            let res = self.block_map.update_with_fn(&key, |existing| {
+                if let Some(block_entry) = existing {
+                    // FIXME: could there be concurrent readers?
+                    assert!(block_entry.pinned.load(Ordering::Relaxed) == 0);
+
+                    let old_cache_block = block_entry.cache_block.swap(cache_block, Ordering::Relaxed);
+                    if old_cache_block != INVALID_CACHE_BLOCK {
+                        panic!("remember_page called in !is_write mode, but page is already cached at blk {old_cache_block}");
+                    }
+                    UpdateAction::Nothing
+                } else {
+                    UpdateAction::Insert(BlockEntry {
+                        lw_lsn: AtomicLsn::new(lw_lsn.0),
+                        cache_block: AtomicU64::new(cache_block),
+                        pinned: AtomicU64::new(0),
+                        referenced: AtomicBool::new(true),
+                    })
+                }
+            });
+
+            // FIXME: what to do if we run out of memory? Evict other relation entries? Remove
+            // block entries first?
+            res.expect("out of memory");
+        }
+    }
+
+    /// Forget information about given relation in the cache. (For DROP TABLE and such)
+    pub fn forget_rel(&'t self, rel: &RelTag) {
+        tracing::info!("forgetting rel entry for {rel:?}");
+        self.relsize_cache.remove(&RelKey::from(rel));
+
+        // also forget all cached blocks for the relation
+        // FIXME
+        /*
+            let mut iter = MapIterator::new(&key_range_for_rel_blocks(rel));
+            let r = self.cache_tree.start_read();
+            while let Some((k, _v)) = iter.next(&r) {
+                let w = self.cache_tree.start_write();
+
+                let mut evicted_cache_block = None;
+
+                let res = w.update_with_fn(&k, |e| {
+                    if let Some(e) = e {
+                        let block_entry = if let MapEntry::Block(e) = e {
+                            e
+                        } else {
+                            panic!("unexpected map entry type for block key");
+                        };
+                        let cache_block = block_entry
+                            .cache_block
+                            .swap(INVALID_CACHE_BLOCK, Ordering::Relaxed);
+                        if cache_block != INVALID_CACHE_BLOCK {
+                            evicted_cache_block = Some(cache_block);
+                        }
+                        UpdateAction::Remove
+                    } else {
+                        UpdateAction::Nothing
+                    }
+                });
+
+                // FIXME: It's pretty surprising to run out of memory while removing. But
+                // maybe it can happen because of trying to shrink a node?
+                res.expect("out of memory");
+
+                if let Some(evicted_cache_block) = evicted_cache_block {
+                    self.file_cache
+                        .as_ref()
+                        .unwrap()
+                        .dealloc_block(evicted_cache_block);
+                }
+        }
+
+            */
+    }
+
+    // Maintenance routines
+
+    /// Evict one block from the file cache. This is used when the file cache fills up
+    /// Returns the evicted block. It's not put to the free list, so it's available for the
+    /// caller to use immediately.
+    pub fn try_evict_one_cache_block(&self) -> Option<CacheBlock> {
+        let mut clock_hand = self.clock_hand.lock().unwrap();
+        for _ in 0..100 {
+            self.clock_iterations_counter.inc();
+
+            (*clock_hand) += 1;
+
+            let mut evict_this = false;
+            let num_buckets = self.block_map.get_num_buckets();
+            match self
+                .block_map
+                .get_bucket((*clock_hand) % num_buckets)
+                .as_deref()
+            {
+                None => {
+                    // This bucket was unused
+                }
+                Some(blk_entry) => {
+                    if !blk_entry.referenced.swap(false, Ordering::Relaxed) {
+                        // Evict this. Maybe.
+                        evict_this = true;
+                    }
+                }
+            };
+
+            if evict_this {
+                // grab the write lock
+                let mut evicted_cache_block = None;
+                let res =
+                    self.block_map
+                        .update_with_fn_at_bucket(*clock_hand % num_buckets, |old| {
+                            match old {
+                                None => UpdateAction::Nothing,
+                                Some(old) => {
+                                    // note: all the accesses to 'pinned' currently happen
+                                    // within update_with_fn(), or while holding ValueReadGuard, which protects from concurrent
+                                    // updates. Otherwise, another thread could set the 'pinned'
+                                    // flag just after we have checked it here.
+                                    if old.pinned.load(Ordering::Relaxed) != 0 {
+                                        return UpdateAction::Nothing;
+                                    }
+
+                                    let _ = self
+                                        .global_lw_lsn
+                                        .fetch_max(old.lw_lsn.load().0, Ordering::Relaxed);
+                                    let cache_block = old
+                                        .cache_block
+                                        .swap(INVALID_CACHE_BLOCK, Ordering::Relaxed);
+                                    if cache_block != INVALID_CACHE_BLOCK {
+                                        evicted_cache_block = Some(cache_block);
+                                    }
+                                    UpdateAction::Remove
+                                }
+                            }
+                        });
+
+                // Out of memory should not happen here, as we're only updating existing values,
+                // not inserting new entries to the map.
+                res.expect("out of memory");
+
+                if evicted_cache_block.is_some() {
+                    self.page_evictions_counter.inc();
+                    return evicted_cache_block;
+                }
+            }
+        }
+        // Give up if we didn't find anything
+        None
+    }
+
+    pub fn resize_file_cache(&self, num_blocks: u32) {
+        let old_num_blocks = self.block_map.get_num_buckets() as u32;
+
+        if old_num_blocks < num_blocks {
+            if let Err(err) = self.block_map.grow(num_blocks) {
+                tracing::warn!(
+                    "could not grow file cache to {} blocks (old size {}): {}",
+                    num_blocks,
+                    old_num_blocks,
+                    err
+                );
+            }
+        }
+    }
+
+    pub fn dump_map(&self, _dst: &mut dyn std::io::Write) {
+        //FIXME self.cache_map.start_read().dump(dst);
+    }
+}
+
+impl metrics::core::Collector for IntegratedCacheWriteAccess<'_> {
+    fn desc(&self) -> Vec<&metrics::core::Desc> {
+        let mut descs = Vec::new();
+        descs.append(&mut self.page_evictions_counter.desc());
+        descs.append(&mut self.clock_iterations_counter.desc());
+
+        descs.append(&mut self.block_map_num_buckets.desc());
+        descs.append(&mut self.block_map_num_buckets_in_use.desc());
+
+        descs.append(&mut self.relsize_cache_num_buckets.desc());
+        descs.append(&mut self.relsize_cache_num_buckets_in_use.desc());
+
+        descs
+    }
+    fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
+        // Update gauges
+        self.block_map_num_buckets
+            .set(self.block_map.get_num_buckets() as i64);
+        self.block_map_num_buckets_in_use
+            .set(self.block_map.get_num_buckets_in_use() as i64);
+        self.relsize_cache_num_buckets
+            .set(self.relsize_cache.get_num_buckets() as i64);
+        self.relsize_cache_num_buckets_in_use
+            .set(self.relsize_cache.get_num_buckets_in_use() as i64);
+
+        let mut values = Vec::new();
+        values.append(&mut self.page_evictions_counter.collect());
+        values.append(&mut self.clock_iterations_counter.collect());
+
+        values.append(&mut self.block_map_num_buckets.collect());
+        values.append(&mut self.block_map_num_buckets_in_use.collect());
+
+        values.append(&mut self.relsize_cache_num_buckets.collect());
+        values.append(&mut self.relsize_cache_num_buckets_in_use.collect());
+
+        values
+    }
+}
+
+/// Look up the cached size of a relation in the relsize hash table.
+///
+/// Kept as a free function because both IntegratedCacheReadAccess::get_rel_size() and
+/// IntegratedCacheWriteAccess::get_rel_size() delegate to it.
+fn get_rel_size(
+    r: &neon_shmem::hash::HashMapAccess<RelKey, RelEntry>,
+    rel: &RelTag,
+) -> Option<u32> {
+    // A relation without an entry is simply not cached.
+    let cached = r
+        .get(&RelKey::from(rel))
+        .map(|rel_entry| rel_entry.nblocks.load(Ordering::Relaxed));
+
+    // u32::MAX marks a size that is 'not known', so it is reported as a miss as well.
+    match cached {
+        Some(nblocks) if nblocks != u32::MAX => Some(nblocks),
+        _ => None,
+    }
+}
+
+/// Accessor for other backends
+///
+/// This allows backends to read pages from the cache directly, on their own, without making a
+/// request to the communicator process.
+impl<'t> IntegratedCacheReadAccess<'t> {
+    pub fn get_rel_size(&'t self, rel: &RelTag) -> Option<u32> {
+        get_rel_size(&self.relsize_cache, rel)
+    }
+
+    pub fn start_read_op(&'t self) -> BackendCacheReadOp<'t> {
+        BackendCacheReadOp {
+            read_guards: Vec::new(),
+            map_access: self,
+        }
+    }
+
+    /// Check if the given page is present in the cache (an entry without a cache block is a miss)
+    pub fn cache_contains_page(&'t self, rel: &RelTag, block_number: u32) -> bool {
+        let entry = self.block_map.get(&BlockKey::from((rel, block_number)));
+        entry.is_some_and(|e| e.cache_block.load(Ordering::Relaxed) != INVALID_CACHE_BLOCK)
+    }
+}
+
+pub struct BackendCacheReadOp<'t> {
+    read_guards: Vec<DeferredUnpin>,
+    map_access: &'t IntegratedCacheReadAccess<'t>,
+}
+
+impl<'e> BackendCacheReadOp<'e> {
+    /// Initiate a read of the page from the cache.
+    ///
+    /// This returns the "cache block number", i.e. the block number within the cache file, where
+    /// the page's contents are stored, or None if the page has no cache block. To get the page
+    /// contents, the caller needs to read that block from the cache file. The entry's pin is kept
+    /// in this BackendCacheReadOp, so hold on to it until the read is done.
+    /// After you have completed the read, call BackendCacheReadOp::finish() to check if the
+    /// read was in fact valid or not. If it was concurrently invalidated, you need to retry.
+    pub fn get_page(&mut self, rel: &RelTag, block_number: u32) -> Option<u64> {
+        if let Some(block_entry) = self
+            .map_access
+            .block_map
+            .get(&BlockKey::from((rel, block_number)))
+        {
+            block_entry.referenced.store(true, Ordering::Relaxed);
+
+            let cache_block = block_entry.cache_block.load(Ordering::Relaxed);
+            if cache_block != INVALID_CACHE_BLOCK {
+                block_entry.pinned.fetch_add(1, Ordering::Relaxed);
+                self.read_guards
+                    .push(DeferredUnpin(block_entry.pinned.as_ptr()));
+                Some(cache_block)
+            } else {
+                None
+            }
+        } else {
+            None
+        }
+    }
+
+    pub fn finish(self) -> bool {
+        // TODO: currently, we hold a pin on the in-memory map, so concurrent invalidations are not
+        // possible. But if we switch to optimistic locking, this would return 'false' if the
+        // optimistic locking failed and you need to retry.
+        true
+    }
+}
+
+/// A hack to decrement an AtomicU64 on drop. This is used to decrement the pin count
+/// of a BlockEntry. The safety depends on the fact that the BlockEntry is not evicted
+/// or moved while it's pinned.
+struct DeferredUnpin(*mut u64);
+
+unsafe impl Sync for DeferredUnpin {}
+unsafe impl Send for DeferredUnpin {}
+
+impl Drop for DeferredUnpin {
+    fn drop(&mut self) {
+        // unpin it. Sound only while the pinned BlockEntry stays in place (see the type's doc).
+        unsafe {
+            let pin_ref = AtomicU64::from_ptr(self.0);
+            pin_ref.fetch_sub(1, Ordering::Relaxed);
+        }
+    }
+}
--- a/pgxn/neon/communicator/src/lib.rs
+++ b/pgxn/neon/communicator/src/lib.rs
@@ -0,0 +1,27 @@
+//!
+//! Three main parts:
+//! - async tokio communicator core, which receives requests and processes them.
+//! - Main loop and request queues, which route requests from backends to the core
+//! - the per-backend glue code, which submits requests
+//!
+
+mod backend_comms;
+
+// mark this 'pub', because these functions are called from C code. Otherwise, the compiler
+// complains about a bunch of structs and enum variants being unused, because it thinks
+// the functions that use them are never called. There are some C-callable functions in
+// other modules too, but marking this as pub is currently enough to silence the warnings
+//
+// TODO: perhaps collect *all* the extern "C" functions to one module?
+pub mod backend_interface;
+
+mod file_cache;
+mod init;
+mod integrated_cache;
+mod neon_request;
+mod worker_process;
+
+mod global_allocator;
+
+// FIXME get this from postgres headers somehow
+pub const BLCKSZ: usize = 8192;
--- a/pgxn/neon/communicator/src/neon_request.rs
+++ b/pgxn/neon/communicator/src/neon_request.rs
@@ -0,0 +1,347 @@
+pub type CLsn = u64;
+pub type COid = u32;
+
+// This conveniently matches PG_IOV_MAX
+pub const MAX_GETPAGEV_PAGES: usize = 32;
+
+use pageserver_page_api as page_api;
+
+#[allow(clippy::large_enum_variant)]
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub enum NeonIORequest {
+    Empty,
+
+    // Read requests. These are C-friendly variants of the corresponding structs in
+    // pageserver_page_api.
+    RelExists(CRelExistsRequest),
+    RelSize(CRelSizeRequest),
+    GetPageV(CGetPageVRequest),
+    PrefetchV(CPrefetchVRequest),
+    DbSize(CDbSizeRequest),
+
+    // Write requests. These are needed to keep the relation size cache and LFC up-to-date.
+    // They are not sent to the pageserver.
+    WritePage(CWritePageRequest),
+    RelExtend(CRelExtendRequest),
+    RelZeroExtend(CRelZeroExtendRequest),
+    RelCreate(CRelCreateRequest),
+    RelTruncate(CRelTruncateRequest),
+    RelUnlink(CRelUnlinkRequest),
+}
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub enum NeonIOResult {
+    Empty,
+    RelExists(bool),
+    RelSize(u32),
+
+    /// the result pages are written to the shared memory addresses given in the request
+    GetPageV,
+
+    /// A prefetch request returns as soon as the request has been received by the communicator.
+    /// It is processed in the background.
+    PrefetchVLaunched,
+
+    DbSize(u64),
+
+    // FIXME design compact error codes. Can't easily pass a string or other dynamic data.
+    // currently, this is 'errno'
+    Error(i32),
+
+    Aborted,
+
+    /// used for all write requests
+    WriteOK,
+}
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CCachedGetPageVResult {
+    pub cache_block_numbers: [u64; MAX_GETPAGEV_PAGES],
+}
+
+/// ShmemBuf represents a buffer in shared memory.
+///
+/// SAFETY: The pointer must point to an area in shared memory. The functions allow you to liberally
+/// get a mutable pointer to the contents; it is the caller's responsibility to ensure that you
+/// don't access a buffer that you're not allowed to. Inappropriate access to the buffer doesn't
+/// violate Rust's safety semantics, but it will mess up and crash Postgres.
+///
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct ShmemBuf {
+    // Start of the buffer; the IoBuf impls treat it as BLCKSZ bytes. Must point into shared memory!
+    pub ptr: *mut u8,
+}
+
+unsafe impl Send for ShmemBuf {}
+unsafe impl Sync for ShmemBuf {}
+
+unsafe impl uring_common::buf::IoBuf for ShmemBuf {
+    fn stable_ptr(&self) -> *const u8 {
+        self.ptr
+    }
+
+    fn bytes_init(&self) -> usize {
+        crate::BLCKSZ
+    }
+
+    fn bytes_total(&self) -> usize {
+        crate::BLCKSZ
+    }
+}
+
+unsafe impl uring_common::buf::IoBufMut for ShmemBuf {
+    fn stable_mut_ptr(&mut self) -> *mut u8 {
+        self.ptr
+    }
+
+    unsafe fn set_init(&mut self, pos: usize) {
+        if pos > crate::BLCKSZ {
+            panic!(
+                "set_init called past end of buffer, pos {}, buffer size {}",
+                pos,
+                crate::BLCKSZ
+            );
+        }
+    }
+}
+
+impl ShmemBuf {
+    pub fn as_mut_ptr(&self) -> *mut u8 {
+        self.ptr
+    }
+}
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CRelExistsRequest {
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+}
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CRelSizeRequest {
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+}
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CGetPageVRequest {
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+    pub block_number: u32,
+    pub nblocks: u8,
+
+    // These fields define where the result is written. Must point into a buffer in shared memory!
+    pub dest: [ShmemBuf; MAX_GETPAGEV_PAGES],
+}
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CPrefetchVRequest {
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+    pub block_number: u32,
+    pub nblocks: u8,
+}
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CDbSizeRequest {
+    pub db_oid: COid,
+    pub request_lsn: CLsn,
+}
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CWritePageRequest {
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+    pub block_number: u32,
+    pub lsn: CLsn,
+
+    // This field holds the page contents to write. Must point into a buffer in shared memory!
+    pub src: ShmemBuf,
+}
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CRelExtendRequest {
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+    pub block_number: u32,
+    pub lsn: CLsn,
+
+    // These fields define page contents. Must point into a buffer in shared memory!
+    pub src_ptr: usize,
+    pub src_size: u32,
+}
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CRelZeroExtendRequest {
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+    pub block_number: u32,
+    pub nblocks: u32,
+    pub lsn: CLsn,
+}
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CRelCreateRequest {
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+}
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CRelTruncateRequest {
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+    pub nblocks: u32,
+}
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CRelUnlinkRequest {
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+    pub block_number: u32,
+    pub nblocks: u32,
+}
+
+impl CRelExistsRequest {
+    pub fn reltag(&self) -> page_api::RelTag {
+        page_api::RelTag {
+            spcnode: self.spc_oid,
+            dbnode: self.db_oid,
+            relnode: self.rel_number,
+            forknum: self.fork_number,
+        }
+    }
+}
+
+impl CRelSizeRequest {
+    pub fn reltag(&self) -> page_api::RelTag {
+        page_api::RelTag {
+            spcnode: self.spc_oid,
+            dbnode: self.db_oid,
+            relnode: self.rel_number,
+            forknum: self.fork_number,
+        }
+    }
+}
+
+impl CGetPageVRequest {
+    pub fn reltag(&self) -> page_api::RelTag {
+        page_api::RelTag {
+            spcnode: self.spc_oid,
+            dbnode: self.db_oid,
+            relnode: self.rel_number,
+            forknum: self.fork_number,
+        }
+    }
+}
+
+impl CPrefetchVRequest {
+    pub fn reltag(&self) -> page_api::RelTag {
+        page_api::RelTag {
+            spcnode: self.spc_oid,
+            dbnode: self.db_oid,
+            relnode: self.rel_number,
+            forknum: self.fork_number,
+        }
+    }
+}
+
+impl CWritePageRequest {
+    pub fn reltag(&self) -> page_api::RelTag {
+        page_api::RelTag {
+            spcnode: self.spc_oid,
+            dbnode: self.db_oid,
+            relnode: self.rel_number,
+            forknum: self.fork_number,
+        }
+    }
+}
+
+impl CRelExtendRequest {
+    pub fn reltag(&self) -> page_api::RelTag {
+        page_api::RelTag {
+            spcnode: self.spc_oid,
+            dbnode: self.db_oid,
+            relnode: self.rel_number,
+            forknum: self.fork_number,
+        }
+    }
+}
+
+impl CRelZeroExtendRequest {
+    pub fn reltag(&self) -> page_api::RelTag {
+        page_api::RelTag {
+            spcnode: self.spc_oid,
+            dbnode: self.db_oid,
+            relnode: self.rel_number,
+            forknum: self.fork_number,
+        }
+    }
+}
+
+impl CRelCreateRequest {
+    pub fn reltag(&self) -> page_api::RelTag {
+        page_api::RelTag {
+            spcnode: self.spc_oid,
+            dbnode: self.db_oid,
+            relnode: self.rel_number,
+            forknum: self.fork_number,
+        }
+    }
+}
+
+impl CRelTruncateRequest {
+    pub fn reltag(&self) -> page_api::RelTag {
+        page_api::RelTag {
+            spcnode: self.spc_oid,
+            dbnode: self.db_oid,
+            relnode: self.rel_number,
+            forknum: self.fork_number,
+        }
+    }
+}
+
+impl CRelUnlinkRequest {
+    pub fn reltag(&self) -> page_api::RelTag {
+        page_api::RelTag {
+            spcnode: self.spc_oid,
+            dbnode: self.db_oid,
+            relnode: self.rel_number,
+            forknum: self.fork_number,
+        }
+    }
+}
--- a/pgxn/neon/communicator/src/worker_process/callbacks.rs
+++ b/pgxn/neon/communicator/src/worker_process/callbacks.rs
@@ -0,0 +1,28 @@
+//! C callbacks to PostgreSQL facilities that the neon extension needs
+//! to provide. These are implemented in `pgxn/neon/communicator_new.c`.
+//! The function signatures had better match!
+//!
+//! These are called from the communicator threads! Careful what you do, most
+//! Postgres functions are not safe to call in that context.
+
+use utils::lsn::Lsn;
+
+// Raw declarations of the callbacks implemented on the C side. Nothing here
+// checks these signatures against the C definitions, so the two must be kept
+// in sync by hand. Use the safe wrappers below rather than calling these
+// directly.
+unsafe extern "C" {
+    /// Notifies the process with the given proc number. The worker main loop
+    /// uses this to wake up the backend that owns a request once the request
+    /// has completed.
+    pub fn notify_proc_unsafe(procno: std::ffi::c_int);
+    /// Sets the calling process's latch. The logging glue calls this after
+    /// queueing a message so that the main thread wakes up and drains it.
+    pub fn callback_set_my_latch_unsafe();
+    /// Returns the current request LSN as a raw 64-bit value.
+    pub fn callback_get_request_lsn_unsafe() -> u64;
+}
+
+// safe wrappers
+
+/// Safe wrapper around `notify_proc_unsafe`: passes `procno` straight through
+/// to the C callback.
+pub(super) fn notify_proc(procno: std::ffi::c_int) {
+    // NOTE(review): soundness relies on the C implementation being callable
+    // from the communicator threads, as the module documentation demands.
+    unsafe {
+        notify_proc_unsafe(procno);
+    }
+}
+
+/// Safe wrapper around `callback_set_my_latch_unsafe`.
+pub(super) fn callback_set_my_latch() {
+    // NOTE(review): soundness relies on the C implementation being callable
+    // from the communicator threads, as the module documentation demands.
+    unsafe {
+        callback_set_my_latch_unsafe();
+    }
+}
+
+/// Safe wrapper around `callback_get_request_lsn_unsafe`: fetches the raw
+/// 64-bit value from the C side and wraps it in an `Lsn`.
+pub(super) fn get_request_lsn() -> Lsn {
+    // NOTE(review): soundness relies on the C implementation being callable
+    // from the communicator threads, as the module documentation demands.
+    let raw_lsn = unsafe { callback_get_request_lsn_unsafe() };
+    Lsn(raw_lsn)
+}
--- a/pgxn/neon/communicator/src/worker_process/in_progress_ios.rs
+++ b/pgxn/neon/communicator/src/worker_process/in_progress_ios.rs
@@ -0,0 +1,84 @@
+use std::cmp::Eq;
+use std::hash::Hash;
+use std::sync::Arc;
+
+use tokio::sync::{Mutex, OwnedMutexGuard};
+
+use clashmap::ClashMap;
+use clashmap::Entry;
+
+use pageserver_page_api::RelTag;
+
+/// Identifies the object that an in-progress request operates on. Used as the
+/// key type of `RequestInProgressTable`.
+#[derive(Clone, Eq, Hash, PartialEq)]
+pub enum RequestInProgressKey {
+    /// A whole database; the request handlers pass the request's `db_oid`.
+    Db(u32),
+    /// A relation, identified by its `RelTag`.
+    Rel(RelTag),
+    /// One block of a relation: the relation's `RelTag` plus the block number.
+    Block(RelTag, u32),
+}
+
+/// Table of per-key locks, keyed by `RequestInProgressKey`, that request
+/// handlers take while they work on a database, relation or block.
+pub type RequestInProgressTable = MutexHashSet<RequestInProgressKey>;
+
+// The generic locking primitive that `RequestInProgressTable` is built on:
+
+/// A set of keys that can each be locked by one holder at a time.
+///
+/// A key is present in `lock_table` exactly while some `MutexHashSetGuard` for
+/// it is alive; the entry is removed again when that guard is dropped.
+pub struct MutexHashSet<K>
+where
+    K: Clone + Eq + Hash,
+{
+    // Maps each currently locked key to the mutex owned by its holder. Waiters
+    // block on that mutex until the holder releases it.
+    lock_table: ClashMap<K, Arc<Mutex<()>>>,
+}
+
+/// Proof that `key` is locked in a `MutexHashSet`. Dropping the guard removes
+/// the key from the set and releases the mutex that waiters are blocked on.
+pub struct MutexHashSetGuard<'a, K>
+where
+    K: Clone + Eq + Hash,
+{
+    /// The key this guard holds locked.
+    pub key: K,
+    // The set the key was locked in; needed to remove the entry on drop.
+    set: &'a MutexHashSet<K>,
+    // The mutex registered in the set's table for `key`; kept so that drop can
+    // verify that the entry it removes is the one this guard inserted.
+    mutex: Arc<Mutex<()>>,
+    // Keeps `mutex` locked for as long as the guard is alive.
+    _guard: OwnedMutexGuard<()>,
+}
+
+impl<'a, K> Drop for MutexHashSetGuard<'a, K>
+where
+    K: Clone + Eq + Hash,
+{
+    /// Unregisters the key from the owning set.
+    ///
+    /// Panics if the key is no longer in the table, or if the table entry is
+    /// not the mutex this guard registered.
+    fn drop(&mut self) {
+        let removed = self.set.lock_table.remove(&self.key);
+        let (_key, registered_mutex) = removed.unwrap();
+        assert!(Arc::ptr_eq(&registered_mutex, &self.mutex));
+
+        // `_guard` is released when the struct's fields are dropped right
+        // after this function returns; that wakes up any waiters.
+    }
+}
+
+impl<K> MutexHashSet<K>
+where
+    K: Clone + Eq + Hash,
+{
+    /// Creates an empty set with no keys locked.
+    pub fn new() -> MutexHashSet<K> {
+        let lock_table = ClashMap::new();
+        MutexHashSet { lock_table }
+    }
+
+    /// Locks `key`, waiting for as long as another guard for the same key is
+    /// alive. The key stays locked until the returned guard is dropped.
+    pub async fn lock<'a>(&'a self, key: K) -> MutexHashSetGuard<'a, K> {
+        // Prepare our own, already locked mutex up front. Once it is in the
+        // table, other tasks wanting the same key will block on it.
+        let own_mutex = Arc::new(Mutex::new(()));
+        let own_guard = Arc::clone(&own_mutex).lock_owned().await;
+
+        loop {
+            // The table entry is only held for the duration of this statement,
+            // never across the await below.
+            let current_holder = match self.lock_table.entry(key.clone()) {
+                Entry::Vacant(slot) => {
+                    // Nobody holds the key: register our mutex and we're done.
+                    slot.insert(Arc::clone(&own_mutex));
+                    break;
+                }
+                Entry::Occupied(slot) => Arc::clone(slot.get()),
+            };
+            // Wait until the current holder lets go, then try again. The lock
+            // we get here is released immediately; it only serves as a wakeup.
+            let _ = current_holder.lock().await;
+        }
+
+        MutexHashSetGuard {
+            key,
+            set: self,
+            mutex: own_mutex,
+            _guard: own_guard,
+        }
+    }
+}
--- a/pgxn/neon/communicator/src/worker_process/logging.rs
+++ b/pgxn/neon/communicator/src/worker_process/logging.rs
@@ -0,0 +1,229 @@
+//! Glue code to hook up Rust logging, with the `tracing` crate, to the PostgreSQL log
+//!
+//! In the Rust threads, the log messages are written to an mpsc channel, and the Postgres
+//! process latch is raised. That wakes up the loop in the main thread. It reads the
+//! message from the channel and ereport()s it. This ensures that only one thread, the main
+//! thread, calls the PostgreSQL logging routines at any time.
+
+use std::sync::mpsc::sync_channel;
+use std::sync::mpsc::{Receiver, SyncSender};
+use std::sync::mpsc::{TryRecvError, TrySendError};
+
+use tracing::info;
+use tracing::{Event, Level, Metadata, Subscriber};
+use tracing_subscriber::filter::LevelFilter;
+use tracing_subscriber::fmt::FmtContext;
+use tracing_subscriber::fmt::FormatEvent;
+use tracing_subscriber::fmt::FormatFields;
+use tracing_subscriber::fmt::FormattedFields;
+use tracing_subscriber::fmt::MakeWriter;
+use tracing_subscriber::fmt::format::Writer;
+use tracing_subscriber::registry::LookupSpan;
+
+use crate::worker_process::callbacks::callback_set_my_latch;
+
+/// State handed to the C code by `configure_logging` and passed back into
+/// `pump_logging`.
+pub struct LoggingState {
+    // Receiving end of the queue that the Rust threads push formatted log
+    // messages into.
+    receiver: Receiver<FormattedEventWithMeta>,
+}
+
+/// Installs the global `tracing` subscriber that forwards log events to the
+/// queue drained by `pump_logging`.
+///
+/// Called once, at worker process startup. The returned LoggingState is passed back
+/// in the subsequent calls to `pump_logging`. It is opaque to the C code.
+#[unsafe(no_mangle)]
+pub extern "C" fn configure_logging() -> Box<LoggingState> {
+    use tracing_subscriber::prelude::*;
+
+    // Bounded queue between the logging threads and the main thread.
+    let (sender, receiver) = sync_channel(1000);
+
+    // TODO: derive this from log_min_messages?
+    let level_filter = LevelFilter::from_level(Level::INFO);
+
+    let queue_layer = tracing_subscriber::fmt::layer()
+        .event_format(SimpleFormatter::new())
+        .with_writer(Maker { channel: sender })
+        .with_filter(level_filter);
+
+    tracing_subscriber::registry().with(queue_layer).init();
+
+    info!("communicator process logging started");
+
+    Box::new(LoggingState { receiver })
+}
+
+/// Read one message from the logging queue. This is essentially a wrapper to Receiver,
+/// with a C-friendly signature.
+///
+/// The message is copied into *errbuf, which is a caller-supplied buffer of size `errbuf_len`.
+/// If the message doesn't fit in the buffer, it is truncated. It is always NUL-terminated,
+/// provided the buffer has room for at least the terminator; if `errbuf` is NULL or
+/// `errbuf_len` is 0, nothing is written to the buffer.
+///
+/// The error level is returned in *elevel_p. It's one of the PostgreSQL error levels, see elog.h
+///
+/// Returns 1 if a message was read, 0 if the queue is currently empty, and -1 if the
+/// sending side of the queue has been disconnected. `*errbuf` and `*elevel_p` are only
+/// written when 1 is returned.
+#[unsafe(no_mangle)]
+pub extern "C" fn pump_logging(
+    state: &mut LoggingState,
+    errbuf: *mut u8,
+    errbuf_len: u32,
+    elevel_p: &mut i32,
+) -> i32 {
+    let msg = match state.receiver.try_recv() {
+        Err(TryRecvError::Empty) => return 0,
+        Err(TryRecvError::Disconnected) => return -1,
+        Ok(msg) => msg,
+    };
+
+    // One byte of the buffer is reserved for the terminator. `checked_sub`
+    // guards against a zero-length buffer: a plain `errbuf_len - 1` would
+    // underflow and let the copy below run past the end of the buffer.
+    if !errbuf.is_null() {
+        if let Some(capacity) = (errbuf_len as usize).checked_sub(1) {
+            let src: &[u8] = &msg.message;
+            let len = std::cmp::min(src.len(), capacity);
+            unsafe {
+                std::ptr::copy_nonoverlapping(src.as_ptr(), errbuf, len);
+                *(errbuf.add(len)) = b'\0'; // NUL terminator
+            }
+        }
+    }
+
+    // XXX: these levels are copied from PostgreSQL's elog.h. Introduce another enum
+    // to hide these?
+    *elevel_p = match msg.level {
+        Level::TRACE => 10, // DEBUG5
+        Level::DEBUG => 14, // DEBUG1
+        Level::INFO => 17,  // INFO
+        Level::WARN => 19,  // WARNING
+        Level::ERROR => 21, // ERROR
+    };
+    1
+}
+
+//---- The following functions can be called from any thread ----
+
+/// One log event as it travels through the queue to the main thread.
+#[derive(Clone)]
+struct FormattedEventWithMeta {
+    // The formatted message bytes, as produced by the event formatter.
+    message: Vec<u8>,
+    // Level of the event; `pump_logging` maps it to a PostgreSQL error level.
+    level: tracing::Level,
+}
+
+impl Default for FormattedEventWithMeta {
+    /// An event with an empty message at DEBUG level.
+    fn default() -> Self {
+        Self {
+            level: tracing::Level::DEBUG,
+            message: Vec::new(),
+        }
+    }
+}
+
+/// Writer handed to the `fmt` layer for a single event. It accumulates the
+/// bytes written to it in `event.message`, and passes the event on to `maker`
+/// when it is flushed or dropped.
+struct EventBuilder<'a> {
+    // The event being assembled: message bytes so far, plus the level.
+    event: FormattedEventWithMeta,
+
+    // The `Maker` that created this writer and owns the sending end of the queue.
+    maker: &'a Maker,
+}
+
+impl std::io::Write for EventBuilder<'_> {
+    /// Appends `buf` to the message being assembled. Never fails.
+    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+        self.event.message.extend_from_slice(buf);
+        Ok(buf.len())
+    }
+
+    /// Sends whatever has been written so far to the queue.
+    fn flush(&mut self) -> std::io::Result<()> {
+        self.send_pending();
+        Ok(())
+    }
+}
+
+impl EventBuilder<'_> {
+    /// Hands the buffered message, if any, to the maker and leaves the buffer
+    /// empty. Draining the buffer makes it safe to call this from both
+    /// `flush` and `drop`: a message that was already flushed is not sent a
+    /// second time when the writer is dropped.
+    fn send_pending(&mut self) {
+        if self.event.message.is_empty() {
+            return;
+        }
+        let message = std::mem::take(&mut self.event.message);
+        let level = self.event.level;
+        self.maker.send_event(FormattedEventWithMeta { message, level });
+    }
+}
+
+impl Drop for EventBuilder<'_> {
+    /// Sends the message if it has not been flushed already.
+    fn drop(&mut self) {
+        self.send_pending();
+    }
+}
+
+/// `MakeWriter` implementation that creates one `EventBuilder` per log event
+/// and pushes the finished events into the queue.
+struct Maker {
+    // Sending end of the bounded queue that `pump_logging` reads from.
+    channel: SyncSender<FormattedEventWithMeta>,
+}
+
+impl<'a> MakeWriter<'a> for Maker {
+    type Writer = EventBuilder<'a>;
+
+    /// Not supported: a writer needs the event's metadata to record its level.
+    fn make_writer(&'a self) -> Self::Writer {
+        panic!("not expected to be called when make_writer_for is implemented");
+    }
+
+    /// Creates a writer with an empty message, tagged with the level taken
+    /// from `meta`.
+    fn make_writer_for(&'a self, meta: &Metadata<'_>) -> Self::Writer {
+        let level = *meta.level();
+        let event = FormattedEventWithMeta {
+            message: Vec::new(),
+            level,
+        };
+        EventBuilder { event, maker: self }
+    }
+}
+
+impl Maker {
+    /// Tries to queue `e` for the main thread without blocking, and raises
+    /// the process latch if the event was queued.
+    fn send_event(&self, e: FormattedEventWithMeta) {
+        if self.channel.try_send(e).is_ok() {
+            // notify the main thread
+            callback_set_my_latch();
+        }
+        // Otherwise the event is silently dropped, both when the receiver is
+        // gone and when the queue is full.
+        // TODO: record that some messages were lost
+    }
+}
+
+/// Simple formatter implementation for tracing_subscriber, which prints the log
+/// spans and message part like the default formatter, but no timestamp or error
+/// level. The error level is captured separately by `FormattedEventWithMeta`,
+/// and when the error is printed by the main thread, with PostgreSQL ereport(),
+/// it gets a timestamp at that point. (The timestamp printed will therefore lag
+/// behind the timestamp on the event here, if the main thread doesn't process
+/// the log message promptly.)
+struct SimpleFormatter;
+
+impl<S, N> FormatEvent<S, N> for SimpleFormatter
+where
+    S: Subscriber + for<'a> LookupSpan<'a>,
+    N: for<'a> FormatFields<'a> + 'static,
+{
+    /// Writes the event as `span{fields}: span{fields}: <event fields>`
+    /// followed by a newline, with spans listed from the root down. The
+    /// `{fields}` part is left out for spans without fields.
+    fn format_event(
+        &self,
+        ctx: &FmtContext<'_, S, N>,
+        mut writer: Writer<'_>,
+        event: &Event<'_>,
+    ) -> std::fmt::Result {
+        // Walk the spans enclosing the event, outermost first. If the event
+        // has no span scope, there is nothing to iterate over.
+        let enclosing_spans = ctx
+            .event_scope()
+            .into_iter()
+            .flat_map(|scope| scope.from_root());
+        for span in enclosing_spans {
+            write!(writer, "{}", span.name())?;
+
+            // The `fmt` layer stores a pre-formatted representation of the
+            // span's fields in the span's extensions when the span is
+            // created, using the same field formatter as `ctx`.
+            let extensions = span.extensions();
+            let fields = &extensions
+                .get::<FormattedFields<N>>()
+                .expect("will never be `None`");
+
+            // Only print the braces if the span actually has fields.
+            if !fields.is_empty() {
+                write!(writer, "{{{fields}}}")?;
+            }
+            write!(writer, ": ")?;
+        }
+
+        // Then the fields of the event itself, which include the message.
+        ctx.field_format().format_fields(writer.by_ref(), event)?;
+
+        writeln!(writer)
+    }
+}
+
+impl SimpleFormatter {
+    /// Creates the formatter; it carries no state.
+    fn new() -> Self {
+        Self
+    }
+}
--- a/pgxn/neon/communicator/src/worker_process/main_loop.rs
+++ b/pgxn/neon/communicator/src/worker_process/main_loop.rs
@@ -0,0 +1,620 @@
+use std::collections::HashMap;
+use std::os::fd::AsRawFd;
+use std::os::fd::OwnedFd;
+use std::path::PathBuf;
+use std::sync::atomic::{AtomicU64, Ordering};
+
+use crate::backend_comms::NeonIOHandle;
+use crate::file_cache::FileCache;
+use crate::global_allocator::MyAllocatorCollector;
+use crate::init::CommunicatorInitStruct;
+use crate::integrated_cache::{CacheResult, IntegratedCacheWriteAccess};
+use crate::neon_request::{CGetPageVRequest, CPrefetchVRequest};
+use crate::neon_request::{NeonIORequest, NeonIOResult};
+use crate::worker_process::in_progress_ios::{RequestInProgressKey, RequestInProgressTable};
+use pageserver_client_grpc::request_tracker::ShardedRequestTracker;
+use pageserver_page_api as page_api;
+
+use metrics::{IntCounter, IntCounterVec};
+
+use tokio::io::AsyncReadExt;
+use tokio_pipe::PipeRead;
+use uring_common::buf::IoBuf;
+
+use super::callbacks::{get_request_lsn, notify_proc};
+
+use tracing::{error, info, trace};
+
+use utils::lsn::Lsn;
+
+/// State of the communicator worker process: the request slots shared with
+/// the backends, the pageserver client, the cache, and request metrics.
+pub struct CommunicatorWorkerProcessStruct<'a> {
+    // Request slots. A backend announces a request by writing the slot's
+    // index to the submission pipe.
+    neon_request_slots: &'a [NeonIOHandle],
+
+    // Client used to send requests to the pageserver when the cache cannot
+    // answer them.
+    request_tracker: ShardedRequestTracker,
+
+    // Cache that is consulted before a request is sent to the pageserver.
+    pub(crate) cache: IntegratedCacheWriteAccess<'a>,
+
+    // Read end of the pipe on which the main loop receives 4-byte slot indexes.
+    submission_pipe_read_fd: OwnedFd,
+
+    // Source of the `request_id` values put into get-page requests.
+    next_request_id: AtomicU64,
+
+    // Per-database/relation/block locks held while a request is processed.
+    in_progress_table: RequestInProgressTable,
+
+    // Metrics
+    //
+    // Number of requests received, by request kind. The individual counters
+    // below are the per-label children of `request_counters`.
+    request_counters: IntCounterVec,
+    request_rel_exists_counter: IntCounter,
+    request_rel_size_counter: IntCounter,
+    request_get_pagev_counter: IntCounter,
+    request_prefetchv_counter: IntCounter,
+    request_db_size_counter: IntCounter,
+    request_write_page_counter: IntCounter,
+    request_rel_extend_counter: IntCounter,
+    request_rel_zero_extend_counter: IntCounter,
+    request_rel_create_counter: IntCounter,
+    request_rel_truncate_counter: IntCounter,
+    request_rel_unlink_counter: IntCounter,
+
+    // Number of blocks in get_pagev requests that were not found / were found
+    // in the cache.
+    getpage_cache_misses_counter: IntCounter,
+    getpage_cache_hits_counter: IntCounter,
+
+    // Number of blocks covered by multi-block requests, by request kind. The
+    // individual counters below are the per-label children of
+    // `request_nblocks_counters`.
+    request_nblocks_counters: IntCounterVec,
+    request_get_pagev_nblocks_counter: IntCounter,
+    request_prefetchv_nblocks_counter: IntCounter,
+    request_rel_zero_extend_nblocks_counter: IntCounter,
+
+    // NOTE(review): presumably exposes statistics of the global allocator;
+    // it is only constructed, not otherwise used, in the code visible here.
+    allocator_metrics: MyAllocatorCollector,
+}
+
+/// Builds the worker process state.
+///
+/// Creates the file cache, initializes the integrated cache with the current
+/// request LSN, creates the pageserver request tracker and feeds it the shard
+/// map, and creates all the metrics counters.
+///
+/// Panics if the cache file cannot be created.
+pub(super) async fn init(
+    cis: Box<CommunicatorInitStruct>,
+    tenant_id: String,
+    timeline_id: String,
+    auth_token: Option<String>,
+    shard_map: HashMap<utils::shard::ShardIndex, String>,
+    initial_file_cache_size: u64,
+    file_cache_path: Option<PathBuf>,
+) -> CommunicatorWorkerProcessStruct<'static> {
+    info!("Test log message");
+    let last_lsn = get_request_lsn();
+
+    let file_cache = match file_cache_path {
+        Some(path) => FileCache::new(&path, initial_file_cache_size),
+        // FIXME: temporarily for testing, use LFC even if disabled
+        None => FileCache::new(&PathBuf::from("new_filecache"), 1000),
+    }
+    .expect("could not create cache file");
+
+    // Initialize subsystems
+    let cache = cis
+        .integrated_cache_init_struct
+        .worker_process_init(last_lsn, Some(file_cache));
+
+    let request_tracker = ShardedRequestTracker::new();
+    request_tracker
+        .update_shard_map(
+            shard_map,
+            None,
+            tenant_id,
+            timeline_id,
+            auth_token.as_deref(),
+        )
+        .await;
+
+    // One counter per request kind, all children of the same counter vector.
+    let request_counters = IntCounterVec::new(
+        metrics::core::Opts::new(
+            "backend_requests_total",
+            "Number of requests from backends.",
+        ),
+        &["request_kind"],
+    )
+    .unwrap();
+    let requests_of_kind = |kind: &str| request_counters.with_label_values(&[kind]);
+    let request_rel_exists_counter = requests_of_kind("rel_exists");
+    let request_rel_size_counter = requests_of_kind("rel_size");
+    let request_get_pagev_counter = requests_of_kind("get_pagev");
+    let request_prefetchv_counter = requests_of_kind("prefetchv");
+    let request_db_size_counter = requests_of_kind("db_size");
+    let request_write_page_counter = requests_of_kind("write_page");
+    let request_rel_extend_counter = requests_of_kind("rel_extend");
+    let request_rel_zero_extend_counter = requests_of_kind("rel_zero_extend");
+    let request_rel_create_counter = requests_of_kind("rel_create");
+    let request_rel_truncate_counter = requests_of_kind("rel_truncate");
+    let request_rel_unlink_counter = requests_of_kind("rel_unlink");
+
+    let getpage_cache_misses_counter = IntCounter::new(
+        "getpage_cache_misses",
+        "Number of file cache misses in get_pagev requests.",
+    )
+    .unwrap();
+    let getpage_cache_hits_counter = IntCounter::new(
+        "getpage_cache_hits",
+        "Number of file cache hits in get_pagev requests.",
+    )
+    .unwrap();
+
+    // For the requests that affect multiple blocks, have separate counters for the # of blocks affected
+    let request_nblocks_counters = IntCounterVec::new(
+        metrics::core::Opts::new(
+            "request_nblocks_total",
+            "Number of blocks in backend requests.",
+        ),
+        &["request_kind"],
+    )
+    .unwrap();
+    let nblocks_of_kind = |kind: &str| request_nblocks_counters.with_label_values(&[kind]);
+    let request_get_pagev_nblocks_counter = nblocks_of_kind("get_pagev");
+    let request_prefetchv_nblocks_counter = nblocks_of_kind("prefetchv");
+    let request_rel_zero_extend_nblocks_counter = nblocks_of_kind("rel_zero_extend");
+
+    CommunicatorWorkerProcessStruct {
+        neon_request_slots: cis.neon_request_slots,
+        request_tracker,
+        cache,
+        submission_pipe_read_fd: cis.submission_pipe_read_fd,
+        next_request_id: AtomicU64::new(1),
+        in_progress_table: RequestInProgressTable::new(),
+
+        // metrics
+        request_counters,
+        request_rel_exists_counter,
+        request_rel_size_counter,
+        request_get_pagev_counter,
+        request_prefetchv_counter,
+        request_db_size_counter,
+        request_write_page_counter,
+        request_rel_extend_counter,
+        request_rel_zero_extend_counter,
+        request_rel_create_counter,
+        request_rel_truncate_counter,
+        request_rel_unlink_counter,
+
+        getpage_cache_misses_counter,
+        getpage_cache_hits_counter,
+
+        request_nblocks_counters,
+        request_get_pagev_nblocks_counter,
+        request_prefetchv_nblocks_counter,
+        request_rel_zero_extend_nblocks_counter,
+
+        allocator_metrics: MyAllocatorCollector::new(),
+    }
+}
+
+impl<'t> CommunicatorWorkerProcessStruct<'t> {
+    /// Main loop of the worker process. Receive requests from the backends and process them.
+    ///
+    /// Never returns. Panics if the submission pipe cannot be read, if a read
+    /// does not deliver exactly one 4-byte slot index, or if the announced
+    /// slot holds no request.
+    pub(super) async fn run(&'static self) {
+        let pipe_fd = self.submission_pipe_read_fd.as_raw_fd();
+        let mut doorbell = PipeRead::try_from(pipe_fd).expect("invalid pipe fd");
+
+        let mut idx_bytes: [u8; 4] = [0; 4];
+
+        loop {
+            // Wait for a backend to ring the doorbell
+            let nbytes = match doorbell.read(&mut idx_bytes).await {
+                Ok(nbytes) => nbytes,
+                Err(e) => panic!("error reading from communicator pipe: {e}"),
+            };
+            if nbytes != 4 {
+                panic!("short read ({nbytes} bytes) on communicator pipe");
+            }
+            let request_idx = u32::from_ne_bytes(idx_bytes);
+
+            // Read the IO request from the slot indicated in the wakeup
+            let Some(slot) =
+                self.neon_request_slots[request_idx as usize].start_processing_request()
+            else {
+                // This currently should not happen. But if we have multiple threads picking up
+                // requests, and without waiting for the notifications, it could.
+                panic!("no request in slot");
+            };
+
+            // From here on we own the request and are committed to processing
+            // it; there's no going back.
+
+            //trace!("processing request {request_idx}: {request:?}");
+
+            // Every request gets a task of its own. That's a little excessive for requests that
+            // can be quickly satisfied from the cache, but we expect that to be rare, because the
+            // requesting backend would have already checked the cache.
+            tokio::spawn(async {
+                let io_result = self.handle_request(slot.get_request()).await;
+                let owner = slot.get_owner_procno();
+
+                // The IO is done. Mark the request as completed. After that,
+                // we no longer have ownership of the slot, and must not modify it.
+                slot.completed(io_result);
+
+                // Wake up the backend. (Note that the backend might see
+                // the completed status even before this; this is just a wakeup)
+                notify_proc(owner);
+            });
+        }
+    }
+
+    /// Builds the `ReadLsn` for a pageserver request: the current request LSN
+    /// paired with the given not-modified-since LSN.
+    fn request_lsns(&self, not_modified_since_lsn: Lsn) -> page_api::ReadLsn {
+        let request_lsn = get_request_lsn();
+        page_api::ReadLsn {
+            request_lsn,
+            not_modified_since_lsn: Some(not_modified_since_lsn),
+        }
+    }
+
+    /// Processes one request from a backend and returns the result to store
+    /// in the request slot.
+    ///
+    /// Read requests are answered from the cache when possible and are
+    /// otherwise forwarded to the pageserver; a failed pageserver request is
+    /// logged and reported as `NeonIOResult::Error(0)`. Write requests only
+    /// update the cache. Prefetch requests are handed off to a separate task
+    /// and reported as launched right away.
+    ///
+    /// Requests that take an in-progress lock hold it until the request has
+    /// been fully processed.
+    async fn handle_request(&'static self, req: &'_ NeonIORequest) -> NeonIOResult {
+        match req {
+            NeonIORequest::Empty => {
+                error!("unexpected Empty IO request");
+                NeonIOResult::Error(0)
+            }
+            NeonIORequest::RelExists(req) => {
+                self.request_rel_exists_counter.inc();
+                let rel = req.reltag();
+
+                // `lock` is async: the lock is only taken once the future is
+                // awaited.
+                let _in_progress_guard = self
+                    .in_progress_table
+                    .lock(RequestInProgressKey::Rel(rel))
+                    .await;
+
+                let not_modified_since = match self.cache.get_rel_exists(&rel) {
+                    CacheResult::Found(exists) => return NeonIOResult::RelExists(exists),
+                    CacheResult::NotFound(lsn) => lsn,
+                };
+
+                match self
+                    .request_tracker
+                    .process_check_rel_exists_request(page_api::CheckRelExistsRequest {
+                        read_lsn: self.request_lsns(not_modified_since),
+                        rel,
+                    })
+                    .await
+                {
+                    Ok(exists) => NeonIOResult::RelExists(exists),
+                    Err(err) => {
+                        info!("tonic error: {err:?}");
+                        NeonIOResult::Error(0)
+                    }
+                }
+            }
+
+            NeonIORequest::RelSize(req) => {
+                self.request_rel_size_counter.inc();
+                let rel = req.reltag();
+
+                let _in_progress_guard = self
+                    .in_progress_table
+                    .lock(RequestInProgressKey::Rel(rel))
+                    .await;
+
+                // Check the cache first
+                let not_modified_since = match self.cache.get_rel_size(&rel) {
+                    CacheResult::Found(nblocks) => {
+                        tracing::trace!("found relsize for {:?} in cache: {}", rel, nblocks);
+                        return NeonIOResult::RelSize(nblocks);
+                    }
+                    CacheResult::NotFound(lsn) => lsn,
+                };
+
+                let read_lsn = self.request_lsns(not_modified_since);
+                match self
+                    .request_tracker
+                    .process_get_rel_size_request(page_api::GetRelSizeRequest { read_lsn, rel })
+                    .await
+                {
+                    Ok(nblocks) => {
+                        // update the cache
+                        tracing::info!("updated relsize for {:?} in cache: {}", rel, nblocks);
+                        self.cache.remember_rel_size(&rel, nblocks);
+
+                        NeonIOResult::RelSize(nblocks)
+                    }
+                    Err(err) => {
+                        info!("tonic error: {err:?}");
+                        NeonIOResult::Error(0)
+                    }
+                }
+            }
+            NeonIORequest::GetPageV(req) => {
+                self.request_get_pagev_counter.inc();
+                self.request_get_pagev_nblocks_counter
+                    .inc_by(req.nblocks as u64);
+                match self.handle_get_pagev_request(req).await {
+                    Ok(()) => NeonIOResult::GetPageV,
+                    Err(errno) => NeonIOResult::Error(errno),
+                }
+            }
+            NeonIORequest::PrefetchV(req) => {
+                self.request_prefetchv_counter.inc();
+                self.request_prefetchv_nblocks_counter
+                    .inc_by(req.nblocks as u64);
+                // The prefetch runs in the background on a copy of the
+                // request; its outcome is not reported back to the backend.
+                let req = *req;
+                tokio::spawn(async move { self.handle_prefetchv_request(&req).await });
+                NeonIOResult::PrefetchVLaunched
+            }
+            NeonIORequest::DbSize(req) => {
+                self.request_db_size_counter.inc();
+                let _in_progress_guard = self
+                    .in_progress_table
+                    .lock(RequestInProgressKey::Db(req.db_oid))
+                    .await;
+
+                // Check the cache first
+                let not_modified_since = match self.cache.get_db_size(req.db_oid) {
+                    CacheResult::Found(db_size) => {
+                        // The size was cached; no need to ask the pageserver
+                        return NeonIOResult::DbSize(db_size);
+                    }
+                    CacheResult::NotFound(lsn) => lsn,
+                };
+
+                match self
+                    .request_tracker
+                    .process_get_dbsize_request(page_api::GetDbSizeRequest {
+                        read_lsn: self.request_lsns(not_modified_since),
+                        db_oid: req.db_oid,
+                    })
+                    .await
+                {
+                    Ok(db_size) => NeonIOResult::DbSize(db_size),
+                    Err(err) => {
+                        info!("tonic error: {err:?}");
+                        NeonIOResult::Error(0)
+                    }
+                }
+            }
+
+            // Write requests
+            NeonIORequest::WritePage(req) => {
+                self.request_write_page_counter.inc();
+
+                // Also store it in the LFC while we still have it
+                let rel = req.reltag();
+                let _in_progress_guard = self
+                    .in_progress_table
+                    .lock(RequestInProgressKey::Block(rel, req.block_number))
+                    .await;
+                self.cache
+                    .remember_page(&rel, req.block_number, req.src, Lsn(req.lsn), true)
+                    .await;
+                NeonIOResult::WriteOK
+            }
+            NeonIORequest::RelExtend(req) => {
+                self.request_rel_extend_counter.inc();
+
+                // TODO: need to grab an io-in-progress lock for this? I guess not
+                self.cache
+                    .remember_rel_size(&req.reltag(), req.block_number + 1);
+                NeonIOResult::WriteOK
+            }
+            NeonIORequest::RelZeroExtend(req) => {
+                self.request_rel_zero_extend_counter.inc();
+                self.request_rel_zero_extend_nblocks_counter
+                    .inc_by(req.nblocks as u64);
+
+                // TODO: need to grab an io-in-progress lock for this? I guess not
+                self.cache
+                    .remember_rel_size(&req.reltag(), req.block_number + req.nblocks);
+                NeonIOResult::WriteOK
+            }
+            NeonIORequest::RelCreate(req) => {
+                self.request_rel_create_counter.inc();
+
+                // TODO: need to grab an io-in-progress lock for this? I guess not
+                self.cache.remember_rel_size(&req.reltag(), 0);
+                NeonIOResult::WriteOK
+            }
+            NeonIORequest::RelTruncate(req) => {
+                self.request_rel_truncate_counter.inc();
+
+                // TODO: need to grab an io-in-progress lock for this? I guess not
+                self.cache.remember_rel_size(&req.reltag(), req.nblocks);
+                NeonIOResult::WriteOK
+            }
+            NeonIORequest::RelUnlink(req) => {
+                self.request_rel_unlink_counter.inc();
+
+                // TODO: need to grab an io-in-progress lock for this? I guess not
+                self.cache.forget_rel(&req.reltag());
+                NeonIOResult::WriteOK
+            }
+        }
+    }
+
+    /// Serves a get_pagev request: fills each of the request's destination
+    /// buffers with the corresponding block, from the cache if possible and
+    /// from the pageserver otherwise. Pages fetched from the pageserver are
+    /// also stored in the cache.
+    ///
+    /// Returns `Err(-1)` if the cache lookup or the pageserver request fails,
+    /// or if the pageserver responds with anything but exactly one page image.
+    async fn handle_get_pagev_request(&'t self, req: &CGetPageVRequest) -> Result<(), i32> {
+        let rel = req.reltag();
+
+        // Check the cache first
+        //
+        // Note: Because the backends perform a direct lookup in the cache before sending
+        // the request to the communicator process, we expect the pages to almost never
+        // be already in cache. It could happen when:
+        // 1. two backends try to read the same page at the same time, but that should never
+        //    happen because there's higher level locking in the Postgres buffer manager, or
+        // 2. if a prefetch request finished at the same time as a backend requested the
+        //    page. That's much more likely.
+        let mut misses = Vec::with_capacity(req.nblocks as usize);
+        for i in 0..req.nblocks {
+            let blkno = req.block_number + i as u32;
+
+            // note: this is deadlock-safe even though we hold multiple locks at the same time,
+            // because they're always acquired in the same order.
+            let block_guard = self
+                .in_progress_table
+                .lock(RequestInProgressKey::Block(rel, blkno))
+                .await;
+
+            let dest = req.dest[i as usize];
+            match self.cache.get_page(&rel, blkno, dest).await {
+                Ok(CacheResult::Found(_)) => {
+                    // get_page already copied the block content to the destination.
+                    // The lock on this block is released at the end of the iteration.
+                    trace!("found blk {} in rel {:?} in LFC", blkno, rel);
+                }
+                Ok(CacheResult::NotFound(lsn)) => {
+                    // Keep the block locked until it has been fetched below
+                    misses.push((blkno, lsn, dest, block_guard));
+                }
+                Err(_io_error) => return Err(-1), // FIXME errno?
+            }
+        }
+        let num_misses = misses.len() as u64;
+        self.getpage_cache_misses_counter.inc_by(num_misses);
+        self.getpage_cache_hits_counter
+            .inc_by(req.nblocks as u64 - num_misses);
+
+        // All requests below use the newest of the not-modified-since LSNs.
+        // If there were no misses at all, we're done.
+        let Some(not_modified_since) = misses.iter().map(|(_, lsn, _, _)| *lsn).max() else {
+            return Ok(());
+        };
+
+        // TODO: Use batched protocol
+        for (blkno, _lsn, dest, _guard) in &misses {
+            let request = page_api::GetPageRequest {
+                request_id: self.next_request_id.fetch_add(1, Ordering::Relaxed),
+                request_class: page_api::GetPageClass::Normal,
+                read_lsn: self.request_lsns(not_modified_since),
+                rel,
+                block_numbers: vec![*blkno],
+            };
+            let resp = match self.request_tracker.get_page(request).await {
+                Ok(resp) => resp,
+                Err(err) => {
+                    info!("tonic error: {err:?}");
+                    return Err(-1);
+                }
+            };
+
+            if resp.page_images.len() != 1 {
+                error!(
+                    "received unexpected response with {} page images received from pageserver for a request for one page",
+                    resp.page_images.len()
+                );
+                return Err(-1);
+            }
+
+            // Write the received page image directly to the shared memory location
+            // that the backend requested. At most `dest.bytes_total()` bytes are copied.
+            let page_image = resp.page_images[0].clone();
+            let src: &[u8] = page_image.as_ref();
+            let len = std::cmp::min(src.len(), dest.bytes_total());
+            unsafe {
+                std::ptr::copy_nonoverlapping(src.as_ptr(), dest.as_mut_ptr(), len);
+            };
+
+            // Also store it in the LFC while we have it
+            self.cache
+                .remember_page(&rel, *blkno, page_image, not_modified_since, false)
+                .await;
+        }
+        Ok(())
+    }
+
+    /// Handle a prefetch request for `req.nblocks` consecutive blocks, starting at
+    /// `req.block_number`, of the relation returned by `req.reltag()`.
+    ///
+    /// Blocks that `self.cache` already reports as present are skipped. Every other
+    /// block is requested from the pageserver (one `GetPageRequest` of class
+    /// `Prefetch` per block) and the returned image is passed to
+    /// `self.cache.remember_page()`.
+    ///
+    /// Returns `Err(-1)` if the cache lookup fails, if a pageserver request fails,
+    /// or if a response does not contain exactly one page image.
+    async fn handle_prefetchv_request(&'static self, req: &CPrefetchVRequest) -> Result<(), i32> {
+        let rel = req.reltag();
+
+        // Check the cache first
+        let mut cache_misses = Vec::with_capacity(req.nblocks as usize);
+        for i in 0..req.nblocks {
+            let blkno = req.block_number + i as u32;
+
+            // note: this is deadlock-safe even though we hold multiple locks at the same time,
+            // because they're always acquired in the same order.
+            let in_progress_guard = self
+                .in_progress_table
+                .lock(RequestInProgressKey::Block(rel, blkno))
+                .await;
+
+            let not_modified_since = match self.cache.page_is_cached(&rel, blkno).await {
+                Ok(CacheResult::Found(_)) => {
+                    trace!("found blk {} in rel {:?} in LFC", blkno, rel);
+                    // Nothing to fetch for this block; its in-progress guard goes out of
+                    // scope here.
+                    continue;
+                }
+                Ok(CacheResult::NotFound(lsn)) => lsn,
+                Err(_io_error) => return Err(-1), // FIXME errno?
+            };
+            // The guard is stored together with the miss, so it stays alive until
+            // `cache_misses` is dropped when this function returns.
+            cache_misses.push((blkno, not_modified_since, in_progress_guard));
+        }
+        if cache_misses.is_empty() {
+            return Ok(());
+        }
+        // A single LSN, the highest one reported for any missed block, is used for all
+        // requests below. The unwrap cannot fail: `cache_misses` is non-empty here.
+        let not_modified_since = cache_misses
+            .iter()
+            .map(|(_blkno, lsn, _guard)| *lsn)
+            .max()
+            .unwrap();
+
+        // TODO: spawn separate tasks for these. Use the integrated cache to keep track of the
+        // in-flight requests
+
+        // TODO: Use batched protocol
+        for (blkno, _lsn, _guard) in cache_misses.iter() {
+            match self
+                .request_tracker
+                .get_page(page_api::GetPageRequest {
+                    request_id: self.next_request_id.fetch_add(1, Ordering::Relaxed),
+                    request_class: page_api::GetPageClass::Prefetch,
+                    read_lsn: self.request_lsns(not_modified_since),
+                    rel,
+                    block_numbers: vec![*blkno],
+                })
+                .await
+            {
+                Ok(resp) => {
+                    trace!(
+                        "prefetch completed, remembering blk {} in rel {:?} in LFC",
+                        *blkno, rel
+                    );
+                    // Exactly one block was requested, so exactly one image is expected.
+                    if resp.page_images.len() != 1 {
+                        error!(
+                            "received unexpected response with {} page images received from pageserver for a request for one page",
+                            resp.page_images.len()
+                        );
+                        return Err(-1);
+                    }
+                    let page_image = resp.page_images[0].clone();
+                    self.cache
+                        .remember_page(&rel, *blkno, page_image, not_modified_since, false)
+                        .await;
+                }
+                Err(err) => {
+                    // The first failing request aborts the whole prefetch; blocks that
+                    // were already remembered stay in the cache.
+                    info!("tonic error: {err:?}");
+                    return Err(-1);
+                }
+            }
+        }
+        Ok(())
+    }
+}
+
+/// Exposes the worker's own counters together with the metrics of its cache
+/// (and of the file cache, when one is configured) and of the allocator as a
+/// single collector.
+impl<'t> metrics::core::Collector for CommunicatorWorkerProcessStruct<'t> {
+    /// Descriptors of all metrics, in the same order in which `collect()` reports them.
+    fn desc(&self) -> Vec<&metrics::core::Desc> {
+        let mut all: Vec<&metrics::core::Desc> = Vec::new();
+
+        // Counters owned directly by the worker process struct
+        all.extend(self.request_counters.desc());
+        all.extend(self.getpage_cache_misses_counter.desc());
+        all.extend(self.getpage_cache_hits_counter.desc());
+        all.extend(self.request_nblocks_counters.desc());
+
+        // Cache-related metrics; the file cache is optional
+        if let Some(file_cache) = self.cache.file_cache.as_ref() {
+            all.extend(file_cache.desc());
+        }
+        all.extend(self.cache.desc());
+        all.extend(self.allocator_metrics.desc());
+
+        all
+    }
+
+    /// Current values of all metrics.
+    fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
+        let mut families: Vec<metrics::proto::MetricFamily> = Vec::new();
+
+        // Counters owned directly by the worker process struct
+        families.extend(self.request_counters.collect());
+        families.extend(self.getpage_cache_misses_counter.collect());
+        families.extend(self.getpage_cache_hits_counter.collect());
+        families.extend(self.request_nblocks_counters.collect());
+
+        // Cache-related metrics; the file cache is optional
+        if let Some(file_cache) = self.cache.file_cache.as_ref() {
+            families.extend(file_cache.collect());
+        }
+        families.extend(self.cache.collect());
+        families.extend(self.allocator_metrics.collect());
+
+        families
+    }
+}
--- a/pgxn/neon/communicator/src/worker_process/metrics_exporter.rs
+++ b/pgxn/neon/communicator/src/worker_process/metrics_exporter.rs
@@ -0,0 +1,82 @@
+//! Export information about Postgres, the communicator process, file cache etc. as
+//! prometheus metrics.
+
+use axum::Router;
+use axum::body::Body;
+use axum::extract::State;
+use axum::response::Response;
+use http::StatusCode;
+use http::header::CONTENT_TYPE;
+
+use metrics::proto::MetricFamily;
+use metrics::{Encoder, TextEncoder};
+
+use std::path::PathBuf;
+
+use tokio::net::UnixListener;
+
+use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
+
+impl<'a> CommunicatorWorkerProcessStruct<'a> {
+    /// Start the HTTP endpoint that serves `/metrics` and `/dump_cache_map`.
+    ///
+    /// The endpoint listens on the unix domain socket `.metrics.socket`, resolved
+    /// relative to the current working directory. The server itself runs in a
+    /// spawned tokio task, so this must be called from within a tokio runtime.
+    ///
+    /// Panics if the socket cannot be bound.
+    pub(crate) async fn launch_exporter_task(&'static self) {
+        use axum::routing::get;
+        let app = Router::new()
+            .route("/metrics", get(get_metrics))
+            .route("/dump_cache_map", get(dump_cache_map))
+            .with_state(self);
+
+        // Listen on unix domain socket, in the data directory. That should be unique.
+        let path = PathBuf::from(".metrics.socket");
+
+        // A unix socket file outlives the process that created it. If a previous
+        // worker process left one behind, bind() would fail with EADDRINUSE, so
+        // remove any stale file first.
+        match std::fs::remove_file(&path) {
+            Ok(()) => tracing::info!("removed stale metrics socket file"),
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
+            Err(e) => tracing::warn!("could not remove stale metrics socket file: {e}"),
+        }
+
+        let listener = UnixListener::bind(&path).unwrap();
+
+        tokio::spawn(async {
+            tracing::info!("metrics listener spawned");
+            axum::serve(listener, app).await.unwrap()
+        });
+    }
+}
+
+/// Dump the contents of the cache map, as written by `cache.dump_map()`, as the
+/// body of a 200 response.
+async fn dump_cache_map(
+    State(state): State<&CommunicatorWorkerProcessStruct<'static>>,
+) -> Response {
+    let mut dump = Vec::<u8>::new();
+    state.cache.dump_map(&mut dump);
+    let body = Body::from(dump);
+
+    Response::builder()
+        .status(StatusCode::OK)
+        .header(CONTENT_TYPE, "application/text")
+        .body(body)
+        .unwrap()
+}
+
+/// Expose Prometheus metrics.
+///
+/// Responds with the text-encoded metrics on success, or with a 500 carrying the
+/// encoder's error message if encoding fails.
+async fn get_metrics(State(state): State<&CommunicatorWorkerProcessStruct<'static>>) -> Response {
+    use metrics::core::Collector;
+
+    // TextEncoder::encode() returns an error as soon as it meets a metric family
+    // without any metrics, so drop the empty families up front.
+    let families: Vec<MetricFamily> = state
+        .collect()
+        .into_iter()
+        .filter(|family| !family.get_metric().is_empty())
+        .collect();
+
+    let encoder = TextEncoder::new();
+    let mut encoded: Vec<u8> = Vec::new();
+
+    match encoder.encode(&families, &mut encoded) {
+        Ok(()) => Response::builder()
+            .status(StatusCode::OK)
+            .header(CONTENT_TYPE, encoder.format_type())
+            .body(Body::from(encoded))
+            .unwrap(),
+        Err(e) => Response::builder()
+            .status(StatusCode::INTERNAL_SERVER_ERROR)
+            .header(CONTENT_TYPE, "application/text")
+            .body(Body::from(e.to_string()))
+            .unwrap(),
+    }
+}
--- a/pgxn/neon/communicator/src/worker_process/mod.rs
+++ b/pgxn/neon/communicator/src/worker_process/mod.rs
@@ -0,0 +1,14 @@
+//! This code runs in the communicator worker process. This provides
+//! the glue code to:
+//!
+//! - launch the 'processor',
+//! - receive IO requests from backends and pass them to the processor,
+//! - write results back to backends.
+
+mod callbacks;
+mod logging;
+mod main_loop;
+mod metrics_exporter;
+mod worker_interface;
+
+mod in_progress_ios;
--- a/pgxn/neon/communicator/src/worker_process/worker_interface.rs
+++ b/pgxn/neon/communicator/src/worker_process/worker_interface.rs
@@ -0,0 +1,112 @@
+//! Functions called from the C code in the worker process
+
+use std::collections::HashMap;
+use std::ffi::{CStr, c_char};
+use std::path::PathBuf;
+
+use tracing::error;
+
+use crate::init::CommunicatorInitStruct;
+use crate::worker_process::main_loop;
+use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
+
+/// Launch the communicator's tokio tasks, which do most of the work.
+///
+/// The caller has initialized the process as a regular PostgreSQL
+/// background worker process. The shared memory segment used to
+/// communicate with the backends has been allocated and initialized
+/// earlier, at postmaster startup, in rcommunicator_shmem_init().
+///
+/// Arguments:
+/// - `tenant_id`, `timeline_id`: NUL-terminated UTF-8 strings, must not be NULL.
+/// - `auth_token`: NUL-terminated UTF-8 string, or NULL if there is no token.
+/// - `shard_map`, `nshards`: array of `nshards` NUL-terminated strings, one per shard.
+/// - `file_cache_path`: NUL-terminated UTF-8 path, or NULL.
+/// - `initial_file_cache_size`: passed through unchanged to `main_loop::init()`.
+///
+/// Panics if any of the strings is not valid UTF-8 or if the tokio runtime
+/// cannot be built.
+///
+/// Both the returned struct and the tokio runtime are intentionally leaked, so
+/// that they stay alive after this function returns.
+#[unsafe(no_mangle)]
+pub extern "C" fn communicator_worker_process_launch(
+    cis: Box<CommunicatorInitStruct>,
+    tenant_id: *const c_char,
+    timeline_id: *const c_char,
+    auth_token: *const c_char,
+    shard_map: *mut *mut c_char,
+    nshards: u32,
+    file_cache_path: *const c_char,
+    initial_file_cache_size: u64,
+) -> &'static CommunicatorWorkerProcessStruct<'static> {
+    // Convert the arguments into more convenient Rust types
+    let tenant_id = unsafe { CStr::from_ptr(tenant_id) }.to_str().unwrap();
+    let timeline_id = unsafe { CStr::from_ptr(timeline_id) }.to_str().unwrap();
+    // Note: `auth_token.as_ref()` must not be used here. It yields a reference to
+    // the first `c_char` only, and formatting that gives the byte's numeric value
+    // rather than the token string. Go through CStr to read the whole string.
+    let auth_token = if auth_token.is_null() {
+        None
+    } else {
+        let c_str = unsafe { CStr::from_ptr(auth_token) };
+        Some(c_str.to_str().unwrap().to_string())
+    };
+    let file_cache_path = {
+        if file_cache_path.is_null() {
+            None
+        } else {
+            let c_str = unsafe { CStr::from_ptr(file_cache_path) };
+            Some(PathBuf::from(c_str.to_str().unwrap()))
+        }
+    };
+    let shard_map = parse_shard_map(nshards, shard_map);
+
+    // start main loop
+    let runtime = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .thread_name("communicator thread")
+        .build()
+        .unwrap();
+
+    let worker_struct = runtime.block_on(main_loop::init(
+        cis,
+        tenant_id.to_string(),
+        timeline_id.to_string(),
+        auth_token,
+        shard_map,
+        initial_file_cache_size,
+        file_cache_path,
+    ));
+    let worker_struct = Box::leak(Box::new(worker_struct));
+
+    let main_loop_handle = runtime.spawn(worker_struct.run());
+
+    // Watch the main loop task and log if it ever terminates, whether by
+    // panic/cancellation or by returning.
+    runtime.spawn(async {
+        match main_loop_handle.await {
+            Ok(_) => error!("communicator main loop exited unexpectedly"),
+            Err(err) => error!("error: {err:?}"),
+        }
+    });
+
+    runtime.block_on(worker_struct.launch_exporter_task());
+
+    // keep the runtime running after we exit this function
+    Box::leak(Box::new(runtime));
+
+    worker_struct
+}
+
+/// Build a Rust `HashMap` from the C "shard map": an array of `nshards` C strings
+/// where the array index is the shard number.
+///
+/// With more than one shard, entry `i` is keyed by shard number `i` out of
+/// `nshards`; a single entry is keyed by `ShardIndex::unsharded()`.
+///
+/// Panics if `nshards` does not fit in a `u8` or if an entry is not valid UTF-8.
+fn parse_shard_map(
+    nshards: u32,
+    shard_map: *mut *mut c_char,
+) -> HashMap<utils::shard::ShardIndex, String> {
+    use utils::shard::*;
+
+    assert!(nshards <= u8::MAX as u32);
+
+    (0..nshards)
+        .map(|shard_no| {
+            // Fetch the shard_no'th string pointer from the array
+            let entry = unsafe { *shard_map.add(shard_no as usize) };
+            let url = unsafe { CStr::from_ptr(entry) }.to_str().unwrap();
+
+            let index = if nshards > 1 {
+                ShardIndex::new(ShardNumber(shard_no as u8), ShardCount(nshards as u8))
+            } else {
+                ShardIndex::unsharded()
+            };
+            (index, String::from(url))
+        })
+        .collect()
+}
+
+/// Inform the rust code about a configuration change
+///
+/// `file_cache_size` is the new file cache size and is forwarded to
+/// `resize_file_cache()`, which takes a `u32`. Values that do not fit are
+/// clamped to `u32::MAX`; a plain `as u32` cast would silently wrap around
+/// (e.g. 2^32 would become 0) and shrink the cache instead of growing it.
+#[unsafe(no_mangle)]
+pub extern "C" fn communicator_worker_config_reload(
+    proc_handle: &'static CommunicatorWorkerProcessStruct<'static>,
+    file_cache_size: u64,
+) {
+    let new_size = u32::try_from(file_cache_size).unwrap_or(u32::MAX);
+    proc_handle.cache.resize_file_cache(new_size);
+}
--- a/pgxn/neon/communicator_new.c
+++ b/pgxn/neon/communicator_new.c
--- a/pgxn/neon/communicator_new.h
+++ b/pgxn/neon/communicator_new.h
@@ -0,0 +1,56 @@
+/*-------------------------------------------------------------------------
+ *
+ * communicator_new.h
+ *	  new implementation
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef COMMUNICATOR_NEW_H
+#define COMMUNICATOR_NEW_H
+
+#include "neon_pgversioncompat.h"
+
+#include "storage/buf_internals.h"
+
+#include "pagestore_client.h"
+
+/* initialization at postmaster startup */
+extern void pg_init_communicator_new(void);
+extern void communicator_new_shmem_request(void);
+extern void communicator_new_shmem_startup(void);
+
+/* initialization at backend startup */
+extern void communicator_new_init(void);
+
+/* Read requests */
+extern bool communicator_new_rel_exists(NRelFileInfo rinfo, ForkNumber forkNum);
+extern BlockNumber communicator_new_rel_nblocks(NRelFileInfo rinfo, ForkNumber forknum);
+extern int64 communicator_new_dbsize(Oid dbNode);
+extern void communicator_new_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum,
+										  BlockNumber base_blockno,
+										  void **buffers, BlockNumber nblocks);
+extern void communicator_new_prefetch_register_bufferv(NRelFileInfo rinfo, ForkNumber forkNum,
+													   BlockNumber blockno,
+													   BlockNumber nblocks);
+extern bool communicator_new_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum,
+											BlockNumber blockno);
+extern int	communicator_new_read_slru_segment(SlruKind kind, int64 segno,
+											   void *buffer);
+
+/* Write requests, to keep the caches up-to-date */
+extern void communicator_new_write_page(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
+										const void *buffer, XLogRecPtr lsn);
+extern void communicator_new_rel_extend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
+										const void *buffer, XLogRecPtr lsn);
+extern void communicator_new_rel_zeroextend(NRelFileInfo rinfo, ForkNumber forkNum,
+											BlockNumber blockno, BlockNumber nblocks,
+											XLogRecPtr lsn);
+extern void communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum);
+extern void communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks);
+extern void communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum);
+
+#endif							/* COMMUNICATOR_NEW_H */
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -183,13 +183,13 @@ typedef struct FileCacheControl
 static HTAB *lfc_hash;
 static int	lfc_desc = -1;
 static LWLockId lfc_lock;
-static int	lfc_max_size;
-static int	lfc_size_limit;
+int	lfc_max_size;
+int	lfc_size_limit;
 static int	lfc_prewarm_limit;
 static int	lfc_prewarm_batch;
 static int	lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG;
 static int	lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK;
-static char *lfc_path;
+char *lfc_path;
 static uint64 lfc_generation;
 static FileCacheControl *lfc_ctl;
 static bool lfc_do_prewarm;
@@ -693,6 +693,7 @@ lfc_prewarm(FileCacheState* fcs, uint32 n_workers)
 	dsm_segment *seg;
 	BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS];

+	Assert(!neon_enable_new_communicator);

 	if (!lfc_ensure_opened())
 		return;
@@ -847,6 +848,8 @@ lfc_prewarm_main(Datum main_arg)
 	PrewarmWorkerState* ws;
 	uint32 worker_id = DatumGetInt32(main_arg);

+	Assert(!neon_enable_new_communicator);
+
 	AmPrewarmWorker = true;

 	pqsignal(SIGTERM, die);
@@ -947,6 +950,8 @@ lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
 	FileCacheEntry *entry;
 	uint32		hash;

+	Assert(!neon_enable_new_communicator);
+
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return;

@@ -992,6 +997,8 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	bool		found = false;
 	uint32		hash;

+	Assert(!neon_enable_new_communicator);
+
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return false;

@@ -1027,6 +1034,8 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	uint32		hash;
 	int			i = 0;

+	Assert(!neon_enable_new_communicator);
+
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return 0;

@@ -1134,6 +1143,8 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	int			blocks_read = 0;
 	int			buf_offset = 0;

+	Assert(!neon_enable_new_communicator);
+
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return -1;

@@ -1500,6 +1511,8 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,

 	int		chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);

+	Assert(!neon_enable_new_communicator);
+
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return false;

@@ -1645,6 +1658,8 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	uint32		entry_offset;
 	int			buf_offset = 0;

+	Assert(!neon_enable_new_communicator);
+
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return;

--- a/pgxn/neon/file_cache.h
+++ b/pgxn/neon/file_cache.h
@@ -26,6 +26,9 @@ typedef struct FileCacheState

 /* GUCs */
 extern bool lfc_store_prefetch_result;
+extern int	lfc_max_size;
+extern int	lfc_size_limit;
+extern char *lfc_path;

 /* functions for local file cache */
 extern void lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks);
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -69,7 +69,8 @@ char	   *neon_project_id;
 char	   *neon_branch_id;
 char	   *neon_endpoint_id;
 int32		max_cluster_size;
-char	   *page_server_connstring;
+char	   *pageserver_connstring;
+char	   *pageserver_grpc_urls;
 char	   *neon_auth_token;

 int			readahead_buffer_size = 128;
@@ -177,6 +178,8 @@ static bool pageserver_flush(shardno_t shard_no);
 static void pageserver_disconnect(shardno_t shard_no);
 static void pageserver_disconnect_shard(shardno_t shard_no);

+static void AssignShardMap(const char *newval);
+
 static bool
 PagestoreShmemIsValid(void)
 {
@@ -239,6 +242,7 @@ ParseShardMap(const char *connstr, ShardMap *result)
 	return true;
 }

+/* GUC hooks for neon.pageserver_connstring */
 static bool
 CheckPageserverConnstring(char **newval, void **extra, GucSource source)
 {
@@ -249,6 +253,45 @@ CheckPageserverConnstring(char **newval, void **extra, GucSource source)

 static void
 AssignPageserverConnstring(const char *newval, void *extra)
+{
+	/*
+	 * Only the old communicator takes its shard map from
+	 * 'neon.pageserver_connstring'. When the new communicator is enabled,
+	 * the shard map is loaded from 'neon.pageserver_grpc_urls' instead and
+	 * this setting is ignored.
+	 */
+	if (!neon_enable_new_communicator)
+		AssignShardMap(newval);
+}
+
+
+/* GUC hooks for neon.pageserver_grpc_urls */
+static bool
+CheckPageserverGrpcUrls(char **newval, void **extra, GucSource source)
+{
+	/*
+	 * Accept the value only if it parses as a shard map. No result struct
+	 * is passed, so nothing is stored here.
+	 */
+	return ParseShardMap(*newval, NULL);
+}
+
+static void
+AssignPageserverGrpcUrls(const char *newval, void *extra)
+{
+	/*
+	 * Only the new communicator takes its shard map from
+	 * 'neon.pageserver_grpc_urls'. When the new communicator is not used,
+	 * the shard map is loaded from 'neon.pageserver_connstring' instead and
+	 * this setting is ignored.
+	 */
+	if (neon_enable_new_communicator)
+		AssignShardMap(newval);
+}
+
+
+static void
+AssignShardMap(const char *newval)
 {
 	ShardMap	shard_map;

@@ -262,7 +305,7 @@ AssignPageserverConnstring(const char *newval, void *extra)
 	{
 		/*
 		 * shouldn't happen, because we already checked the value in
-		 * CheckPageserverConnstring
+		 * CheckPageserverConnstring/CheckPageserverGrpcUrls
 		 */
 		elog(ERROR, "could not parse shard map");
 	}
@@ -281,6 +324,54 @@ AssignPageserverConnstring(const char *newval, void *extra)
 	}
 }

+/*
+ * Return a copy of the whole shard map from shared memory.
+ *
+ * On return, *num_shards_p holds the number of shards and *connstrs_p points
+ * to a palloc'd array with room for MAX_SHARDS pointers. The first
+ * Min(num_shards, MAX_SHARDS) entries point to NUL-terminated copies of the
+ * per-shard connection strings; the copies live in a second palloc'd buffer,
+ * at a fixed stride of MAX_PAGESERVER_CONNSTRING_SIZE bytes. The remaining
+ * entries of the array are left uninitialized.
+ */
+void
+get_shard_map(char ***connstrs_p, shardno_t *num_shards_p)
+{
+	uint64		begin_update_counter;
+	uint64		end_update_counter;
+	ShardMap   *shard_map = &pagestore_shared->shard_map;
+	shardno_t	num_shards;
+	char	   *buf;
+	char	  **connstrs;
+
+	/* Allocate for the maximum size up front, outside the retry loop */
+	buf = palloc(MAX_SHARDS*MAX_PAGESERVER_CONNSTRING_SIZE);
+	connstrs = palloc(sizeof(char *) * MAX_SHARDS);
+
+	/*
+	 * Postmaster can update the shared memory values concurrently, in which
+	 * case we would copy a garbled mix of the old and new values. We will
+	 * detect it because the counters won't match, and retry. But it's
+	 * important that we don't do anything within the retry-loop that would
+	 * depend on the string having valid contents.
+	 */
+	do
+	{
+		char		*p;
+
+		begin_update_counter = pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
+		end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter);
+
+		num_shards = shard_map->num_shards;
+
+		p = buf;
+		/* Clamp to MAX_SHARDS: num_shards may be garbage during a concurrent update */
+		for (int i = 0; i < Min(num_shards, MAX_SHARDS); i++)
+		{
+			strlcpy(p, shard_map->connstring[i], MAX_PAGESERVER_CONNSTRING_SIZE);
+			connstrs[i] = p;
+			p += MAX_PAGESERVER_CONNSTRING_SIZE;
+		}
+
+		/* Finish copying before the counters are re-read in the loop condition */
+		pg_memory_barrier();
+	}
+	while (begin_update_counter != end_update_counter
+		   || begin_update_counter != pg_atomic_read_u64(&pagestore_shared->begin_update_counter)
+		   || end_update_counter != pg_atomic_read_u64(&pagestore_shared->end_update_counter));
+
+	*connstrs_p = connstrs;
+	*num_shards_p = num_shards;
+}
+
 /*
 * Get the current number of shards, and/or the connection string for a
 * particular shard from the shard map in shared memory.
@@ -1304,7 +1395,8 @@ PagestoreShmemInit(void)
 		pg_atomic_init_u64(&pagestore_shared->begin_update_counter, 0);
 		pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0);
 		memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
-		AssignPageserverConnstring(page_server_connstring, NULL);
+		AssignPageserverConnstring(pageserver_connstring, NULL);
+		AssignPageserverGrpcUrls(pageserver_grpc_urls, NULL);
 	}

 	NeonPerfCountersShmemInit();
@@ -1357,12 +1449,21 @@ pg_init_libpagestore(void)
 	DefineCustomStringVariable("neon.pageserver_connstring",
 							   "connection string to the page server",
 							   NULL,
-							   &page_server_connstring,
+							   &pageserver_connstring,
 							   "",
 							   PGC_SIGHUP,
 							   0,	/* no flags required */
 							   CheckPageserverConnstring, AssignPageserverConnstring, NULL);

+	DefineCustomStringVariable("neon.pageserver_grpc_urls",
+							   "list of gRPC URLs for the page servers",
+							   NULL,
+							   &pageserver_grpc_urls,
+							   "",
+							   PGC_SIGHUP,
+							   0,	/* no flags required */
+							   CheckPageserverGrpcUrls, AssignPageserverGrpcUrls, NULL);
+
 	DefineCustomStringVariable("neon.timeline_id",
 							   "Neon timeline_id the server is running on",
 							   NULL,
@@ -1520,7 +1621,7 @@ pg_init_libpagestore(void)
 	if (neon_auth_token)
 		neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable");

-	if (page_server_connstring && page_server_connstring[0])
+	/*
+	 * Install the neon smgr if a pageserver is configured through either
+	 * setting: 'neon.pageserver_connstring' or 'neon.pageserver_grpc_urls'.
+	 */
+	if ((pageserver_connstring && pageserver_connstring[0]) ||
+		(pageserver_grpc_urls && pageserver_grpc_urls[0]))
 	{
 		neon_log(PageStoreTrace, "set neon_smgr hook");
 		smgr_hook = smgr_neon;
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -21,6 +21,7 @@
 #include "replication/logicallauncher.h"
 #include "replication/slot.h"
 #include "replication/walsender.h"
+#include "storage/ipc.h"
 #include "storage/proc.h"
 #include "funcapi.h"
 #include "access/htup_details.h"
@@ -30,6 +31,7 @@
 #include "utils/guc_tables.h"

 #include "communicator.h"
+#include "communicator_new.h"
 #include "extension_server.h"
 #include "file_cache.h"
 #include "neon.h"
@@ -47,6 +49,7 @@ PG_MODULE_MAGIC;
 void		_PG_init(void);


+bool neon_enable_new_communicator;
 static int  running_xacts_overflow_policy;
 static bool monitor_query_exec_time = false;

@@ -56,11 +59,14 @@ static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
 static void neon_ExecutorStart(QueryDesc *queryDesc, int eflags);
 static void neon_ExecutorEnd(QueryDesc *queryDesc);

-#if PG_MAJORVERSION_NUM >= 16
 static shmem_startup_hook_type prev_shmem_startup_hook;
-
-static void neon_shmem_startup_hook(void);
+#if PG_VERSION_NUM>=150000
+static shmem_request_hook_type prev_shmem_request_hook;
 #endif
+
+static void neon_shmem_request(void);
+static void neon_shmem_startup_hook(void);
+
 #if PG_MAJORVERSION_NUM >= 17
 uint32		WAIT_EVENT_NEON_LFC_MAINTENANCE;
 uint32		WAIT_EVENT_NEON_LFC_READ;
@@ -439,17 +445,36 @@ _PG_init(void)
 	 */
 #if PG_VERSION_NUM >= 160000
 	load_file("$libdir/neon_rmgr", false);
+#endif

 	prev_shmem_startup_hook = shmem_startup_hook;
 	shmem_startup_hook = neon_shmem_startup_hook;
+#if PG_VERSION_NUM>=150000
+	prev_shmem_request_hook = shmem_request_hook;
+	shmem_request_hook = neon_shmem_request;
+#else
+	neon_shmem_request();
 #endif

+	DefineCustomBoolVariable(
+							"neon.enable_new_communicator",
+							"Enables new communicator implementation",
+							NULL,
+							&neon_enable_new_communicator,
+							true,
+							PGC_POSTMASTER,
+							0,
+							NULL, NULL, NULL);
+
 	pg_init_libpagestore();
 	lfc_init();
 	pg_init_walproposer();
 	init_lwlsncache();

 	pg_init_communicator();
+	if (neon_enable_new_communicator)
+		pg_init_communicator_new();
+
 	Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;

 	InitUnstableExtensionsSupport();
@@ -583,7 +608,17 @@ backpressure_throttling_time(PG_FUNCTION_ARGS)
 	PG_RETURN_UINT64(BackpressureThrottlingTime());
 }

-#if PG_MAJORVERSION_NUM >= 16
+/*
+ * Request shared memory for the extension.
+ *
+ * On PostgreSQL 15 and later, _PG_init() installs this as the
+ * shmem_request_hook, so the previously installed hook (if any) is called
+ * first. On older versions there is no such hook and _PG_init() calls this
+ * function directly.
+ *
+ * Note that communicator_new_shmem_request() is called unconditionally here;
+ * this function itself does not check neon_enable_new_communicator.
+ */
+static void
+neon_shmem_request(void)
+{
+#if PG_VERSION_NUM>=150000
+	if (prev_shmem_request_hook)
+		prev_shmem_request_hook();
+#endif
+
+	communicator_new_shmem_request();
+}
+
 static void
 neon_shmem_startup_hook(void)
 {
@@ -603,8 +638,9 @@ neon_shmem_startup_hook(void)
 	WAIT_EVENT_NEON_PS_READ = WaitEventExtensionNew("Neon/PS_ReadIO");
 	WAIT_EVENT_NEON_WAL_DL = WaitEventExtensionNew("Neon/WAL_Download");
 #endif
+
+	communicator_new_shmem_startup();
 }
-#endif

 /*
 * ExecutorStart hook: start up tracking if needed
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -13,6 +13,7 @@
 #include "utils/wait_event.h"

 /* GUCs */
+extern bool neon_enable_new_communicator;
 extern char *neon_auth_token;
 extern char *neon_timeline;
 extern char *neon_tenant;
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -9,6 +9,10 @@
 #include "fmgr.h"
 #include "storage/buf_internals.h"

+#if PG_MAJORVERSION_NUM < 16
+typedef PGAlignedBlock PGIOAlignedBlock;
+#endif
+
 #if PG_MAJORVERSION_NUM < 17
 #define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId)
 #else
@@ -160,6 +164,10 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
 #define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess())
 #endif

+#if PG_MAJORVERSION_NUM < 17
+#define	MyProcNumber (MyProc - &ProcGlobal->allProcs[0])
+#endif
+
 #if PG_MAJORVERSION_NUM < 15
 extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags);
 extern TimeLineID GetWALInsertionTimeLine(void);
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -220,7 +220,8 @@ extern void prefetch_on_ps_disconnect(void);

 extern page_server_api *page_server;

-extern char *page_server_connstring;
+extern char *pageserver_connstring;
+extern char *pageserver_grpc_urls;
 extern int	flush_every_n_requests;
 extern int	readahead_buffer_size;
 extern char *neon_timeline;
@@ -228,6 +229,7 @@ extern char *neon_tenant;
 extern int32 max_cluster_size;
 extern int  neon_protocol_version;

+extern void get_shard_map(char ***connstrs_p, shardno_t *num_shards_p);
 extern shardno_t get_shard_number(BufferTag* tag);

 extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo);
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -62,6 +62,7 @@

 #include "bitmap.h"
 #include "communicator.h"
+#include "communicator_new.h"
 #include "file_cache.h"
 #include "neon.h"
 #include "neon_lwlsncache.h"
@@ -72,10 +73,6 @@
 #include "access/xlogrecovery.h"
 #endif

-#if PG_VERSION_NUM < 160000
-typedef PGAlignedBlock PGIOAlignedBlock;
-#endif
-
 /*
 * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API
 * calls to md.c, and *also* do the calls to the Page Server. On every
@@ -97,7 +94,7 @@ static char *hexdump_page(char *page);
 		NInfoGetRelNumber(InfoFromSMgrRel(reln)) >= FirstNormalObjectId \
 )

-const int	SmgrTrace = DEBUG5;
+const int	SmgrTrace = DEBUG1;

 /* unlogged relation build states */
 typedef enum
@@ -751,11 +748,6 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks))
-	{
-		return true;
-	}
-
 	/*
 	 * \d+ on a view calls smgrexists with 0/0/0 relfilenode. The page server
 	 * will error out if you check that, because the whole dbdir for
@@ -779,10 +771,20 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 		return false;
 	}

-	neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
-						  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
+	if (neon_enable_new_communicator)
+		return communicator_new_rel_exists(InfoFromSMgrRel(reln), forkNum);
+	else
+	{
+		if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks))
+		{
+			return true;
+		}

-	return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns);
+		neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
+							  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
+
+		return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns);
+	}
 }

 /*
@@ -820,33 +822,40 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 		 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 		 forkNum);

-	/*
-	 * Newly created relation is empty, remember that in the relsize cache.
-	 *
-	 * Note that in REDO, this is called to make sure the relation fork
-	 * exists, but it does not truncate the relation. So, we can only update
-	 * the relsize if it didn't exist before.
-	 *
-	 * Also, in redo, we must make sure to update the cached size of the
-	 * relation, as that is the primary source of truth for REDO's file length
-	 * considerations, and as file extension isn't (perfectly) logged, we need
-	 * to take care of that before we hit file size checks.
-	 *
-	 * FIXME: This is currently not just an optimization, but required for
-	 * correctness. Postgres can call smgrnblocks() on the newly-created
-	 * relation. Currently, we don't call SetLastWrittenLSN() when a new
-	 * relation created, so if we didn't remember the size in the relsize
-	 * cache, we might call smgrnblocks() on the newly-created relation before
-	 * the creation WAL record hass been received by the page server.
-	 */
-	if (isRedo)
+	if (neon_enable_new_communicator)
 	{
-		update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
-		get_cached_relsize(InfoFromSMgrRel(reln), forkNum,
-						   &reln->smgr_cached_nblocks[forkNum]);
+		communicator_new_rel_create(InfoFromSMgrRel(reln), forkNum);
 	}
 	else
-		set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
+	{
+		/*
+		 * Newly created relation is empty, remember that in the relsize cache.
+		 *
+		 * Note that in REDO, this is called to make sure the relation fork
+		 * exists, but it does not truncate the relation. So, we can only update
+		 * the relsize if it didn't exist before.
+		 *
+		 * Also, in redo, we must make sure to update the cached size of the
+		 * relation, as that is the primary source of truth for REDO's file length
+		 * considerations, and as file extension isn't (perfectly) logged, we need
+		 * to take care of that before we hit file size checks.
+		 *
+		 * FIXME: This is currently not just an optimization, but required for
+		 * correctness. Postgres can call smgrnblocks() on the newly-created
+		 * relation. Currently, we don't call SetLastWrittenLSN() when a new
+		 * relation created, so if we didn't remember the size in the relsize
+		 * cache, we might call smgrnblocks() on the newly-created relation before
+		 * the creation WAL record hass been received by the page server.
+		 */
+		if (isRedo)
+		{
+			update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
+			get_cached_relsize(InfoFromSMgrRel(reln), forkNum,
+							   &reln->smgr_cached_nblocks[forkNum]);
+		}
+		else
+			set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
+	}

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -881,9 +890,15 @@ neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo)
 	 * unlink, it won't do any harm if the file doesn't exist.
 	 */
 	mdunlink(rinfo, forkNum, isRedo);
+
 	if (!NRelFileInfoBackendIsTemp(rinfo))
 	{
-		forget_cached_relsize(InfoFromNInfoB(rinfo), forkNum);
+		if (neon_enable_new_communicator)
+		{
+			communicator_new_rel_unlink(InfoFromNInfoB(rinfo), forkNum);
+		}
+		else
+			forget_cached_relsize(InfoFromNInfoB(rinfo), forkNum);
 	}
 }

@@ -971,34 +986,43 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 		 forkNum, blkno,
 		 (uint32) (lsn >> 32), (uint32) lsn);

-	lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
+	if (neon_enable_new_communicator)
+	{
+		// FIXME: this can pass lsn == InvalidXLogRecPtr. Is that ok?
+		communicator_new_rel_extend(InfoFromSMgrRel(reln), forkNum, blkno, (const void *) buffer, lsn);
+	}
+	else
+	{
+		lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);

 #ifdef DEBUG_COMPARE_LOCAL
-	if (IS_LOCAL_REL(reln))
-		mdextend(reln, forkNum, blkno, buffer, skipFsync);
+		if (IS_LOCAL_REL(reln))
+			mdextend(reln, forkNum, blkno, buffer, skipFsync);
 #endif

-	/*
-	 * smgr_extend is often called with an all-zeroes page, so
-	 * lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
-	 * later, after it has been initialized with the real page contents, and
-	 * it is eventually evicted from the buffer cache. But we need a valid LSN
-	 * to the relation metadata update now.
-	 */
-	if (lsn == InvalidXLogRecPtr)
-	{
-		lsn = GetXLogInsertRecPtr();
-		neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno);
+		/*
+		 * smgr_extend is often called with an all-zeroes page, so
+		 * lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
+		 * later, after it has been initialized with the real page contents, and
+		 * it is eventually evicted from the buffer cache. But we need a valid LSN
+		 * to the relation metadata update now.
+		 */
+		if (lsn == InvalidXLogRecPtr)
+		{
+			lsn = GetXLogInsertRecPtr();
+			neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno);
+		}
+		neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
 	}
-	neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
 }

 #if PG_MAJORVERSION_NUM >= 16
 static void
-neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
+neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber start_block,
 				int nblocks, bool skipFsync)
 {
 	const PGIOAlignedBlock buffer = {0};
+	BlockNumber blocknum = start_block;
 	int			remblocks = nblocks;
 	XLogRecPtr	lsn = 0;

@@ -1080,11 +1104,14 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,

 		lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI);

-		for (int i = 0; i < count; i++)
+		if (!neon_enable_new_communicator)
 		{
-			lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
-			neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum,
-									  blocknum + i);
+			for (int i = 0; i < count; i++)
+			{
+				lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
+				neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum,
+									 blocknum + i);
+			}
 		}

 		blocknum += count;
@@ -1093,8 +1120,15 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,

 	Assert(lsn != 0);

-	neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
-	set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
+	if (neon_enable_new_communicator)
+	{
+		communicator_new_rel_zeroextend(InfoFromSMgrRel(reln), forkNum, start_block, nblocks, lsn);
+	}
+	else
+	{
+		neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
+		set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
+	}
 }
 #endif

@@ -1154,11 +1188,17 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

+	if (neon_enable_new_communicator)
+	{
+		communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, nblocks);
+		return false;
+	}
+
 	tag.spcOid = reln->smgr_rlocator.locator.spcOid;
 	tag.dbOid = reln->smgr_rlocator.locator.dbOid;
 	tag.relNumber = reln->smgr_rlocator.locator.relNumber;
 	tag.forkNum = forknum;
-
+	
 	while (nblocks > 0)
 	{
 		int		iterblocks = Min(nblocks, PG_IOV_MAX);
@@ -1180,7 +1220,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		blocknum += iterblocks;
 	}

-	communicator_prefetch_pump_state();
+	if (!neon_enable_new_communicator)
+		communicator_prefetch_pump_state();

 	return false;
 }
@@ -1193,8 +1234,6 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 static bool
 neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 {
-	BufferTag	tag;
-
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:					/* probably shouldn't happen, but ignore it */
@@ -1209,17 +1248,25 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
-		return false;
+	if (neon_enable_new_communicator)
+	{
+		communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, 1);
+	}
+	else
+	{
+		BufferTag	tag;

-	tag.forkNum = forknum;
-	tag.blockNum = blocknum;
+		if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
+			return false;

-	CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
+		tag.forkNum = forknum;
+		tag.blockNum = blocknum;

-	communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
+		CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
+		communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);

-	communicator_prefetch_pump_state();
+		communicator_prefetch_pump_state();
+	}

 	return false;
 }
@@ -1263,7 +1310,8 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 	 */
 	neon_log(SmgrTrace, "writeback noop");

-	communicator_prefetch_pump_state();
+	if (!neon_enable_new_communicator)
+		communicator_prefetch_pump_state();

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1279,7 +1327,14 @@ void
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				 neon_request_lsns request_lsns, void *buffer)
 {
-	communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
+	if (neon_enable_new_communicator)
+	{
+		// FIXME: request_lsns is ignored. That affects the neon_test_utils callers.
+		// Add the capability to specify the LSNs explicitly, for the sake of neon_test_utils?
+		communicator_new_read_at_lsnv(rinfo, forkNum, blkno, &buffer, 1);
+	}
+	else
+		communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
 }

 #ifdef DEBUG_COMPARE_LOCAL
@@ -1407,41 +1462,49 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	/* Try to read PS results if they are available */
-	communicator_prefetch_pump_state();
-
-	neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
-
-	present = 0;
-	bufferp = buffer;
-	if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
+	if (neon_enable_new_communicator)
 	{
-		/* Prefetch hit */
-#ifdef DEBUG_COMPARE_LOCAL
-		compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
-#else
-		return;
-#endif
+		communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forkNum, blkno,
+									  (void *) &buffer, 1);
 	}
-
-	/* Try to read from local file cache */
-	if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
+	else
 	{
-		MyNeonCounters->file_cache_hits_total++;
+		/* Try to read PS results if they are available */
+		communicator_prefetch_pump_state();
+
+		neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
+
+		present = 0;
+		bufferp = buffer;
+		if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
+		{
+			/* Prefetch hit */
 #ifdef DEBUG_COMPARE_LOCAL
-		compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
+			compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
 #else
-		return;
+			return;
 #endif
+		}
+
+		/* Try to read from local file cache */
+		if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
+		{
+			MyNeonCounters->file_cache_hits_total++;
+#ifdef DEBUG_COMPARE_LOCAL
+			compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
+#else
+			return;
+#endif
+		}
+
+		neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
+
+		/*
+		 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
+		 */
+		communicator_prefetch_pump_state();
 	}

-	neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
-
-	/*
-	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
-	 */
-	communicator_prefetch_pump_state();
-
 #ifdef DEBUG_COMPARE_LOCAL
 	compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
 #endif
@@ -1505,48 +1568,57 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 				 nblocks, PG_IOV_MAX);

 	/* Try to read PS results if they are available */
-	communicator_prefetch_pump_state();
-
-	neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
-						  request_lsns, nblocks);
+	if (!neon_enable_new_communicator)
+		communicator_prefetch_pump_state();

 	memset(read_pages, 0, sizeof(read_pages));

-	prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
-													blocknum, request_lsns, nblocks,
-													buffers, read_pages);
+	if (neon_enable_new_communicator)
+	{
+		communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum,
+									  buffers, nblocks);
+	}
+	else
+	{
+		neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
+							  request_lsns, nblocks);
+		
+		prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
+														blocknum, request_lsns, nblocks,
+														buffers, read_pages);

 #ifdef DEBUG_COMPARE_LOCAL
-	compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
-	memset(read_pages, 0, sizeof(read_pages));
+		compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
+		memset(read_pages, 0, sizeof(read_pages));
 #else
-	if (prefetch_result == nblocks)
-		return;
+		if (prefetch_result == nblocks)
+			return;
 #endif

-	/* Try to read from local file cache */
-	lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
-								  nblocks, read_pages);
+		/* Try to read from local file cache */
+		lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
+									  nblocks, read_pages);

-	if (lfc_result > 0)
-		MyNeonCounters->file_cache_hits_total += lfc_result;
+		if (lfc_result > 0)
+			MyNeonCounters->file_cache_hits_total += lfc_result;

 #ifdef DEBUG_COMPARE_LOCAL
-	compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
-	memset(read_pages, 0, sizeof(read_pages));
+		compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
+		memset(read_pages, 0, sizeof(read_pages));
 #else
-	/* Read all blocks from LFC, so we're done */
-	if (prefetch_result + lfc_result == nblocks)
-		return;
+		/* Read all blocks from LFC, so we're done */
+		if (prefetch_result + lfc_result == nblocks)
+			return;
 #endif

-	communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
-							  buffers, nblocks, read_pages);
+		communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
+								  buffers, nblocks, read_pages);

-	/*
-	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
-	 */
-	communicator_prefetch_pump_state();
+		/*
+		 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
+		 */
+		communicator_prefetch_pump_state();
+	}

 #ifdef DEBUG_COMPARE_LOCAL
 	memset(read_pages, 0xFF, sizeof(read_pages));
@@ -1652,9 +1724,16 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 		 forknum, blocknum,
 		 (uint32) (lsn >> 32), (uint32) lsn);

-	lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
+	if (neon_enable_new_communicator)
+	{
+		communicator_new_write_page(InfoFromSMgrRel(reln), forknum, blocknum, buffer, lsn);
+	}
+	else
+	{
+		lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);

-	communicator_prefetch_pump_state();
+		communicator_prefetch_pump_state();
+	}

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1716,9 +1795,21 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,

 	neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false);

-	lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
+	if (neon_enable_new_communicator)
+	{
+		for (int i = 0; i < nblocks; i++)
+		{
+			XLogRecPtr lsn = PageGetLSN((Page) buffers[i]);

-	communicator_prefetch_pump_state();
+			communicator_new_write_page(InfoFromSMgrRel(reln), forknum, blkno + i, buffers[i], lsn);
+		}
+	}
+	else
+	{
+		lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
+
+		communicator_prefetch_pump_state();
+	}

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1758,19 +1849,26 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
+	if (neon_enable_new_communicator)
 	{
-		neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
-			 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-			 forknum, n_blocks);
-		return n_blocks;
+		n_blocks = communicator_new_rel_nblocks(InfoFromSMgrRel(reln), forknum);
 	}
+	else
+	{
+		if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
+		{
+			neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
+					 RelFileInfoFmt(InfoFromSMgrRel(reln)),
+					 forknum, n_blocks);
+			return n_blocks;
+		}

-	neon_get_request_lsns(InfoFromSMgrRel(reln), forknum,
-						  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
+		neon_get_request_lsns(InfoFromSMgrRel(reln), forknum,
+							  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);

-	n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns);
-	update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
+		n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns);
+		update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
+	}

 	neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
 			 RelFileInfoFmt(InfoFromSMgrRel(reln)),
@@ -1791,10 +1889,17 @@ neon_dbsize(Oid dbNode)
 	neon_request_lsns request_lsns;
 	NRelFileInfo dummy_node = {0};

-	neon_get_request_lsns(dummy_node, MAIN_FORKNUM,
-						  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
+	if (neon_enable_new_communicator)
+	{
+		db_size = communicator_new_dbsize(dbNode);
+	}
+	else
+	{
+		neon_get_request_lsns(dummy_node, MAIN_FORKNUM,
+							  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);

-	db_size = communicator_dbsize(dbNode, &request_lsns);
+		db_size = communicator_dbsize(dbNode, &request_lsns);
+	}

 	neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
 			 dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
@@ -1808,8 +1913,6 @@ neon_dbsize(Oid dbNode)
 static void
 neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks)
 {
-	XLogRecPtr	lsn;
-
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
@@ -1833,34 +1936,43 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);
+	if (neon_enable_new_communicator)
+	{
+		communicator_new_rel_truncate(InfoFromSMgrRel(reln), forknum, nblocks);
+	}
+	else
+	{
+		XLogRecPtr	lsn;

-	/*
-	 * Truncating a relation drops all its buffers from the buffer cache
-	 * without calling smgrwrite() on them. But we must account for that in
-	 * our tracking of last-written-LSN all the same: any future smgrnblocks()
-	 * request must return the new size after the truncation. We don't know
-	 * what the LSN of the truncation record was, so be conservative and use
-	 * the most recently inserted WAL record's LSN.
-	 */
-	lsn = GetXLogInsertRecPtr();
-	lsn = nm_adjust_lsn(lsn);
+		set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);

-	/*
-	 * Flush it, too. We don't actually care about it here, but let's uphold
-	 * the invariant that last-written LSN <= flush LSN.
-	 */
-	XLogFlush(lsn);
+		/*
+		 * Truncating a relation drops all its buffers from the buffer cache
+		 * without calling smgrwrite() on them. But we must account for that in
+		 * our tracking of last-written-LSN all the same: any future smgrnblocks()
+		 * request must return the new size after the truncation. We don't know
+		 * what the LSN of the truncation record was, so be conservative and use
+		 * the most recently inserted WAL record's LSN.
+		 */
+		lsn = GetXLogInsertRecPtr();
+		lsn = nm_adjust_lsn(lsn);

-	/*
-	 * Truncate may affect several chunks of relations. So we should either
-	 * update last written LSN for all of them, or update LSN for "dummy"
-	 * metadata block. Second approach seems more efficient. If the relation
-	 * is extended again later, the extension will update the last-written LSN
-	 * for the extended pages, so there's no harm in leaving behind obsolete
-	 * entries for the truncated chunks.
-	 */
-	neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum);
+		/*
+		 * Flush it, too. We don't actually care about it here, but let's uphold
+		 * the invariant that last-written LSN <= flush LSN.
+		 */
+		XLogFlush(lsn);
+
+		/*
+		 * Truncate may affect several chunks of relations. So we should either
+		 * update last written LSN for all of them, or update LSN for "dummy"
+		 * metadata block. Second approach seems more efficient. If the relation
+		 * is extended again later, the extension will update the last-written LSN
+		 * for the extended pages, so there's no harm in leaving behind obsolete
+		 * entries for the truncated chunks.
+		 */
+		neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum);
+	}

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -1902,7 +2014,8 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)

 	neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");

-	communicator_prefetch_pump_state();
+	if (!neon_enable_new_communicator)
+		communicator_prefetch_pump_state();

 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -2103,8 +2216,12 @@ neon_end_unlogged_build(SMgrRelation reln)
 				 RelFileInfoFmt(InfoFromNInfoB(rinfob)),
 				 forknum);

-			forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
-			lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
+			// FIXME: also do this with the new communicator
+			if (!neon_enable_new_communicator)
+			{
+				forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
+				lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
+			}

 			mdclose(reln, forknum);
 #ifndef DEBUG_COMPARE_LOCAL
@@ -2172,7 +2289,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 	request_lsns.not_modified_since = not_modified_since;
 	request_lsns.effective_request_lsn = request_lsn;

-	n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer);
+	if (neon_enable_new_communicator)
+		n_blocks = communicator_new_read_slru_segment(kind, segno, buffer);
+	else
+		n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer);

 	return n_blocks;
 }
@@ -2209,7 +2329,8 @@ AtEOXact_neon(XactEvent event, void *arg)
 			}
 			break;
 	}
-	communicator_reconfigure_timeout_if_needed();
+	if (!neon_enable_new_communicator)
+		communicator_reconfigure_timeout_if_needed();
 }

 static const struct f_smgr neon_smgr =
@@ -2267,7 +2388,10 @@ smgr_init_neon(void)

 	smgr_init_standard();
 	neon_init();
-	communicator_init();
+	if (neon_enable_new_communicator)
+		communicator_new_init();
+	else
+		communicator_init();
 }


@@ -2279,6 +2403,12 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 	/* This is only used in WAL replay */
 	Assert(RecoveryInProgress());

+	if (neon_enable_new_communicator)
+	{
+		// FIXME: not implemented with the new communicator yet, but this is only used in replicas
+		elog(ERROR, "not implemented yet");
+	}
+
 	/* Extend the relation if we know its size */
 	if (get_cached_relsize(rinfo, forknum, &relsize))
 	{
@@ -2444,7 +2574,10 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 		 * We should perform this check after assigning LwLSN to prevent
 		 * prefetching of some older version of the page by some other backend.
 		 */
-		no_redo_needed = !lfc_cache_contains(rinfo, forknum, blkno);
+		if (neon_enable_new_communicator)
+			no_redo_needed = communicator_new_cache_contains(rinfo, forknum, blkno);
+		else
+			no_redo_needed = !lfc_cache_contains(rinfo, forknum, blkno);
 	}

 	LWLockRelease(partitionLock);
--- a/pgxn/neon/relsize_cache.c
+++ b/pgxn/neon/relsize_cache.c
@@ -10,6 +10,7 @@
 */
 #include "postgres.h"

+#include "neon.h"
 #include "neon_pgversioncompat.h"

 #include "pagestore_client.h"
@@ -99,6 +100,8 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
 {
 	bool		found = false;

+	Assert(!neon_enable_new_communicator);
+
 	if (relsize_hash_size > 0)
 	{
 		RelTag		tag;
@@ -130,6 +133,8 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
 void
 set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
 {
+	Assert(!neon_enable_new_communicator);
+
 	if (relsize_hash_size > 0)
 	{
 		RelTag		tag;
@@ -178,6 +183,8 @@ set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
 void
 update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
 {
+	Assert(!neon_enable_new_communicator);
+
 	if (relsize_hash_size > 0)
 	{
 		RelTag		tag;
@@ -212,6 +219,8 @@ update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
 void
 forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum)
 {
+	Assert(!neon_enable_new_communicator);
+
 	if (relsize_hash_size > 0)
 	{
 		RelTag		tag;
--- a/storage_controller/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -5,8 +5,9 @@ use std::sync::Arc;
 use std::time::Duration;

 use anyhow::Context;
-use compute_api::spec::PageserverProtocol;
-use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
+use control_plane::endpoint::{
+    ComputeControlPlane, EndpointStatus, PageserverConnectionInfo, PageserverShardConnectionInfo,
+};
 use control_plane::local_env::LocalEnv;
 use futures::StreamExt;
 use hyper::StatusCode;
@@ -506,27 +507,40 @@ impl ApiMethod for ComputeHookTenant {
            if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running {
                tracing::info!("Reconfiguring pageservers for endpoint {endpoint_name}");

-                let pageservers = shards
-                    .iter()
-                    .map(|shard| {
-                        let ps_conf = env
-                            .get_pageserver_conf(shard.node_id)
-                            .expect("Unknown pageserver");
-                        if endpoint.grpc {
-                            let addr = ps_conf.listen_grpc_addr.as_ref().expect("no gRPC address");
-                            let (host, port) = parse_host_port(addr).expect("invalid gRPC address");
-                            let port = port.unwrap_or(DEFAULT_GRPC_LISTEN_PORT);
-                            (PageserverProtocol::Grpc, host, port)
-                        } else {
-                            let (host, port) = parse_host_port(&ps_conf.listen_pg_addr)
-                                .expect("Unable to parse listen_pg_addr");
-                            (PageserverProtocol::Libpq, host, port.unwrap_or(5432))
-                        }
-                    })
-                    .collect::<Vec<_>>();
+                let mut shard_conninfos = HashMap::new();
+                for shard in shards.iter() {
+                    let ps_conf = env
+                        .get_pageserver_conf(shard.node_id)
+                        .expect("Unknown pageserver");
+
+                    let libpq_url = Some({
+                        let (host, port) = parse_host_port(&ps_conf.listen_pg_addr)
+                            .expect("Unable to parse listen_pg_addr");
+                        let port = port.unwrap_or(5432);
+                        format!("postgres://no_user@{host}:{port}")
+                    });
+                    let grpc_url = if let Some(grpc_addr) = &ps_conf.listen_grpc_addr {
+                        let (host, port) =
+                            parse_host_port(grpc_addr).expect("invalid gRPC address");
+                        let port = port.unwrap_or(DEFAULT_GRPC_LISTEN_PORT);
+                        Some(format!("grpc://no_user@{host}:{port}"))
+                    } else {
+                        None
+                    };
+                    let pageserver = PageserverShardConnectionInfo {
+                        libpq_url,
+                        grpc_url,
+                    };
+                    shard_conninfos.insert(shard.shard_number.0 as u32, pageserver);
+                }
+
+                let pageserver_conninfo = PageserverConnectionInfo {
+                    shards: shard_conninfos,
+                    prefer_grpc: endpoint.grpc,
+                };

                endpoint
-                    .reconfigure_pageservers(pageservers, *stripe_size)
+                    .reconfigure_pageservers(pageserver_conninfo, *stripe_size)
                    .await
                    .map_err(NotifyError::NeonLocal)?;
            }
--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -567,7 +567,7 @@ class NeonLocalCli(AbstractNeonCli):
        basebackup_request_tries: int | None = None,
        timeout: str | None = None,
        env: dict[str, str] | None = None,
-        dev: bool = False,
+        grpc: bool = False,
    ) -> subprocess.CompletedProcess[str]:
        args = [
            "endpoint",
@@ -587,14 +587,14 @@ class NeonLocalCli(AbstractNeonCli):
            args.append(endpoint_id)
        if pageserver_id is not None:
            args.extend(["--pageserver-id", str(pageserver_id)])
+        if grpc:
+            args.extend(["--grpc"])
        if allow_multiple:
            args.extend(["--allow-multiple"])
        if create_test_user:
            args.extend(["--create-test-user"])
        if timeout is not None:
            args.extend(["--start-timeout", str(timeout)])
-        if dev:
-            args.extend(["--dev"])

        res = self.raw_cli(args, extra_env_vars)
        res.check_returncode()
@@ -605,6 +605,7 @@ class NeonLocalCli(AbstractNeonCli):
        endpoint_id: str,
        tenant_id: TenantId | None = None,
        pageserver_id: int | None = None,
+        grpc: bool = False,
        safekeepers: list[int] | None = None,
        check_return_code=True,
    ) -> subprocess.CompletedProcess[str]:
@@ -613,6 +614,8 @@ class NeonLocalCli(AbstractNeonCli):
            args.extend(["--tenant-id", str(tenant_id)])
        if pageserver_id is not None:
            args.extend(["--pageserver-id", str(pageserver_id)])
+        if grpc:
+            args.extend(["--grpc"])
        if safekeepers is not None:
            args.extend(["--safekeepers", (",".join(map(str, safekeepers)))])
        return self.raw_cli(args, check_return_code=check_return_code)
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4286,7 +4286,18 @@ class Endpoint(PgProtocol, LogUtils):

        # set small 'max_replication_write_lag' to enable backpressure
        # and make tests more stable.
-        config_lines = ["max_replication_write_lag=15MB"] + config_lines
+        config_lines += ["max_replication_write_lag=15MB"]
+
+        # If gRPC is enabled, use the new communicator too.
+        #
+        # NB: the new communicator is enabled by default, so explicitly turn it off otherwise.
+        #
+        # XXX: By checking for None, we enable the new communicator for all tests
+        # by default
+        if grpc or grpc is None:
+            config_lines += [f"neon.enable_new_communicator=on"]
+        else:
+            config_lines += [f"neon.enable_new_communicator=off"]

        # Delete file cache if it exists (and we're recreating the endpoint)
        if USE_LFC:
@@ -5332,6 +5343,7 @@ SKIP_FILES = frozenset(
        "postmaster.pid",
        "pg_control",
        "pg_dynshmem",
+        ".metrics.socket",
    )
 )

--- a/test_runner/regress/test_normal_work.py
+++ b/test_runner/regress/test_normal_work.py
@@ -17,7 +17,7 @@ def check_tenant(
    config_lines = [
        f"neon.safekeeper_proto_version = {safekeeper_proto_version}",
    ]
-    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id, config_lines=config_lines)
+    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id, config_lines=config_lines, grpc=True)
    # we rely upon autocommit after each statement
    res_1 = endpoint.safe_psql_many(
        queries=[
Author	SHA1	Message	Date
Erik Grinaker	f6761760a2	Documentation and tweaks	2025-07-01 17:54:41 +02:00
Erik Grinaker	0bce818d5e	Add stream pool	2025-07-01 17:54:41 +02:00
Erik Grinaker	48be1da6ef	Add initial client pool	2025-07-01 17:54:41 +02:00
Erik Grinaker	d2efc80e40	Add initial ChannelPool	2025-07-01 17:54:41 +02:00
Erik Grinaker	958c2577f5	pageserver: tighten up `page_api::Client`	2025-07-01 17:54:41 +02:00
Heikki Linnakangas	175c2e11e3	Add assertions that the legacy relsize cache is not used with new communicator And fix a few cases where it was being called	2025-07-01 16:44:25 +03:00
Heikki Linnakangas	efdb07e7b6	Implement function to check if page is in local cache This is needed for read replicas. There's one more TODO that needs to be implemented before read replicas work though, in neon_extend_rel_size()	2025-07-01 16:22:51 +03:00
Heikki Linnakangas	b0970b415c	Don't call legacy lfc function when new communicator is used	2025-07-01 15:47:26 +03:00
Heikki Linnakangas	7429dd711c	fix the .metrics.socket filename in the ignore list	2025-06-30 23:41:09 +03:00
Heikki Linnakangas	88ac1e356b	Ignore the metrics unix domain socket in tests	2025-06-30 23:39:01 +03:00
Erik Grinaker	c3cb1ab98d	Merge branch 'main' into communicator-rewrite	2025-06-30 21:07:01 +02:00
Erik Grinaker	81ac4ef43a	Add a generic pool prototype	2025-06-30 14:49:34 +02:00
Erik Grinaker	a5b0fc560c	Fix/allow remaining clippy lints	2025-06-30 12:36:20 +02:00
Erik Grinaker	67b04f8ab3	Fix a bunch of linter warnings	2025-06-30 11:10:02 +02:00
Erik Grinaker	9d9e3cd08a	Fix `test_normal_work` grpc param	2025-06-30 10:13:46 +02:00
Heikki Linnakangas	97a8f4ef85	Handle unexpected EOF while doing an LFC read more gracefully There's a bug somewhere because this happens in python regression tests. We need to hunt that down, but in any case, let's not get stuck in an infinite loop if it happens.	2025-06-30 00:59:53 +03:00
Heikki Linnakangas	39f31957e3	Handle pageserver response with different number of pages gracefully Some tests are hitting this case, where pageserver returns 0 page images in the response to a GetPage request. I suspect it's because the code doesn't handle sharding correctly? In any case, let's not panic on it, but return an IO error to the originating backend.	2025-06-29 23:44:28 +03:00
Heikki Linnakangas	924c6a6fdf	Fix handling the case that server closes the stream - avoid panic by checking for Ok(None) response from tonic::Streaming::message() instead of just using unwrap() - There was a race condition, if the caller sent the message, but the receiver task concurrently received Ok(None) indicating the stream was closed. (I didn't see that in action, but I think it could happen by reading the code)	2025-06-29 22:53:39 +03:00
Heikki Linnakangas	7020476bf5	Run `cargo fmt`	2025-06-29 22:53:09 +03:00
Heikki Linnakangas	80e948db93	Remove unused mock factory After reading the code a few times, I didn't quite understand what it was, to be honest, or how it was going to be used. Remove it now to reduce noise, but we can resurrect it from git history if we need it in the future.	2025-06-29 22:52:48 +03:00
Heikki Linnakangas	bfb30d434c	minor code tidy-up	2025-06-29 22:51:34 +03:00
Heikki Linnakangas	f3ba201800	Run `cargo fmt`	2025-06-29 21:21:07 +03:00
Heikki Linnakangas	8b7796cbfa	wip	2025-06-29 21:20:48 +03:00
Heikki Linnakangas	fdc7e9c2a4	Extract repeated code to look up RequestTracker into a helper function	2025-06-29 21:20:14 +03:00
Heikki Linnakangas	a352d290eb	Plumb through both libpq and grpc connection strings to the compute Add a new 'pageserver_connection_info' field in the compute spec. It replaces the old 'pageserver_connstring' field with a more complicated struct that includes both libpq and grpc URLs, for each shard (or only one of the URLs, depending on the configuration). It also includes a flag suggesting which one to use; compute_ctl now uses it to decide which protocol to use for the basebackup. This is compatible with everything that's in production, because the control plane never used the 'pageserver_connstring' field. That was added a long time ago with the idea that it would replace the code that digs the 'neon.pageserver_connstring' GUC from the list of Postgres settings, but we never got around to do that in the control plane. Hence, it was only used with neon_local. But the plan now is to pass the 'pageserver_connection_info' from the control plane, and once that's fully deployed everywhere, the code to parse 'neon.pageserver_connstring' in compute_ctl can be removed. The 'grpc' flag on an endpoint in endpoint config is now more of a suggestion. Compute_ctl gets both URLs, so it can choose to use libpq or grpc as it wishes. It currently always obeys the 'prefer_grpc' flag that's part of the connection info though. Postgres however uses grpc iff the new rust-based communicator is enabled. TODO/plan for the control plane: - Start to pass `pageserver_connection_info` in the spec file. - Also keep the current `neon.pageserver_connstring` setting for now, for backwards compatibility with old computes After that, the `pageserver_connection_info.prefer_grpc` flag in the spec file can be used to control whether compute_ctl uses grpc or libpq. The actual compute's grpc usage will be controlled by the `neon.enable_new_communicator` GUC. It can be set separately from 'prefer_grpc'. 
Later: - Once all old computes are gone, remove the code to pass `neon.pageserver_connstring`	2025-06-29 18:16:49 +03:00
Heikki Linnakangas	8c122a1c98	Don't call into the old LFC when using the new communicator This fixes errors like `index "pg_class_relname_nsp_index" contains unexpected zero page at block 2` when running the python tests smgrzeroextend() still called into the old LFC's lfc_write() function, even when using the new communicator, which zeroed some arbitrary pages in the LFC file, overwriting pages managed by the new LFC implementation managed by `integrated_cache.rs`	2025-06-29 17:40:46 +03:00
Erik Grinaker	e3ecdfbecc	pgxn/neon: actually use UNAME_S	2025-06-26 12:38:44 +02:00
Erik Grinaker	d08e553835	pgxn/neon: fix `callback_get_request_lsn_unsafe` return type	2025-06-26 12:33:59 +02:00
Erik Grinaker	7fffb5b4df	pgxn/neon: fix macOS build	2025-06-26 12:33:39 +02:00
Heikki Linnakangas	46b5c0be0b	Remove duplicated migration script I messed this up during the merge I guess?	2025-06-23 19:46:32 +03:00
Heikki Linnakangas	2d913ff125	fix some mismerges	2025-06-23 18:21:16 +03:00
Heikki Linnakangas	e90be06d46	silence a few compiler warnings about unnecessary 'mut's and 'use's	2025-06-23 18:16:54 +03:00
Heikki Linnakangas	356ba67607	Merge remote-tracking branch 'origin/main' into HEAD I also included build script changes from https://github.com/neondatabase/neon/pull/12266, which is not yet merged but will be soon.	2025-06-23 17:46:30 +03:00
Heikki Linnakangas	1847f4de54	Add missing #include. Got a warning on macos without this	2025-06-18 17:26:20 +03:00
Heikki Linnakangas	e8af3a2811	remove unused struct in example code, to silence compiler warning	2025-06-17 02:09:21 +03:00
Heikki Linnakangas	b603e3dddb	Silence compiler warnings in example code	2025-06-17 02:07:33 +03:00
Heikki Linnakangas	83007782fd	fix compilation of example	2025-06-17 02:07:15 +03:00
Erik Grinaker	782062014e	Fix `test_normal_work` endpoint restart	2025-06-16 10:16:27 +02:00
Erik Grinaker	d0b3629412	Tweak base backups	2025-06-13 13:47:26 -07:00
Erik Grinaker	f4d51c0f5c	Use gRPC for `test_normal_work`	2025-06-09 22:51:15 +02:00
Erik Grinaker	ec17ae0658	Handle gRPC basebackups in compute_ctl	2025-06-09 22:50:57 +02:00
Erik Grinaker	9ecce60ded	Plumb gRPC addr through storage-controller	2025-06-09 20:24:18 +02:00
Erik Grinaker	e74a957045	test_runner: initial gRPC protocol support	2025-06-06 16:56:33 +02:00
Erik Grinaker	396a16a3b2	test_runner: enable gRPC Pageserver	2025-06-06 14:55:29 +02:00
Elizabeth Murray	7140a50225	Minor changes to get integration tests to run for communicator.	2025-06-06 04:32:51 +02:00
Elizabeth Murray	68f18ccacf	Request Tracker Prototype Does not include splitting requests across shards.	2025-06-05 13:32:18 -07:00
Heikki Linnakangas	786888d93f	Instead of a fixed TCP port for metrics, listen on a unix domain socket That avoids clashes if you run two computes at the same time. More secure too. We might want to have a TCP port in the long run, but this is less trouble for now. To see the metrics with curl you can use: curl --unix-socket .neon/endpoints/ep-main/pgdata/.metrics.socket http://localhost/metrics	2025-06-05 21:28:11 +03:00
Heikki Linnakangas	255537dda1	avoid hitting assertion failure in MarkPostmasterChildWalSender()	2025-06-05 20:08:32 +03:00
Erik Grinaker	8b494f6a24	Ignore communicator_bindings.h	2025-06-05 17:52:50 +02:00
Erik Grinaker	28a61741b3	Mangle gRPC connstrings to use port 51051	2025-06-05 17:46:58 +02:00
Erik Grinaker	2fb6164bf8	Misc build fixes	2025-06-05 17:22:11 +02:00
Erik Grinaker	328f28dfe5	impl Default for SlabBlockHeader	2025-06-05 17:18:28 +02:00
Erik Grinaker	95838056da	Fix `RelTag` fields	2025-06-05 17:13:51 +02:00
Erik Grinaker	6d451654f1	Remove generated communicator_bindings.h	2025-06-05 17:12:13 +02:00
Erik Grinaker	37c58522a2	Merge branch 'main' into communicator-rewrite	2025-06-05 15:08:05 +02:00
Erik Grinaker	4b6f02e47d	Merge branch 'main' into communicator-rewrite	2025-06-04 10:23:29 +02:00
Erik Grinaker	8202c6172f	Merge branch 'main' into communicator-rewrite	2025-06-03 16:04:31 +02:00
Erik Grinaker	69a47d789d	pageserver: remove gRPC compute service prototype	2025-06-03 13:47:21 +02:00
Erik Grinaker	b36f880710	Fix Linux build failures	2025-06-03 13:37:56 +02:00
Erik Grinaker	745b750f33	Merge branch 'main' into communicator-rewrite	2025-06-03 13:29:45 +02:00
Heikki Linnakangas	f06bb2bbd8	Implement growing the hash table. Fix unit tests.	2025-05-29 15:54:55 +03:00
Heikki Linnakangas	b3c25418a6	Add metrics to track memory usage of the rust communicator	2025-05-29 02:14:01 +03:00
Heikki Linnakangas	33549bad1d	use separate hash tables for relsize cache and block mappings	2025-05-28 23:57:55 +03:00
Heikki Linnakangas	009168d711	Add placeholder shmem hashmap implementation Use that instead of the half-baked Adaptive Radix Tree implementation. ART would probably be better in the long run, but more complicated to implement.	2025-05-28 11:08:35 +03:00
Elizabeth Murray	7c9bd542a6	Fix compile warnings, minor cleanup.	2025-05-26 06:30:48 -07:00
Elizabeth Murray	014823b305	Add a new iteration of a new client pool with some updates.	2025-05-26 05:29:32 -07:00
Elizabeth Murray	af9379ccf6	Use a semaphore to gate access to connections. Add metrics for testing.	2025-05-26 05:28:50 -07:00
Heikki Linnakangas	bb28109ffa	Merge remote-tracking branch 'origin/main' into communicator-rewrite-with-integrated-cache There were conflicts because of the differences in the page_api protocol that was merged to main vs what was on the branch. I adapted the code for the protocol in main.	2025-05-26 11:52:32 +03:00
Elizabeth Murray	60a0bec1c0	Set default max consumers per connection to a high number.	2025-05-19 07:00:39 -07:00
Elizabeth Murray	31fa7a545d	Remove unnecessary info include now that the info message is gone.	2025-05-19 06:52:07 -07:00
Elizabeth Murray	ac464c5f2c	Return info message that was used for debugging.	2025-05-19 06:39:16 -07:00
Elizabeth Murray	0dddb1e373	Add back whitespace that was removed.	2025-05-19 06:34:52 -07:00
Elizabeth Murray	3acb263e62	Add first iteration of simulating a flakey network with a custom TCP.	2025-05-19 06:33:30 -07:00
Elizabeth Murray	1e83398cdd	Correct out-of-date comment.	2025-05-14 07:31:52 -07:00
Elizabeth Murray	be8ed81532	Connection pool: update error accounting, sweep idle connections, add config options.	2025-05-14 07:31:52 -07:00
Heikki Linnakangas	12b08c4b82	Fix shutdown	2025-05-14 01:49:55 +03:00
Heikki Linnakangas	827358dd03	Handle OOMs a little more gracefully	2025-05-12 23:33:22 +03:00
Heikki Linnakangas	d367273000	minor cleanup	2025-05-12 23:11:55 +03:00
Heikki Linnakangas	e2bad5d9e9	Add debugging HTTP endpoint for dumping the cache tree	2025-05-12 22:54:03 +03:00
Heikki Linnakangas	5623e4665b	bunch of fixes	2025-05-12 18:40:54 +03:00
Heikki Linnakangas	8abb4dab6d	implement shrinking nodes	2025-05-12 03:57:10 +03:00
Heikki Linnakangas	731667ac37	better metrics of the art tree	2025-05-12 02:08:51 +03:00
Heikki Linnakangas	6a1374d106	Pack tree node structs more tightly, avoiding alignment padding	2025-05-12 01:01:58 +03:00
Heikki Linnakangas	f7c908f2f0	more metrics	2025-05-12 01:01:50 +03:00
Heikki Linnakangas	86671e3a0b	Add a bunch of metric counters	2025-05-11 20:11:13 +03:00
Heikki Linnakangas	319cd74f73	Fix eviction	2025-05-11 19:34:50 +03:00
Heikki Linnakangas	0efefbf77c	Add a few metrics, fix page eviction	2025-05-10 03:13:28 +03:00
Heikki Linnakangas	e6a4171fa1	fix concurrency issues with the LFC - Add another locking hash table to track which cached pages are currently being modified, by smgrwrite() or smgrread() or by prefetch. - Use single-value Leaf pages in the art tree. That seems simpler after all, and it eliminates some corner cases where a Value needed to be cloned, which made it tricky to use atomics or other interior mutability on the Values	2025-05-10 02:36:48 +03:00
Heikki Linnakangas	0c25ea9e31	reduce LOG noise	2025-05-09 18:27:36 +03:00
Heikki Linnakangas	6692321026	Remove dependency on io_uring, use plain std::fs ops instead io_uring is a great idea in the long term, but for now, let's make it easier to develop locally on macos, where io_uring is not available.	2025-05-06 17:46:21 +03:00
Heikki Linnakangas	791df28755	Linked list fix and add unit test	2025-05-06 16:46:54 +03:00
Heikki Linnakangas	d20da994f4	git add missing file	2025-05-06 15:36:48 +03:00
Heikki Linnakangas	6dbbdaae73	run 'cargo fmt'	2025-05-06 15:35:56 +03:00
Heikki Linnakangas	977bc09d2a	Bunch of fixes, smarter iterator, metrics exporter	2025-05-06 15:28:50 +03:00
Heikki Linnakangas	44269fcd5e	Implement simple eviction and free block tracking	2025-05-06 15:28:15 +03:00
Heikki Linnakangas	44cc648dc8	Implement iterator over keys the implementation is not very optimized, but probably good enough for an MVP	2025-05-06 15:27:38 +03:00
Heikki Linnakangas	884e028a4a	implement deletion in art tree	2025-05-06 15:27:38 +03:00
Heikki Linnakangas	42df3e5453	debugging stats	2025-05-06 15:27:38 +03:00
Heikki Linnakangas	fc743e284f	more work on allocators	2025-05-06 15:27:38 +03:00
Heikki Linnakangas	d02f9a2139	Collect garbage, handle OOMs	2025-05-06 15:27:38 +03:00
Heikki Linnakangas	083118e98e	Implement epoch system	2025-05-06 15:27:38 +03:00
Heikki Linnakangas	54cd2272f1	more memory allocation stuff	2025-05-06 15:27:38 +03:00
Heikki Linnakangas	e40193e3c8	simple block-based allocator	2025-05-06 15:27:38 +03:00
Heikki Linnakangas	ce9f7bacc1	Fix communicator client for recent changes in protocol and client code	2025-05-06 15:26:51 +03:00
Heikki Linnakangas	b7891f8fe8	Include 'neon-shard-id' header in client requests	2025-05-06 15:23:30 +03:00
Elizabeth Murray	5f2adaa9ad	Remove some additional debug info messages.	2025-05-02 10:50:53 -07:00
Elizabeth Murray	3e5e396c8d	Remove some debug info messages.	2025-05-02 10:24:18 -07:00
Elizabeth Murray	9d781c6fda	Add a connection pool module to the grpc client.	2025-05-02 10:22:33 -07:00
Erik Grinaker	cf5d038472	service documentation	2025-05-02 15:20:12 +02:00
Erik Grinaker	d785100c02	page_api: add `GetPageRequest::class`	2025-05-02 10:48:32 +02:00
Erik Grinaker	2c0d930e3d	page_api: add `GetPageResponse::status`	2025-04-30 16:48:45 +02:00
Erik Grinaker	66171a117b	page_api: add `GetPageRequestBatch`	2025-04-30 15:31:11 +02:00
Erik Grinaker	df2806e7a0	page_api: add `GetPageRequest::id`	2025-04-30 15:00:16 +02:00
Erik Grinaker	07631692db	page_api: protobuf comments	2025-04-30 12:36:11 +02:00
Erik Grinaker	4c77397943	Add `neon-shard-id` header	2025-04-30 11:18:06 +02:00
Erik Grinaker	7bb58be546	Use `authorization` header instead of `neon-auth-token`	2025-04-30 10:38:44 +02:00
Erik Grinaker	b5373de208	page_api: add `get_slru_segment()`	2025-04-29 17:59:27 +02:00
Erik Grinaker	b86c610f42	page_api: tweaks	2025-04-29 17:23:51 +02:00
Erik Grinaker	0f520d79ab	pageserver: rename `data_api` to `page_api`	2025-04-29 15:58:52 +02:00
Heikki Linnakangas	93eb7bb6b8	include lots of changes that went missing by accident	2025-04-29 15:32:27 +03:00
Heikki Linnakangas	e58d0fece1	New communicator, with "integrated" cache accessible from all processes	2025-04-29 11:52:44 +03:00