Rename the communicator switch variable

Just limit strlcpy with BLCKSZ
Fix the comment
2026-05-15 20:20:38 +00:00 · 2025-07-16 15:36:39 +02:00 · 2025-07-15 17:40:35 +02:00 · 2025-07-15 11:58:15 +02:00 · 2025-07-15 11:57:32 +02:00 · 2025-07-15 11:55:27 +02:00
74 changed files with 13226 additions and 913 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ neon.iml
 /.neon
 /integration_tests/.neon
 compaction-suite-results.*
+pgxn/neon/communicator/communicator_bindings.h
 docker-compose/docker-compose-parallel.yml

 # Coverage
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -253,6 +253,17 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"

+[[package]]
+name = "atomic_enum"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99e1aca718ea7b89985790c94aad72d77533063fe00bc497bb79a7c2dae6a661"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.100",
+]
+
 [[package]]
 name = "autocfg"
 version = "1.1.0"
@@ -687,13 +698,40 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "axum"
+version = "0.7.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
+dependencies = [
+ "async-trait",
+ "axum-core 0.4.5",
+ "bytes",
+ "futures-util",
+ "http 1.1.0",
+ "http-body 1.0.0",
+ "http-body-util",
+ "itoa",
+ "matchit 0.7.3",
+ "memchr",
+ "mime",
+ "percent-encoding",
+ "pin-project-lite",
+ "rustversion",
+ "serde",
+ "sync_wrapper 1.0.1",
+ "tower 0.5.2",
+ "tower-layer",
+ "tower-service",
+]
+
 [[package]]
 name = "axum"
 version = "0.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6d6fd624c75e18b3b4c6b9caf42b1afe24437daaee904069137d8bab077be8b8"
 dependencies = [
- "axum-core",
+ "axum-core 0.5.0",
 "base64 0.22.1",
 "bytes",
 "form_urlencoded",
@@ -701,10 +739,10 @@ dependencies = [
 "http 1.1.0",
 "http-body 1.0.0",
 "http-body-util",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "itoa",
- "matchit",
+ "matchit 0.8.4",
 "memchr",
 "mime",
 "percent-encoding",
@@ -724,6 +762,26 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "axum-core"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199"
+dependencies = [
+ "async-trait",
+ "bytes",
+ "futures-util",
+ "http 1.1.0",
+ "http-body 1.0.0",
+ "http-body-util",
+ "mime",
+ "pin-project-lite",
+ "rustversion",
+ "sync_wrapper 1.0.1",
+ "tower-layer",
+ "tower-service",
+]
+
 [[package]]
 name = "axum-core"
 version = "0.5.0"
@@ -750,8 +808,8 @@ version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b"
 dependencies = [
- "axum",
- "axum-core",
+ "axum 0.8.1",
+ "axum-core 0.5.0",
 "bytes",
 "form_urlencoded",
 "futures-util",
@@ -1288,10 +1346,31 @@ dependencies = [

 [[package]]
 name = "communicator"
-version = "0.1.0"
+version = "0.0.0"
 dependencies = [
+ "atomic_enum",
+ "axum 0.8.1",
+ "bytes",
 "cbindgen",
+ "clashmap",
+ "http 1.1.0",
+ "libc",
+ "metrics",
 "neon-shmem",
+ "nix 0.30.1",
+ "pageserver_api",
+ "pageserver_client_grpc",
+ "pageserver_page_api",
+ "prometheus",
+ "prost 0.13.5",
+ "thiserror 1.0.69",
+ "tokio",
+ "tokio-pipe",
+ "tonic 0.12.3",
+ "tracing",
+ "tracing-subscriber",
+ "uring-common",
+ "utils",
 "workspace_hack",
 ]

@@ -1321,7 +1400,7 @@ dependencies = [
 "aws-sdk-kms",
 "aws-sdk-s3",
 "aws-smithy-types",
- "axum",
+ "axum 0.8.1",
 "axum-extra",
 "base64 0.22.1",
 "bytes",
@@ -1626,9 +1705,9 @@ dependencies = [

 [[package]]
 name = "crossbeam-utils"
-version = "0.8.19"
+version = "0.8.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"

 [[package]]
 name = "crossterm"
@@ -2082,7 +2161,7 @@ name = "endpoint_storage"
 version = "0.0.1"
 dependencies = [
 "anyhow",
- "axum",
+ "axum 0.8.1",
 "axum-extra",
 "camino",
 "camino-tempfile",
@@ -2343,6 +2422,12 @@ version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"

+[[package]]
+name = "foldhash"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
+
 [[package]]
 name = "form_urlencoded"
 version = "1.2.1"
@@ -2363,7 +2448,7 @@ dependencies = [
 "futures-core",
 "futures-sink",
 "http-body-util",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "pin-project",
 "rand 0.8.5",
@@ -2533,6 +2618,18 @@ dependencies = [
 "wasm-bindgen",
 ]

+[[package]]
+name = "getrandom"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi",
+ "wasi 0.14.2+wasi-0.2.4",
+]
+
 [[package]]
 name = "gettid"
 version = "0.1.3"
@@ -2698,6 +2795,16 @@ version = "0.15.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"

+[[package]]
+name = "hashbrown"
+version = "0.15.4"
+source = "git+https://github.com/quantumish/hashbrown.git?rev=6610e6d#6610e6d2b1f288ef7b0709a3efefbc846395dc5e"
+dependencies = [
+ "allocator-api2",
+ "equivalent",
+ "foldhash",
+]
+
 [[package]]
 name = "hashlink"
 version = "0.9.1"
@@ -2922,9 +3029,9 @@ dependencies = [

 [[package]]
 name = "httparse"
-version = "1.8.0"
+version = "1.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904"
+checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"

 [[package]]
 name = "httpdate"
@@ -2974,9 +3081,9 @@ dependencies = [

 [[package]]
 name = "hyper"
-version = "1.4.1"
+version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05"
+checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80"
 dependencies = [
 "bytes",
 "futures-channel",
@@ -3016,7 +3123,7 @@ checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c"
 dependencies = [
 "futures-util",
 "http 1.1.0",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "rustls 0.22.4",
 "rustls-pki-types",
@@ -3031,7 +3138,7 @@ version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3203a961e5c83b6f5498933e78b6b263e208c197b63e9c6c53cc82ffd3f63793"
 dependencies = [
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "pin-project-lite",
 "tokio",
@@ -3040,20 +3147,21 @@ dependencies = [

 [[package]]
 name = "hyper-util"
-version = "0.1.7"
+version = "0.1.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9"
+checksum = "dc2fdfdbff08affe55bb779f33b053aa1fe5dd5b54c257343c17edfa55711bdb"
 dependencies = [
 "bytes",
 "futures-channel",
+ "futures-core",
 "futures-util",
 "http 1.1.0",
 "http-body 1.0.0",
- "hyper 1.4.1",
+ "hyper 1.6.0",
+ "libc",
 "pin-project-lite",
 "socket2",
 "tokio",
- "tower 0.4.13",
 "tower-service",
 "tracing",
 ]
@@ -3606,9 +3714,9 @@ checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104"

 [[package]]
 name = "lock_api"
-version = "0.4.10"
+version = "0.4.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16"
+checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765"
 dependencies = [
 "autocfg",
 "scopeguard",
@@ -3651,6 +3759,12 @@ dependencies = [
 "regex-automata 0.1.10",
 ]

+[[package]]
+name = "matchit"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
+
 [[package]]
 name = "matchit"
 version = "0.8.4"
@@ -3758,8 +3872,8 @@ dependencies = [
 "procfs",
 "prometheus",
 "rand 0.8.5",
- "rand_distr",
- "twox-hash",
+ "rand_distr 0.4.3",
+ "twox-hash 1.6.3",
 ]

 [[package]]
@@ -3846,10 +3960,33 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
 name = "neon-shmem"
 version = "0.1.0"
 dependencies = [
+ "ahash",
+ "criterion",
+ "foldhash",
+ "hashbrown 0.15.4",
+ "libc",
+ "lock_api",
 "nix 0.30.1",
+ "rand 0.9.1",
+ "rand_distr 0.5.1",
+ "rustc-hash 2.1.1",
+ "seahash",
 "tempfile",
 "thiserror 1.0.69",
+ "twox-hash 2.1.1",
 "workspace_hack",
+ "xxhash-rust",
+]
+
+[[package]]
+name = "neonart"
+version = "0.1.0"
+dependencies = [
+ "crossbeam-utils",
+ "rand 0.9.1",
+ "rand_distr 0.5.1",
+ "spin",
+ "tracing",
 ]

 [[package]]
@@ -4285,13 +4422,16 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-trait",
+ "axum 0.8.1",
 "bytes",
 "camino",
 "clap",
 "futures",
 "hdrhistogram",
+ "http 1.1.0",
 "humantime",
 "humantime-serde",
+ "metrics",
 "pageserver_api",
 "pageserver_client",
 "pageserver_client_grpc",
@@ -4380,6 +4520,7 @@ dependencies = [
 "pageserver_client",
 "pageserver_compaction",
 "pageserver_page_api",
+ "peekable",
 "pem",
 "pin-project-lite",
 "postgres-protocol",
@@ -4393,6 +4534,7 @@ dependencies = [
 "pprof",
 "pq_proto",
 "procfs",
+ "prost 0.13.5",
 "rand 0.8.5",
 "range-set-blaze",
 "regex",
@@ -4429,7 +4571,7 @@ dependencies = [
 "tower 0.5.2",
 "tracing",
 "tracing-utils",
- "twox-hash",
+ "twox-hash 1.6.3",
 "url",
 "utils",
 "uuid",
@@ -4641,7 +4783,7 @@ dependencies = [
 "paste",
 "seq-macro",
 "thrift",
- "twox-hash",
+ "twox-hash 1.6.3",
 "zstd",
 "zstd-sys",
 ]
@@ -4687,6 +4829,15 @@ dependencies = [
 "sha2",
 ]

+[[package]]
+name = "peekable"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "225f9651e475709164f871dc2f5724956be59cb9edb055372ffeeab01ec2d20b"
+dependencies = [
+ "smallvec",
+]
+
 [[package]]
 name = "pem"
 version = "3.0.3"
@@ -5320,7 +5471,7 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "hyper 0.14.30",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "indexmap 2.9.0",
 "ipnet",
@@ -5344,7 +5495,7 @@ dependencies = [
 "postgres_backend",
 "pq_proto",
 "rand 0.8.5",
- "rand_distr",
+ "rand_distr 0.4.3",
 "rcgen",
 "redis",
 "regex",
@@ -5448,6 +5599,12 @@ dependencies = [
 "proc-macro2",
 ]

+[[package]]
+name = "r-efi"
+version = "5.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
+
 [[package]]
 name = "rand"
 version = "0.7.3"
@@ -5472,6 +5629,16 @@ dependencies = [
 "rand_core 0.6.4",
 ]

+[[package]]
+name = "rand"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
+dependencies = [
+ "rand_chacha 0.9.0",
+ "rand_core 0.9.3",
+]
+
 [[package]]
 name = "rand_chacha"
 version = "0.2.2"
@@ -5492,6 +5659,16 @@ dependencies = [
 "rand_core 0.6.4",
 ]

+[[package]]
+name = "rand_chacha"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.9.3",
+]
+
 [[package]]
 name = "rand_core"
 version = "0.5.1"
@@ -5510,6 +5687,15 @@ dependencies = [
 "getrandom 0.2.11",
 ]

+[[package]]
+name = "rand_core"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
+dependencies = [
+ "getrandom 0.3.3",
+]
+
 [[package]]
 name = "rand_distr"
 version = "0.4.3"
@@ -5520,6 +5706,16 @@ dependencies = [
 "rand 0.8.5",
 ]

+[[package]]
+name = "rand_distr"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
+dependencies = [
+ "num-traits",
+ "rand 0.9.1",
+]
+
 [[package]]
 name = "rand_hc"
 version = "0.2.0"
@@ -5718,7 +5914,7 @@ dependencies = [
 "http-body-util",
 "http-types",
 "humantime-serde",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "itertools 0.10.5",
 "metrics",
 "once_cell",
@@ -5758,7 +5954,7 @@ dependencies = [
 "http 1.1.0",
 "http-body 1.0.0",
 "http-body-util",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-rustls 0.26.0",
 "hyper-util",
 "ipnet",
@@ -5815,7 +6011,7 @@ dependencies = [
 "futures",
 "getrandom 0.2.11",
 "http 1.1.0",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "parking_lot 0.11.2",
 "reqwest",
 "reqwest-middleware",
@@ -5836,7 +6032,7 @@ dependencies = [
 "async-trait",
 "getrandom 0.2.11",
 "http 1.1.0",
- "matchit",
+ "matchit 0.8.4",
 "opentelemetry",
 "reqwest",
 "reqwest-middleware",
@@ -6323,6 +6519,12 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "621e3680f3e07db4c9c2c3fb07c6223ab2fab2e54bd3c04c3ae037990f428c32"

+[[package]]
+name = "seahash"
+version = "4.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"
+
 [[package]]
 name = "sec1"
 version = "0.3.0"
@@ -6784,12 +6986,12 @@ dependencies = [

 [[package]]
 name = "socket2"
-version = "0.5.5"
+version = "0.5.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9"
+checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678"
 dependencies = [
 "libc",
- "windows-sys 0.48.0",
+ "windows-sys 0.52.0",
 ]

 [[package]]
@@ -6797,6 +6999,9 @@ name = "spin"
 version = "0.9.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
+dependencies = [
+ "lock_api",
+]

 [[package]]
 name = "spinning_top"
@@ -6855,7 +7060,7 @@ dependencies = [
 "http-body-util",
 "http-utils",
 "humantime",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "metrics",
 "once_cell",
@@ -7464,6 +7669,16 @@ dependencies = [
 "syn 2.0.100",
 ]

+[[package]]
+name = "tokio-pipe"
+version = "0.2.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f213a84bffbd61b8fa0ba8a044b4bbe35d471d0b518867181e82bd5c15542784"
+dependencies = [
+ "libc",
+ "tokio",
+]
+
 [[package]]
 name = "tokio-postgres"
 version = "0.7.10"
@@ -7658,16 +7873,25 @@ version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52"
 dependencies = [
+ "async-stream",
 "async-trait",
+ "axum 0.7.9",
 "base64 0.22.1",
 "bytes",
+ "h2 0.4.4",
 "http 1.1.0",
 "http-body 1.0.0",
 "http-body-util",
+ "hyper 1.6.0",
+ "hyper-timeout",
+ "hyper-util",
 "percent-encoding",
 "pin-project",
 "prost 0.13.5",
+ "socket2",
+ "tokio",
 "tokio-stream",
+ "tower 0.4.13",
 "tower-layer",
 "tower-service",
 "tracing",
@@ -7680,7 +7904,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9"
 dependencies = [
 "async-trait",
- "axum",
+ "axum 0.8.1",
 "base64 0.22.1",
 "bytes",
 "flate2",
@@ -7688,7 +7912,7 @@ dependencies = [
 "http 1.1.0",
 "http-body 1.0.0",
 "http-body-util",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-timeout",
 "hyper-util",
 "percent-encoding",
@@ -7741,11 +7965,16 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
 dependencies = [
 "futures-core",
 "futures-util",
+ "indexmap 1.9.3",
 "pin-project",
 "pin-project-lite",
+ "rand 0.8.5",
+ "slab",
 "tokio",
+ "tokio-util",
 "tower-layer",
 "tower-service",
+ "tracing",
 ]

 [[package]]
@@ -8016,6 +8245,15 @@ dependencies = [
 "static_assertions",
 ]

+[[package]]
+name = "twox-hash"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b907da542cbced5261bd3256de1b3a1bf340a3d37f93425a07362a1d687de56"
+dependencies = [
+ "rand 0.9.1",
+]
+
 [[package]]
 name = "typed-json"
 version = "0.1.1"
@@ -8229,7 +8467,7 @@ name = "vm_monitor"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "axum",
+ "axum 0.8.1",
 "cgroups-rs",
 "clap",
 "futures",
@@ -8341,6 +8579,15 @@ version = "0.11.0+wasi-snapshot-preview1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

+[[package]]
+name = "wasi"
+version = "0.14.2+wasi-0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
+dependencies = [
+ "wit-bindgen-rt",
+]
+
 [[package]]
 name = "wasite"
 version = "0.1.0"
@@ -8698,6 +8945,15 @@ dependencies = [
 "windows-sys 0.48.0",
 ]

+[[package]]
+name = "wit-bindgen-rt"
+version = "0.39.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
+dependencies = [
+ "bitflags 2.8.0",
+]
+
 [[package]]
 name = "workspace_hack"
 version = "0.1.0"
@@ -8705,8 +8961,8 @@ dependencies = [
 "ahash",
 "anstream",
 "anyhow",
- "axum",
- "axum-core",
+ "axum 0.8.1",
+ "axum-core 0.5.0",
 "base64 0.21.7",
 "base64ct",
 "bytes",
@@ -8740,7 +8996,7 @@ dependencies = [
 "hex",
 "hmac",
 "hyper 0.14.30",
- "hyper 1.4.1",
+ "hyper 1.6.0",
 "hyper-util",
 "indexmap 2.9.0",
 "itertools 0.12.1",
@@ -8865,6 +9121,12 @@ version = "0.13.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd"

+[[package]]
+name = "xxhash-rust"
+version = "0.8.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3"
+
 [[package]]
 name = "yasna"
 version = "0.5.2"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -35,6 +35,7 @@ members = [
    "libs/pq_proto",
    "libs/tenant_size_model",
    "libs/metrics",
+    "libs/neonart",
    "libs/postgres_connection",
    "libs/remote_storage",
    "libs/tracing-utils",
@@ -92,6 +93,7 @@ clap = { version = "4.0", features = ["derive", "env"] }
 clashmap = { version = "1.0", features = ["raw-api"] }
 comfy-table = "7.1"
 const_format = "0.2"
+crossbeam-utils = "0.8.21"
 crc32c = "0.6"
 diatomic-waker = { version = "0.2.3" }
 either = "1.8"
@@ -150,6 +152,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
 parquet_derive = "53"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pem = "3.0.3"
+peekable = "0.3.0"
 pin-project-lite = "0.2"
 pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
 procfs = "0.16"
@@ -186,6 +189,7 @@ smallvec = "1.11"
 smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
 spki = "0.7.3"
+spin = "0.9.8"
 strum = "0.26"
 strum_macros = "0.26"
 "subtle"  = "2.5.0"
@@ -197,7 +201,6 @@ thiserror = "1.0"
 tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
 tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
 tokio = { version = "1.43.1", features = ["macros"] }
-tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.12.0"
 tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
@@ -239,6 +242,9 @@ x509-cert = { version = "0.2.5" }
 env_logger = "0.11"
 log = "0.4"

+tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
+uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
+
 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
 postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
 postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,4 +1,4 @@
-use anyhow::{Context, Result};
+use anyhow::{Context, Result, anyhow};
 use chrono::{DateTime, Utc};
 use compute_api::privilege::Privilege;
 use compute_api::responses::{
@@ -6,7 +6,8 @@ use compute_api::responses::{
    LfcPrewarmState, PromoteState, TlsConfig,
 };
 use compute_api::spec::{
-    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,
+    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverConnectionInfo,
+    PageserverShardConnectionInfo, PgIdent,
 };
 use futures::StreamExt;
 use futures::future::join_all;
@@ -225,7 +226,7 @@ pub struct ParsedSpec {
    pub spec: ComputeSpec,
    pub tenant_id: TenantId,
    pub timeline_id: TimelineId,
-    pub pageserver_connstr: String,
+    pub pageserver_conninfo: PageserverConnectionInfo,
    pub safekeeper_connstrings: Vec<String>,
    pub storage_auth_token: Option<String>,
    /// k8s dns name and port
@@ -272,6 +273,27 @@ impl ParsedSpec {
    }
 }

+fn extract_pageserver_conninfo_from_guc(
+    pageserver_connstring_guc: &str,
+) -> PageserverConnectionInfo {
+    PageserverConnectionInfo {
+        shards: pageserver_connstring_guc
+            .split(',')
+            .enumerate()
+            .map(|(i, connstr)| {
+                (
+                    i as u32,
+                    PageserverShardConnectionInfo {
+                        libpq_url: Some(connstr.to_string()),
+                        grpc_url: None,
+                    },
+                )
+            })
+            .collect(),
+        prefer_grpc: false,
+    }
+}
+
 impl TryFrom<ComputeSpec> for ParsedSpec {
    type Error = String;
    fn try_from(spec: ComputeSpec) -> Result<Self, String> {
@@ -281,11 +303,17 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
        // For backwards-compatibility, the top-level fields in the spec file
        // may be empty. In that case, we need to dig them from the GUCs in the
        // cluster.settings field.
-        let pageserver_connstr = spec
-            .pageserver_connstring
-            .clone()
-            .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
-            .ok_or("pageserver connstr should be provided")?;
+        let pageserver_conninfo = match &spec.pageserver_connection_info {
+            Some(x) => x.clone(),
+            None => {
+                if let Some(guc) = spec.cluster.settings.find("neon.pageserver_connstring") {
+                    extract_pageserver_conninfo_from_guc(&guc)
+                } else {
+                    return Err("pageserver connstr should be provided".to_string());
+                }
+            }
+        };
+
        let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
            if matches!(spec.mode, ComputeMode::Primary) {
                spec.cluster
@@ -335,7 +363,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {

        let res = ParsedSpec {
            spec,
-            pageserver_connstr,
+            pageserver_conninfo,
            safekeeper_connstrings,
            storage_auth_token,
            tenant_id,
@@ -425,7 +453,7 @@ impl ComputeNode {

        let mut new_state = ComputeState::new();
        if let Some(spec) = config.spec {
-            let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
+            let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow!(msg))?;
            new_state.pspec = Some(pspec);
        }

@@ -1032,12 +1060,11 @@ impl ComputeNode {
    fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let spec = compute_state.pspec.as_ref().expect("spec must be set");

-        let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
        let started = Instant::now();
-
-        let (connected, size) = match PageserverProtocol::from_connstring(shard0_connstr)? {
-            PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?,
-            PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
+        let (connected, size) = if spec.pageserver_conninfo.prefer_grpc {
+            self.try_get_basebackup_grpc(spec, lsn)?
+        } else {
+            self.try_get_basebackup_libpq(spec, lsn)?
        };

        let mut state = self.state.lock().unwrap();
@@ -1052,20 +1079,21 @@ impl ComputeNode {
    /// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
    /// the connection was established, and the (compressed) size of the basebackup.
    fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
-        let shard0_connstr = spec
-            .pageserver_connstr
-            .split(',')
-            .next()
-            .unwrap()
-            .to_string();
-        let shard_index = match spec.pageserver_connstr.split(',').count() as u8 {
+        let shard0 = spec
+            .pageserver_conninfo
+            .shards
+            .get(&0)
+            .expect("shard 0 connection info missing");
+        let shard0_url = shard0.grpc_url.clone().expect("no grpc_url for shard 0");
+
+        let shard_index = match spec.pageserver_conninfo.shards.len() as u8 {
            0 | 1 => ShardIndex::unsharded(),
            count => ShardIndex::new(ShardNumber(0), ShardCount(count)),
        };

        let (reader, connected) = tokio::runtime::Handle::current().block_on(async move {
            let mut client = page_api::Client::connect(
-                shard0_connstr,
+                shard0_url,
                spec.tenant_id,
                spec.timeline_id,
                shard_index,
@@ -1100,8 +1128,13 @@ impl ComputeNode {
    /// Fetches a basebackup via libpq. The connstring must use postgresql://. Returns the timestamp
    /// when the connection was established, and the (compressed) size of the basebackup.
    fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
-        let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
-        let mut config = postgres::Config::from_str(shard0_connstr)?;
+        let shard0 = spec
+            .pageserver_conninfo
+            .shards
+            .get(&0)
+            .expect("shard 0 connection info missing");
+        let shard0_connstr = shard0.libpq_url.clone().expect("no libpq_url for shard 0");
+        let mut config = postgres::Config::from_str(&shard0_connstr)?;

        // Use the storage auth token from the config file, if given.
        // Note: this overrides any password set in the connection string.
@@ -1187,10 +1220,7 @@ impl ComputeNode {
                    return result;
                }
                Err(ref e) if attempts < max_attempts => {
-                    warn!(
-                        "Failed to get basebackup: {} (attempt {}/{})",
-                        e, attempts, max_attempts
-                    );
+                    warn!("Failed to get basebackup: {e:?} (attempt {attempts}/{max_attempts})");
                    std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
                    retry_period_ms *= 1.5;
                }
@@ -1399,16 +1429,8 @@ impl ComputeNode {
            }
        };

-        info!(
-            "getting basebackup@{} from pageserver {}",
-            lsn, &pspec.pageserver_connstr
-        );
-        self.get_basebackup(compute_state, lsn).with_context(|| {
-            format!(
-                "failed to get basebackup@{} from pageserver {}",
-                lsn, &pspec.pageserver_connstr
-            )
-        })?;
+        self.get_basebackup(compute_state, lsn)
+            .with_context(|| format!("failed to get basebackup@{lsn}"))?;

        // Update pg_hba.conf received with basebackup.
        update_pg_hba(pgdata_path)?;
@@ -2074,7 +2096,7 @@ LIMIT 100",
            self.params
                .remote_ext_base_url
                .as_ref()
-                .ok_or(DownloadError::BadInput(anyhow::anyhow!(
+                .ok_or(DownloadError::BadInput(anyhow!(
                    "Remote extensions storage is not configured",
                )))?;

@@ -2270,7 +2292,7 @@ LIMIT 100",
        let remote_extensions = spec
            .remote_extensions
            .as_ref()
-            .ok_or(anyhow::anyhow!("Remote extensions are not configured"))?;
+            .ok_or(anyhow!("Remote extensions are not configured"))?;

        info!("parse shared_preload_libraries from spec.cluster.settings");
        let mut libs_vec = Vec::new();
@@ -2349,22 +2371,22 @@ LIMIT 100",
    /// The operation will time out after a specified duration.
    pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
        let state = self.state.lock().unwrap();
-        let old_pageserver_connstr = state
+        let old_pageserver_conninfo = state
            .pspec
            .as_ref()
            .expect("spec must be set")
-            .pageserver_connstr
+            .pageserver_conninfo
            .clone();
        let mut unchanged = true;
        let _ = self
            .state_changed
            .wait_timeout_while(state, duration, |s| {
-                let pageserver_connstr = &s
+                let pageserver_conninfo = &s
                    .pspec
                    .as_ref()
                    .expect("spec must be set")
-                    .pageserver_connstr;
-                unchanged = pageserver_connstr == &old_pageserver_connstr;
+                    .pageserver_conninfo;
+                unchanged = pageserver_conninfo == &old_pageserver_conninfo;
                unchanged
            })
            .unwrap();
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -56,9 +56,51 @@ pub fn write_postgres_conf(

    // Add options for connecting to storage
    writeln!(file, "# Neon storage settings")?;
-    if let Some(s) = &spec.pageserver_connstring {
-        writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
+
+    if let Some(conninfo) = &spec.pageserver_connection_info {
+        let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
+        let mut grpc_urls: Option<Vec<String>> = Some(Vec::new());
+
+        for shardno in 0..conninfo.shards.len() {
+            let info = conninfo.shards.get(&(shardno as u32)).ok_or_else(|| {
+                anyhow::anyhow!("shard {shardno} missing from pageserver_connection_info shard map")
+            })?;
+
+            if let Some(url) = &info.libpq_url {
+                if let Some(ref mut urls) = libpq_urls {
+                    urls.push(url.clone());
+                }
+            } else {
+                libpq_urls = None
+            }
+            if let Some(url) = &info.grpc_url {
+                if let Some(ref mut urls) = grpc_urls {
+                    urls.push(url.clone());
+                }
+            } else {
+                grpc_urls = None
+            }
+        }
+        if let Some(libpq_urls) = libpq_urls {
+            writeln!(
+                file,
+                "neon.pageserver_connstring={}",
+                escape_conf_value(&libpq_urls.join(","))
+            )?;
+        } else {
+            writeln!(file, "# no neon.pageserver_connstring")?;
+        }
+        if let Some(grpc_urls) = grpc_urls {
+            writeln!(
+                file,
+                "neon.pageserver_grpc_urls={}",
+                escape_conf_value(&grpc_urls.join(","))
+            )?;
+        } else {
+            writeln!(file, "# no neon.pageserver_grpc_urls")?;
+        }
    }
+
    if let Some(stripe_size) = spec.shard_stripe_size {
        writeln!(file, "neon.stripe_size={stripe_size}")?;
    }
--- a/compute_tools/src/lsn_lease.rs
+++ b/compute_tools/src/lsn_lease.rs
@@ -4,8 +4,7 @@ use std::thread;
 use std::time::{Duration, SystemTime};

 use anyhow::{Result, bail};
-use compute_api::spec::{ComputeMode, PageserverProtocol};
-use itertools::Itertools as _;
+use compute_api::spec::{ComputeMode, PageserverConnectionInfo};
 use pageserver_page_api as page_api;
 use postgres::{NoTls, SimpleQueryMessage};
 use tracing::{info, warn};
@@ -78,17 +77,16 @@ fn acquire_lsn_lease_with_retry(

    loop {
        // Note: List of pageservers is dynamic, need to re-read configs before each attempt.
-        let (connstrings, auth) = {
+        let (conninfo, auth) = {
            let state = compute.state.lock().unwrap();
            let spec = state.pspec.as_ref().expect("spec must be set");
            (
-                spec.pageserver_connstr.clone(),
+                spec.pageserver_conninfo.clone(),
                spec.storage_auth_token.clone(),
            )
        };

-        let result =
-            try_acquire_lsn_lease(&connstrings, auth.as_deref(), tenant_id, timeline_id, lsn);
+        let result = try_acquire_lsn_lease(conninfo, auth.as_deref(), tenant_id, timeline_id, lsn);
        match result {
            Ok(Some(res)) => {
                return Ok(res);
@@ -112,17 +110,16 @@ fn acquire_lsn_lease_with_retry(

 /// Tries to acquire LSN leases on all Pageserver shards.
 fn try_acquire_lsn_lease(
-    connstrings: &str,
+    conninfo: PageserverConnectionInfo,
    auth: Option<&str>,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    lsn: Lsn,
 ) -> Result<Option<SystemTime>> {
-    let connstrings = connstrings.split(',').collect_vec();
-    let shard_count = connstrings.len();
+    let shard_count = conninfo.shards.len();
    let mut leases = Vec::new();

-    for (shard_number, &connstring) in connstrings.iter().enumerate() {
+    for (shard_number, shard) in conninfo.shards.into_iter() {
        let tenant_shard_id = match shard_count {
            0 | 1 => TenantShardId::unsharded(tenant_id),
            shard_count => TenantShardId {
@@ -132,13 +129,22 @@ fn try_acquire_lsn_lease(
            },
        };

-        let lease = match PageserverProtocol::from_connstring(connstring)? {
-            PageserverProtocol::Libpq => {
-                acquire_lsn_lease_libpq(connstring, auth, tenant_shard_id, timeline_id, lsn)?
-            }
-            PageserverProtocol::Grpc => {
-                acquire_lsn_lease_grpc(connstring, auth, tenant_shard_id, timeline_id, lsn)?
-            }
+        let lease = if conninfo.prefer_grpc {
+            acquire_lsn_lease_grpc(
+                &shard.grpc_url.unwrap(),
+                auth,
+                tenant_shard_id,
+                timeline_id,
+                lsn,
+            )?
+        } else {
+            acquire_lsn_lease_libpq(
+                &shard.libpq_url.unwrap(),
+                auth,
+                tenant_shard_id,
+                timeline_id,
+                lsn,
+            )?
        };
        leases.push(lease);
    }
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -16,7 +16,7 @@ use std::time::Duration;
 use anyhow::{Context, Result, anyhow, bail};
 use clap::Parser;
 use compute_api::requests::ComputeClaimsScope;
-use compute_api::spec::{ComputeMode, PageserverProtocol};
+use compute_api::spec::{ComputeMode, PageserverConnectionInfo, PageserverShardConnectionInfo};
 use control_plane::broker::StorageBroker;
 use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode};
 use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
@@ -1516,29 +1516,35 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                )?;
            }

-            let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
-                let conf = env.get_pageserver_conf(pageserver_id).unwrap();
-                // Use gRPC if requested.
-                let pageserver = if endpoint.grpc {
-                    let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
-                    let (host, port) = parse_host_port(grpc_addr)?;
-                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
-                    (PageserverProtocol::Grpc, host, port)
-                } else {
+            let (shards, stripe_size) = if let Some(ps_id) = pageserver_id {
+                let conf = env.get_pageserver_conf(ps_id).unwrap();
+                let libpq_url = Some({
                    let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
                    let port = port.unwrap_or(5432);
-                    (PageserverProtocol::Libpq, host, port)
+                    format!("postgres://no_user@{host}:{port}")
+                });
+                let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
+                    let (host, port) = parse_host_port(grpc_addr)?;
+                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
+                    Some(format!("grpc://no_user@{host}:{port}"))
+                } else {
+                    None
                };
+                let pageserver = PageserverShardConnectionInfo {
+                    libpq_url,
+                    grpc_url,
+                };
+
                // If caller is telling us what pageserver to use, this is not a tenant which is
                // fully managed by storage controller, therefore not sharded.
-                (vec![pageserver], DEFAULT_STRIPE_SIZE)
+                (vec![(0, pageserver)], DEFAULT_STRIPE_SIZE)
            } else {
                // Look up the currently attached location of the tenant, and its striping metadata,
                // to pass these on to postgres.
                let storage_controller = StorageController::from_env(env);
                let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
-                let pageservers = futures::future::try_join_all(
-                    locate_result.shards.into_iter().map(|shard| async move {
+                let shards = futures::future::try_join_all(locate_result.shards.into_iter().map(
+                    |shard| async move {
                        if let ComputeMode::Static(lsn) = endpoint.mode {
                            // Initialize LSN leases for static computes.
                            let conf = env.get_pageserver_conf(shard.node_id).unwrap();
@@ -1550,28 +1556,34 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                                .await?;
                        }

-                        let pageserver = if endpoint.grpc {
-                            (
-                                PageserverProtocol::Grpc,
-                                Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))?,
-                                shard.listen_grpc_port.expect("no gRPC port"),
-                            )
+                        let libpq_host = Host::parse(&shard.listen_pg_addr)?;
+                        let libpq_port = shard.listen_pg_port;
+                        let libpq_url =
+                            Some(format!("postgres://no_user@{libpq_host}:{libpq_port}"));
+
+                        let grpc_url = if let Some(grpc_host) = shard.listen_grpc_addr {
+                            let grpc_port = shard.listen_grpc_port.expect("no gRPC port");
+                            Some(format!("grpc://no_user@{grpc_host}:{grpc_port}"))
                        } else {
-                            (
-                                PageserverProtocol::Libpq,
-                                Host::parse(&shard.listen_pg_addr)?,
-                                shard.listen_pg_port,
-                            )
+                            None
                        };
-                        anyhow::Ok(pageserver)
-                    }),
-                )
+                        let pageserver = PageserverShardConnectionInfo {
+                            libpq_url,
+                            grpc_url,
+                        };
+                        anyhow::Ok((shard.shard_id.shard_number.0 as u32, pageserver))
+                    },
+                ))
                .await?;
                let stripe_size = locate_result.shard_params.stripe_size;

-                (pageservers, stripe_size)
+                (shards, stripe_size)
+            };
+            assert!(!shards.is_empty());
+            let pageserver_conninfo = PageserverConnectionInfo {
+                shards: shards.into_iter().collect(),
+                prefer_grpc: endpoint.grpc,
            };
-            assert!(!pageservers.is_empty());

            let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?;
            let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
@@ -1601,7 +1613,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                endpoint_storage_addr,
                safekeepers_generation,
                safekeepers,
-                pageservers,
+                pageserver_conninfo,
                remote_ext_base_url: remote_ext_base_url.clone(),
                shard_stripe_size: stripe_size.0 as usize,
                create_test_user: args.create_test_user,
@@ -1620,20 +1632,27 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                .endpoints
                .get(endpoint_id.as_str())
                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
-            let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id {
+            let shards = if let Some(ps_id) = args.endpoint_pageserver_id {
                let conf = env.get_pageserver_conf(ps_id)?;
-                // Use gRPC if requested.
-                let pageserver = if endpoint.grpc {
-                    let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
-                    let (host, port) = parse_host_port(grpc_addr)?;
-                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
-                    (PageserverProtocol::Grpc, host, port)
-                } else {
+                let libpq_url = Some({
                    let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
                    let port = port.unwrap_or(5432);
-                    (PageserverProtocol::Libpq, host, port)
+                    format!("postgres://no_user@{host}:{port}")
+                });
+                let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
+                    let (host, port) = parse_host_port(grpc_addr)?;
+                    let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
+                    Some(format!("grpc://no_user@{host}:{port}"))
+                } else {
+                    None
                };
-                vec![pageserver]
+                let pageserver = PageserverShardConnectionInfo {
+                    libpq_url,
+                    grpc_url,
+                };
+                // If caller is telling us what pageserver to use, this is not a tenant which is
+                // fully managed by storage controller, therefore not sharded.
+                vec![(0, pageserver)]
            } else {
                let storage_controller = StorageController::from_env(env);
                storage_controller
@@ -1643,28 +1662,36 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                    .into_iter()
                    .map(|shard| {
                        // Use gRPC if requested.
-                        if endpoint.grpc {
-                            (
-                                PageserverProtocol::Grpc,
-                                Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))
-                                    .expect("bad hostname"),
-                                shard.listen_grpc_port.expect("no gRPC port"),
-                            )
+                        let libpq_host = Host::parse(&shard.listen_pg_addr).expect("bad hostname");
+                        let libpq_port = shard.listen_pg_port;
+                        let libpq_url =
+                            Some(format!("postgres://no_user@{libpq_host}:{libpq_port}"));
+
+                        let grpc_url = if let Some(grpc_host) = shard.listen_grpc_addr {
+                            let grpc_port = shard.listen_grpc_port.expect("no gRPC port");
+                            Some(format!("grpc://no_user@{grpc_host}:{grpc_port}"))
                        } else {
-                            (
-                                PageserverProtocol::Libpq,
-                                Host::parse(&shard.listen_pg_addr).expect("bad hostname"),
-                                shard.listen_pg_port,
-                            )
-                        }
+                            None
+                        };
+                        (
+                            shard.shard_id.shard_number.0 as u32,
+                            PageserverShardConnectionInfo {
+                                libpq_url,
+                                grpc_url,
+                            },
+                        )
                    })
                    .collect::<Vec<_>>()
            };
+            let pageserver_conninfo = PageserverConnectionInfo {
+                shards: shards.into_iter().collect(),
+                prefer_grpc: endpoint.grpc,
+            };
            // If --safekeepers argument is given, use only the listed
            // safekeeper nodes; otherwise all from the env.
            let safekeepers = parse_safekeepers(&args.safekeepers)?;
            endpoint
-                .reconfigure(Some(pageservers), None, safekeepers, None)
+                .reconfigure(Some(pageserver_conninfo), None, safekeepers, None)
                .await?;
        }
        EndpointCmd::Stop(args) => {
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -56,9 +56,13 @@ use compute_api::responses::{
    TlsConfig,
 };
 use compute_api::spec::{
-    Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol,
-    PgIdent, RemoteExtSpec, Role,
+    Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
+    RemoteExtSpec, Role,
 };
+
+// re-export these, because they're used in the reconfigure() function
+pub use compute_api::spec::{PageserverConnectionInfo, PageserverShardConnectionInfo};
+
 use jsonwebtoken::jwk::{
    AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
    OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse,
@@ -74,7 +78,6 @@ use sha2::{Digest, Sha256};
 use spki::der::Decode;
 use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef};
 use tracing::debug;
-use url::Host;
 use utils::id::{NodeId, TenantId, TimelineId};

 use crate::local_env::LocalEnv;
@@ -379,7 +382,7 @@ pub struct EndpointStartArgs {
    pub endpoint_storage_addr: String,
    pub safekeepers_generation: Option<SafekeeperGeneration>,
    pub safekeepers: Vec<NodeId>,
-    pub pageservers: Vec<(PageserverProtocol, Host, u16)>,
+    pub pageserver_conninfo: PageserverConnectionInfo,
    pub remote_ext_base_url: Option<String>,
    pub shard_stripe_size: usize,
    pub create_test_user: bool,
@@ -653,14 +656,6 @@ impl Endpoint {
        }
    }

-    fn build_pageserver_connstr(pageservers: &[(PageserverProtocol, Host, u16)]) -> String {
-        pageservers
-            .iter()
-            .map(|(scheme, host, port)| format!("{scheme}://no_user@{host}:{port}"))
-            .collect::<Vec<_>>()
-            .join(",")
-    }
-
    /// Map safekeepers ids to the actual connection strings.
    fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
        let mut safekeeper_connstrings = Vec::new();
@@ -706,9 +701,6 @@ impl Endpoint {
            std::fs::remove_dir_all(self.pgdata())?;
        }

-        let pageserver_connstring = Self::build_pageserver_connstr(&args.pageservers);
-        assert!(!pageserver_connstring.is_empty());
-
        let safekeeper_connstrings = self.build_safekeepers_connstrs(args.safekeepers)?;

        // check for file remote_extensions_spec.json
@@ -767,7 +759,7 @@ impl Endpoint {
                branch_id: None,
                endpoint_id: Some(self.endpoint_id.clone()),
                mode: self.mode,
-                pageserver_connstring: Some(pageserver_connstring),
+                pageserver_connection_info: Some(args.pageserver_conninfo),
                safekeepers_generation: args.safekeepers_generation.map(|g| g.into_inner()),
                safekeeper_connstrings,
                storage_auth_token: args.auth_token.clone(),
@@ -981,7 +973,7 @@ impl Endpoint {

    pub async fn reconfigure(
        &self,
-        pageservers: Option<Vec<(PageserverProtocol, Host, u16)>>,
+        pageserver_conninfo: Option<PageserverConnectionInfo>,
        stripe_size: Option<ShardStripeSize>,
        safekeepers: Option<Vec<NodeId>>,
        safekeeper_generation: Option<SafekeeperGeneration>,
@@ -997,15 +989,17 @@ impl Endpoint {
        let postgresql_conf = self.read_postgresql_conf()?;
        spec.cluster.postgresql_conf = Some(postgresql_conf);

-        // If pageservers are not specified, don't change them.
-        if let Some(pageservers) = pageservers {
-            anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
-
-            let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
-            spec.pageserver_connstring = Some(pageserver_connstr);
-            if stripe_size.is_some() {
-                spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
-            }
+        if let Some(pageserver_conninfo) = pageserver_conninfo {
+            // If pageservers are provided, we need to ensure that they are not empty.
+            // This is a requirement for the compute_ctl configuration.
+            anyhow::ensure!(
+                !pageserver_conninfo.shards.is_empty(),
+                "no pageservers provided"
+            );
+            spec.pageserver_connection_info = Some(pageserver_conninfo);
+        }
+        if stripe_size.is_some() {
+            spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
        }

        // If safekeepers are not specified, don't change them.
@@ -1054,7 +1048,7 @@ impl Endpoint {

    pub async fn reconfigure_pageservers(
        &self,
-        pageservers: Vec<(PageserverProtocol, Host, u16)>,
+        pageservers: PageserverConnectionInfo,
        stripe_size: Option<ShardStripeSize>,
    ) -> Result<()> {
        self.reconfigure(Some(pageservers), stripe_size, None, None)
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -105,7 +105,11 @@ pub struct ComputeSpec {
    // updated to fill these fields, we can make these non optional.
    pub tenant_id: Option<TenantId>,
    pub timeline_id: Option<TimelineId>,
-    pub pageserver_connstring: Option<String>,
+
+    // Pageserver connection information can be passed in two different ways:
+    // 1. In this field.
+    // 2. In cluster.settings. That way is legacy; we are switching to method 1.
+    pub pageserver_connection_info: Option<PageserverConnectionInfo>,

    // More neon ids that we expose to the compute_ctl
    // and to postgres as neon extension GUCs.
@@ -214,6 +218,20 @@ pub enum ComputeFeature {
    UnknownFeature,
 }

+/// Connection information for all pageserver shards, keyed by shard number.
+#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
+pub struct PageserverConnectionInfo {
+    pub shards: HashMap<u32, PageserverShardConnectionInfo>,
+
+    pub prefer_grpc: bool,
+}
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
+pub struct PageserverShardConnectionInfo {
+    pub libpq_url: Option<String>, // e.g. "postgres://no_user@host:port"
+    pub grpc_url: Option<String>, // e.g. "grpc://no_user@host:port"; None when no gRPC address is known
+}
+
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct RemoteExtSpec {
    pub public_extensions: Option<Vec<String>>,
@@ -331,6 +349,12 @@ impl ComputeMode {
    }
 }

+impl Display for ComputeMode {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.to_type_str())
+    }
+}
+
 /// Log level for audit logging
 #[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeAudit {
--- a/libs/neon-shmem/Cargo.toml
+++ b/libs/neon-shmem/Cargo.toml
@@ -6,8 +6,27 @@ license.workspace = true

 [dependencies]
 thiserror.workspace = true
-nix.workspace=true
+nix.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+rustc-hash = { version = "2.1.1" }
+rand = "0.9.1"
+libc.workspace = true
+lock_api = "0.4.13"
+
+[dev-dependencies]
+criterion = { workspace = true, features = ["html_reports"] }
+rand_distr = "0.5.1"
+xxhash-rust = { version = "0.8.15", features = ["xxh3"] }
+ahash.workspace = true
+twox-hash = { version = "2.1.1" }
+seahash = "4.1.0"
+hashbrown = { git = "https://github.com/quantumish/hashbrown.git", rev = "6610e6d" }
+foldhash = "0.1.5"
+

 [target.'cfg(target_os = "macos")'.dependencies]
 tempfile = "3.14.0"
+
+[[bench]]
+name = "hmap_resize"
+harness = false
--- a/libs/neon-shmem/benches/hmap_resize.rs
+++ b/libs/neon-shmem/benches/hmap_resize.rs
@@ -0,0 +1,330 @@
+use criterion::{BatchSize, BenchmarkId, Criterion, criterion_group, criterion_main};
+use neon_shmem::hash::HashMapAccess;
+use neon_shmem::hash::HashMapInit;
+use neon_shmem::hash::entry::Entry;
+use rand::distr::{Distribution, StandardUniform};
+use rand::prelude::*;
+use std::default::Default;
+use std::hash::BuildHasher;
+
+// Taken from bindings to C code
+
+#[derive(Clone, Debug, Hash, Eq, PartialEq)]
+#[repr(C)]
+pub struct FileCacheKey {
+    pub _spc_id: u32,
+    pub _db_id: u32,
+    pub _rel_number: u32,
+    pub _fork_num: u32,
+    pub _block_num: u32,
+}
+
+impl Distribution<FileCacheKey> for StandardUniform {
+    // Draws every field independently at random; questionable keys, but the benchmarks don't need good randomness.
+    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> FileCacheKey {
+        FileCacheKey {
+            _spc_id: rng.random(),
+            _db_id: rng.random(),
+            _rel_number: rng.random(),
+            _fork_num: rng.random(),
+            _block_num: rng.random(),
+        }
+    }
+}
+
+#[derive(Clone, Debug)]
+#[repr(C)]
+pub struct FileCacheEntry {
+    pub _offset: u32,
+    pub _access_count: u32,
+    pub _prev: *mut FileCacheEntry,
+    pub _next: *mut FileCacheEntry,
+    pub _state: [u32; 8],
+}
+
+impl FileCacheEntry {
+    fn dummy() -> Self {
+        Self {
+            _offset: 0,
+            _access_count: 0,
+            _prev: std::ptr::null_mut(),
+            _next: std::ptr::null_mut(),
+            _state: [0; 8],
+        }
+    }
+}
+
+// Utilities for applying operations.
+
+#[derive(Clone, Debug)]
+struct TestOp<K, V>(K, Option<V>);
+
+fn apply_op<K: Clone + std::hash::Hash + Eq, V, S: std::hash::BuildHasher>(
+    op: TestOp<K, V>,
+    map: &mut HashMapAccess<K, V, S>,
+) {
+    let entry = map.entry(op.0);
+
+    match op.1 {
+        Some(new) => match entry {
+            Entry::Occupied(mut e) => Some(e.insert(new)),
+            Entry::Vacant(e) => {
+                _ = e.insert(new).unwrap();
+                None
+            }
+        },
+        None => match entry {
+            Entry::Occupied(e) => Some(e.remove()),
+            Entry::Vacant(_) => None,
+        },
+    };
+}
+
+// Hash utilities
+
+struct SeaRandomState {
+    k1: u64,
+    k2: u64,
+    k3: u64,
+    k4: u64,
+}
+
+impl std::hash::BuildHasher for SeaRandomState {
+    type Hasher = seahash::SeaHasher;
+
+    fn build_hasher(&self) -> Self::Hasher {
+        seahash::SeaHasher::with_seeds(self.k1, self.k2, self.k3, self.k4)
+    }
+}
+
+impl SeaRandomState {
+    fn new() -> Self {
+        let mut rng = rand::rng();
+        Self {
+            k1: rng.random(),
+            k2: rng.random(),
+            k3: rng.random(),
+            k4: rng.random(),
+        }
+    }
+}
+
+fn small_benchs(c: &mut Criterion) {
+    let mut group = c.benchmark_group("Small maps");
+    group.sample_size(10);
+
+    group.bench_function("small_rehash", |b| {
+        let ideal_filled = 4_000_000;
+        let size = 5_000_000;
+        let mut writer = HashMapInit::new_resizeable(size, size * 2).attach_writer();
+        let mut rng = rand::rng();
+        while writer.get_num_buckets_in_use() < ideal_filled as usize {
+            let key: FileCacheKey = rng.random();
+            let val = FileCacheEntry::dummy();
+            apply_op(TestOp(key, Some(val)), &mut writer);
+        }
+        b.iter(|| writer.shuffle());
+    });
+
+    group.bench_function("small_rehash_xxhash", |b| {
+        let ideal_filled = 4_000_000;
+        let size = 5_000_000;
+        let mut writer = HashMapInit::new_resizeable(size, size * 2)
+            .with_hasher(twox_hash::xxhash64::RandomState::default())
+            .attach_writer();
+        let mut rng = rand::rng();
+        while writer.get_num_buckets_in_use() < ideal_filled as usize {
+            let key: FileCacheKey = rng.random();
+            let val = FileCacheEntry::dummy();
+            apply_op(TestOp(key, Some(val)), &mut writer);
+        }
+        b.iter(|| writer.shuffle());
+    });
+
+    group.bench_function("small_rehash_ahash", |b| {
+        let ideal_filled = 4_000_000;
+        let size = 5_000_000;
+        let mut writer = HashMapInit::new_resizeable(size, size * 2)
+            .with_hasher(ahash::RandomState::default())
+            .attach_writer();
+        let mut rng = rand::rng();
+        while writer.get_num_buckets_in_use() < ideal_filled as usize {
+            let key: FileCacheKey = rng.random();
+            let val = FileCacheEntry::dummy();
+            apply_op(TestOp(key, Some(val)), &mut writer);
+        }
+        b.iter(|| writer.shuffle());
+    });
+
+    group.bench_function("small_rehash_seahash", |b| {
+        let ideal_filled = 4_000_000;
+        let size = 5_000_000;
+        let mut writer = HashMapInit::new_resizeable(size, size * 2)
+            .with_hasher(SeaRandomState::new())
+            .attach_writer();
+        let mut rng = rand::rng();
+        while writer.get_num_buckets_in_use() < ideal_filled as usize {
+            let key: FileCacheKey = rng.random();
+            let val = FileCacheEntry::dummy();
+            apply_op(TestOp(key, Some(val)), &mut writer);
+        }
+        b.iter(|| writer.shuffle());
+    });
+
+    group.finish();
+}
+
+fn real_benchs(c: &mut Criterion) {
+    let mut group = c.benchmark_group("Realistic workloads");
+    group.sample_size(10);
+    group.bench_function("real_bulk_insert", |b| {
+        let size = 125_000_000;
+        let ideal_filled = 100_000_000;
+        let mut rng = rand::rng();
+        b.iter_batched(
+            || HashMapInit::new_resizeable(size, size * 2).attach_writer(),
+            |writer| {
+                for _ in 0..ideal_filled {
+                    let key: FileCacheKey = rng.random();
+                    let val = FileCacheEntry::dummy();
+                    let entry = writer.entry(key);
+                    std::hint::black_box(match entry {
+                        Entry::Occupied(mut e) => {
+                            e.insert(val);
+                        }
+                        Entry::Vacant(e) => {
+                            _ = e.insert(val).unwrap();
+                        }
+                    })
+                }
+            },
+            BatchSize::SmallInput,
+        )
+    });
+
+    group.bench_function("real_rehash", |b| {
+        let size = 125_000_000;
+        let ideal_filled = 100_000_000;
+        let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
+        let mut rng = rand::rng();
+        while writer.get_num_buckets_in_use() < ideal_filled {
+            let key: FileCacheKey = rng.random();
+            let val = FileCacheEntry::dummy();
+            apply_op(TestOp(key, Some(val)), &mut writer);
+        }
+        b.iter(|| writer.shuffle());
+    });
+
+    group.bench_function("real_rehash_hashbrown", |b| {
+        let size = 125_000_000;
+        let ideal_filled = 100_000_000;
+        let mut writer = hashbrown::raw::RawTable::new();
+        let mut rng = rand::rng();
+        let hasher = rustc_hash::FxBuildHasher::default();
+        unsafe {
+            writer
+                .resize(
+                    size,
+                    |(k, _)| hasher.hash_one(&k),
+                    hashbrown::raw::Fallibility::Infallible,
+                )
+                .unwrap();
+        }
+        while writer.len() < ideal_filled as usize {
+            let key: FileCacheKey = rng.random();
+            let val = FileCacheEntry::dummy();
+            writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
+                hasher.hash_one(&k)
+            });
+        }
+        b.iter(|| unsafe {
+            writer.table.rehash_in_place(
+                &|table, index| {
+                    hasher.hash_one(
+                        &table
+                            .bucket::<(FileCacheKey, FileCacheEntry)>(index)
+                            .as_ref()
+                            .0,
+                    )
+                },
+                std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
+                if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
+                    Some(|ptr| std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry)))
+                } else {
+                    None
+                },
+            )
+        });
+    });
+
+    for elems in [2, 4, 8, 16, 32, 64, 96, 112] {
+        group.bench_with_input(
+            BenchmarkId::new("real_rehash_varied", elems),
+            &elems,
+            |b, &size| {
+                let ideal_filled = size * 1_000_000;
+                let size = 125_000_000;
+                let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
+                let mut rng = rand::rng();
+                while writer.get_num_buckets_in_use() < ideal_filled as usize {
+                    let key: FileCacheKey = rng.random();
+                    let val = FileCacheEntry::dummy();
+                    apply_op(TestOp(key, Some(val)), &mut writer);
+                }
+                b.iter(|| writer.shuffle());
+            },
+        );
+        group.bench_with_input(
+            BenchmarkId::new("real_rehash_varied_hashbrown", elems),
+            &elems,
+            |b, &size| {
+                let ideal_filled = size * 1_000_000;
+                let size = 125_000_000;
+                let mut writer = hashbrown::raw::RawTable::new();
+                let mut rng = rand::rng();
+                let hasher = rustc_hash::FxBuildHasher::default();
+                unsafe {
+                    writer
+                        .resize(
+                            size,
+                            |(k, _)| hasher.hash_one(&k),
+                            hashbrown::raw::Fallibility::Infallible,
+                        )
+                        .unwrap();
+                }
+                while writer.len() < ideal_filled as usize {
+                    let key: FileCacheKey = rng.random();
+                    let val = FileCacheEntry::dummy();
+                    writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
+                        hasher.hash_one(&k)
+                    });
+                }
+                b.iter(|| unsafe {
+                    writer.table.rehash_in_place(
+                        &|table, index| {
+                            hasher.hash_one(
+                                &table
+                                    .bucket::<(FileCacheKey, FileCacheEntry)>(index)
+                                    .as_ref()
+                                    .0,
+                            )
+                        },
+                        std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
+                        if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
+                            Some(|ptr| {
+                                std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry))
+                            })
+                        } else {
+                            None
+                        },
+                    )
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, small_benchs, real_benchs);
+criterion_main!(benches);
--- a/libs/neon-shmem/src/hash.rs
+++ b/libs/neon-shmem/src/hash.rs
@@ -0,0 +1,598 @@
+//! Resizable hash table implementation on top of byte-level storage (either a [`ShmemHandle`] or a fixed byte array).
+//!
+//! This hash table has two major components: the bucket array and the dictionary. Each bucket within the
+//! bucket array contains an `Option<(K, V)>` and an index of another bucket. In this way there is both an
+//! implicit freelist within the bucket array (`None` buckets point to other `None` entries) and various hash
+//! chains within the bucket array (a Some bucket will point to other Some buckets that had the same hash).
+//!
+//! Buckets are never moved unless they are within a region that is being shrunk, and so the actual hash-
+//! dependent component is done with the dictionary. When a new key is inserted into the map, a position
+//! within the dictionary is decided based on its hash, the data is inserted into an empty bucket based
+//! off of the freelist, and then the index of said bucket is placed in the dictionary.
+//!
+//! This map is resizable (if initialized on top of a [`ShmemHandle`]). Both growing and shrinking happen
+//! in-place and are at a high level achieved by expanding/reducing the bucket array and rebuilding the
+//! dictionary by rehashing all keys.
+
+use std::fmt::Debug;
+use std::hash::{BuildHasher, Hash};
+use std::mem::MaybeUninit;
+
+use crate::shmem::ShmemHandle;
+use crate::{shmem, sync::*};
+
+mod core;
+pub mod entry;
+
+#[cfg(test)]
+mod tests;
+
+use core::{Bucket, CoreHashMap, INVALID_POS};
+use entry::{Entry, OccupiedEntry, PrevPos, VacantEntry};
+
+/// This represents a hash table that (possibly) lives in shared memory.
+/// If a new process is launched with fork(), the child process inherits
+/// this struct.
+///
+/// This is the not-yet-attached form of the table: it is produced by the
+/// `with_*`/`new_*` constructors, can have its hasher swapped with
+/// [`HashMapInit::with_hasher`], and is consumed by
+/// [`HashMapInit::attach_writer`] / [`HashMapInit::attach_reader`].
+#[must_use]
+pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> {
+    /// Backing shared memory handle, or `None` when the table was placed in a
+    /// caller-supplied fixed area (see [`HashMapInit::with_fixed`]).
+    shmem_handle: Option<ShmemHandle>,
+    /// Location of the `HashMapShared` that `new` wrote inside the area.
+    shared_ptr: *mut HashMapShared<'a, K, V>,
+    /// Size in bytes of the area the table was initialized in.
+    shared_size: usize,
+    /// Hash builder that is handed over to the [`HashMapAccess`] on attach.
+    hasher: S,
+    /// Number of buckets the table was created with.
+    num_buckets: u32,
+}
+
+impl<'a, K, V, S> Debug for HashMapInit<'a, K, V, S>
+where
+    K: Debug,
+    V: Debug,
+{
+    /// Formats every field except the hasher, which has no `Debug` bound here.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut out = f.debug_struct("HashMapInit");
+        out.field("shmem_handle", &self.shmem_handle);
+        out.field("shared_ptr", &self.shared_ptr);
+        out.field("shared_size", &self.shared_size);
+        out.field("num_buckets", &self.num_buckets);
+        out.finish()
+    }
+}
+
+/// This is a per-process handle to a hash table that (possibly) lives in shared memory.
+/// If a child process is launched with fork(), the child process should
+/// get its own HashMapAccess by calling HashMapInit::attach_writer/reader().
+///
+/// XXX: We're not making use of it at the moment, but this struct could
+/// hold process-local information in the future.
+pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> {
+    /// Backing shared memory handle; `None` for tables living in a fixed area,
+    /// in which case the resizing operations (`grow`, `begin_shrink`, `finish_shrink`) panic.
+    shmem_handle: Option<ShmemHandle>,
+    /// Pointer to the lock-protected table inside the memory area.
+    shared_ptr: *mut HashMapShared<'a, K, V>,
+    /// Hash builder used for every lookup, insert and rehash done through this handle.
+    hasher: S,
+}
+
+// `HashMapAccess` holds a raw pointer, so `Sync`/`Send` have to be asserted manually.
+// All access to the pointed-to table in this file goes through its `RwLock`.
+// NOTE(review): the hasher type `S` carries no `Sync`/`Send` bound here, so a handle
+// with a non-thread-safe hasher is still marked thread-safe. Confirm this is intended.
+unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<'_, K, V, S> {}
+unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<'_, K, V, S> {}
+
+impl<'a, K, V, S> Debug for HashMapAccess<'a, K, V, S>
+where
+    K: Debug,
+    V: Debug,
+{
+    /// Formats the handle and the table pointer; the hasher is left out because it has
+    /// no `Debug` bound here.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut out = f.debug_struct("HashMapAccess");
+        out.field("shmem_handle", &self.shmem_handle);
+        out.field("shared_ptr", &self.shared_ptr);
+        out.finish()
+    }
+}
+
+impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
+    /// Replace the hash builder of a freshly created table.
+    ///
+    /// NOTE: Only call this right after creating the hash table, before inserting any
+    /// entries and before calling attach_writer/reader. Otherwise different accessors
+    /// could end up using different hash functions, with confusing results.
+    pub fn with_hasher<T: BuildHasher>(self, hasher: T) -> HashMapInit<'a, K, V, T> {
+        // Everything is carried over unchanged; only the hasher (and its type) differs.
+        let HashMapInit {
+            shmem_handle,
+            shared_ptr,
+            shared_size,
+            num_buckets,
+            hasher: _,
+        } = self;
+        HashMapInit {
+            shmem_handle,
+            shared_ptr,
+            shared_size,
+            num_buckets,
+            hasher,
+        }
+    }
+
+    /// Loosely (over)estimate the number of bytes needed for a table with `num_buckets` buckets.
+    ///
+    /// The result is the core table estimate plus the size of the shared header plus a
+    /// fixed margin.
+    pub fn estimate_size(num_buckets: u32) -> usize {
+        // Fixed slack to cover alignment padding etc.
+        let margin = 1000;
+        let header = size_of::<HashMapShared<K, V>>();
+        let table = CoreHashMap::<K, V>::estimate_size(num_buckets);
+        table + header + margin
+    }
+
+    /// Carve a hash table out of the memory area `[area_ptr, area_ptr + area_size)`.
+    ///
+    /// The area is split, in order and with alignment padding in between, into the
+    /// `pthread_rwlock_t`, the `HashMapShared`, `num_buckets` buckets, and the dictionary,
+    /// which receives all space that remains.
+    ///
+    /// # Panics
+    /// Panics if the area leaves no room for at least one dictionary slot after the buckets.
+    fn new(
+        num_buckets: u32,
+        shmem_handle: Option<ShmemHandle>,
+        area_ptr: *mut u8,
+        area_size: usize,
+        hasher: S,
+    ) -> Self {
+        // `ptr` is a cursor that advances through the area as each part is carved out.
+        let mut ptr: *mut u8 = area_ptr;
+        let end_ptr: *mut u8 = unsafe { ptr.add(area_size) };
+
+        // carve out area for the One Big Lock (TM) and the HashMapShared.
+        ptr = unsafe { ptr.add(ptr.align_offset(align_of::<libc::pthread_rwlock_t>())) };
+        let raw_lock_ptr = ptr;
+        ptr = unsafe { ptr.add(size_of::<libc::pthread_rwlock_t>()) };
+        ptr = unsafe { ptr.add(ptr.align_offset(align_of::<HashMapShared<K, V>>())) };
+        let shared_ptr: *mut HashMapShared<K, V> = ptr.cast();
+        ptr = unsafe { ptr.add(size_of::<HashMapShared<K, V>>()) };
+
+        // carve out the buckets
+        ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<core::Bucket<K, V>>())) };
+        let buckets_ptr = ptr;
+        ptr = unsafe { ptr.add(size_of::<core::Bucket<K, V>>() * num_buckets as usize) };
+
+        // use remaining space for the dictionary
+        ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<u32>())) };
+        assert!(ptr.addr() < end_ptr.addr());
+        let dictionary_ptr = ptr;
+        // Number of whole `u32` slots that fit between the cursor and the end of the area.
+        let dictionary_size = unsafe { end_ptr.byte_offset_from(ptr) / size_of::<u32>() as isize };
+        assert!(dictionary_size > 0);
+
+        // Both slices are still uninitialized here; `CoreHashMap::new` writes every element.
+        let buckets =
+            unsafe { std::slice::from_raw_parts_mut(buckets_ptr.cast(), num_buckets as usize) };
+        let dictionary = unsafe {
+            std::slice::from_raw_parts_mut(dictionary_ptr.cast(), dictionary_size as usize)
+        };
+
+        let hashmap = CoreHashMap::new(buckets, dictionary);
+        let lock = RwLock::from_raw(PthreadRwLock::new(raw_lock_ptr.cast()), hashmap);
+        // `shared_ptr` points at uninitialized memory, so write without dropping old contents.
+        unsafe {
+            std::ptr::write(shared_ptr, lock);
+        }
+
+        Self {
+            num_buckets,
+            shmem_handle,
+            shared_ptr,
+            shared_size: area_size,
+            hasher,
+        }
+    }
+
+    /// Consume the initializer and return an access handle for writing to the table.
+    pub fn attach_writer(self) -> HashMapAccess<'a, K, V, S> {
+        let Self {
+            shmem_handle,
+            shared_ptr,
+            hasher,
+            ..
+        } = self;
+        HashMapAccess {
+            shmem_handle,
+            shared_ptr,
+            hasher,
+        }
+    }
+
+    /// Consume the initializer and return an access handle for reading.
+    ///
+    /// Currently identical to [`HashMapInit::attach_writer`].
+    pub fn attach_reader(self) -> HashMapAccess<'a, K, V, S> {
+        Self::attach_writer(self)
+    }
+}
+
+/// Hash table data that is actually stored in the shared memory area.
+///
+/// This is the [`CoreHashMap`] wrapped in the lock that every accessor takes before
+/// reading or modifying the table.
+///
+/// NOTE: We carve out the parts from a contiguous chunk. Growing and shrinking the hash table
+/// relies on the memory layout! The data structures are laid out in the contiguous shared memory
+/// area as follows:
+///
+/// [`libc::pthread_rwlock_t`]
+/// [`HashMapShared`]
+/// [buckets]
+/// [dictionary]
+///
+/// In between the above parts, there can be padding bytes to align the parts correctly.
+type HashMapShared<'a, K, V> = RwLock<CoreHashMap<'a, K, V>>;
+
+impl<'a, K, V> HashMapInit<'a, K, V, rustc_hash::FxBuildHasher>
+where
+    K: Clone + Hash + Eq,
+{
+    /// Build the hash table inside a caller-supplied, fixed-size memory area.
+    ///
+    /// No shared memory handle is attached to the resulting map.
+    pub fn with_fixed(num_buckets: u32, area: &'a mut [MaybeUninit<u8>]) -> Self {
+        let area_ptr: *mut u8 = area.as_mut_ptr().cast();
+        let area_size = area.len();
+        Self::new(num_buckets, None, area_ptr, area_size, rustc_hash::FxBuildHasher)
+    }
+
+    /// Initialize a new hash map inside the given shared memory area.
+    ///
+    /// The area is first resized to [`Self::estimate_size`] of `num_buckets`.
+    ///
+    /// # Panics
+    /// Will panic on failure to resize area to expected map size.
+    pub fn with_shmem(num_buckets: u32, shmem: ShmemHandle) -> Self {
+        let area_size = Self::estimate_size(num_buckets);
+        let resized = shmem.set_size(area_size);
+        resized.expect("could not resize shared memory area");
+        // Read the base address before `shmem` is moved into the new map.
+        let area_ptr = shmem.data_ptr.as_ptr().cast();
+        Self::new(num_buckets, Some(shmem), area_ptr, area_size, rustc_hash::FxBuildHasher)
+    }
+
+    /// Create a resizable hash map in a freshly created shared memory area called `name`.
+    ///
+    /// The area starts out sized for `num_buckets` and is created with a maximum size
+    /// estimated for `max_buckets`.
+    ///
+    /// # Panics
+    /// Panics if the shared memory area cannot be created.
+    pub fn new_resizeable_named(num_buckets: u32, max_buckets: u32, name: &str) -> Self {
+        let initial_size = Self::estimate_size(num_buckets);
+        let limit_size = Self::estimate_size(max_buckets);
+        let created = ShmemHandle::new(name, initial_size, limit_size);
+        let shmem = created.expect("failed to make shared memory area");
+        // Read the base address before `shmem` is moved into the new map.
+        let area_ptr = shmem.data_ptr.as_ptr().cast();
+
+        Self::new(num_buckets, Some(shmem), area_ptr, initial_size, rustc_hash::FxBuildHasher)
+    }
+
+    /// Create a resizable hash map in a new shared memory area with a generated name.
+    ///
+    /// The name is `neon_shmem_hmap` followed by a counter that is incremented on every call.
+    pub fn new_resizeable(num_buckets: u32, max_buckets: u32) -> Self {
+        use std::sync::atomic::{AtomicUsize, Ordering};
+
+        // Sequence number that gives each generated name a different suffix.
+        static NEXT_ID: AtomicUsize = AtomicUsize::new(0);
+
+        let val = NEXT_ID.fetch_add(1, Ordering::Relaxed);
+        Self::new_resizeable_named(num_buckets, max_buckets, &format!("neon_shmem_hmap{val}"))
+    }
+}
+
+impl<'a, K, V, S: BuildHasher> HashMapAccess<'a, K, V, S>
+where
+    K: Clone + Hash + Eq,
+{
+    /// Compute the hash of `key` with this handle's hash builder.
+    #[inline]
+    fn get_hash_value(&self, key: &K) -> u64 {
+        BuildHasher::hash_one(&self.hasher, key)
+    }
+
+    /// Take the write lock and locate `key`, whose hash must be `hash`.
+    ///
+    /// Returns an occupied entry if the key is found on the chain hanging off its
+    /// dictionary slot, and a vacant entry for that slot otherwise. The write lock is
+    /// moved into the returned entry.
+    fn entry_with_hash(&self, key: K, hash: u64) -> Entry<'a, '_, K, V> {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
+        let dict_pos = hash as usize % map.dictionary.len();
+
+        // Walk the chain, remembering the predecessor so a removal can unlink the bucket.
+        let mut prev_pos = PrevPos::First(dict_pos as u32);
+        let mut cur = map.dictionary[dict_pos];
+        while cur != INVALID_POS {
+            let bucket = &map.buckets[cur as usize];
+            let successor = bucket.next;
+            let (bucket_key, _) = bucket.inner.as_ref().expect("entry is in use");
+            if *bucket_key == key {
+                // found existing entry
+                return Entry::Occupied(OccupiedEntry {
+                    map,
+                    _key: key,
+                    prev_pos,
+                    bucket_pos: cur,
+                });
+            }
+            prev_pos = PrevPos::Chained(cur);
+            cur = successor;
+        }
+
+        // Reached the end of the chain (or the slot was empty): no existing entry.
+        Entry::Vacant(VacantEntry {
+            map,
+            key,
+            dict_pos: dict_pos as u32,
+        })
+    }
+
+    /// Look up `key` under the read lock.
+    ///
+    /// Returns a guard that dereferences to the value, or `None` if the key is absent.
+    pub fn get<'e>(&'e self, key: &K) -> Option<ValueReadGuard<'e, V>> {
+        let hash = self.get_hash_value(key);
+        let shared = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        match RwLockReadGuard::try_map(shared.read(), |m| m.get_with_hash(key, hash)) {
+            Ok(value) => Some(value),
+            Err(_) => None,
+        }
+    }
+
+    /// Get the (occupied or vacant) entry for `key`, taking the write lock.
+    pub fn entry(&self, key: K) -> Entry<'a, '_, K, V> {
+        let key_hash = self.hasher.hash_one(&key);
+        self.entry_with_hash(key, key_hash)
+    }
+
+    /// Remove a key from the map. Returns the associated value if it existed.
+    ///
+    /// The key is cloned because the entry lookup takes ownership of it.
+    pub fn remove(&self, key: &K) -> Option<V> {
+        // `key` is already a reference; pass it straight through.
+        let hash = self.get_hash_value(key);
+        match self.entry_with_hash(key.clone(), hash) {
+            Entry::Occupied(e) => Some(e.remove()),
+            Entry::Vacant(_) => None,
+        }
+    }
+
+    /// Insert/update a key. Returns the previous associated value if it existed.
+    ///
+    /// # Errors
+    /// Will return [`core::FullError`] if there is no more space left in the map.
+    pub fn insert(&self, key: K, value: V) -> Result<Option<V>, core::FullError> {
+        let hash = self.get_hash_value(&key);
+        // The key is owned and not needed afterwards, so move it instead of cloning.
+        match self.entry_with_hash(key, hash) {
+            Entry::Occupied(mut e) => Ok(Some(e.insert(value))),
+            Entry::Vacant(e) => {
+                _ = e.insert(value)?;
+                Ok(None)
+            }
+        }
+    }
+
+    /// Optionally return the entry for a bucket at a given index if it exists.
+    ///
+    /// Returns `None` if `pos` is out of range or the bucket is empty. The write lock is
+    /// moved into the returned entry.
+    ///
+    /// Has more overhead than one would intuitively expect: performs both a clone of the key
+    /// due to the [`OccupiedEntry`] type owning the key and also a hash of the key in order
+    /// to enable repairing the hash chain if the entry is removed.
+    pub fn entry_at_bucket(&self, pos: usize) -> Option<OccupiedEntry<'a, '_, K, V>> {
+        // Only a shared reference to the lock is needed to take the write guard; avoid
+        // materializing a `&mut` to memory that other accessors also point at.
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
+
+        let (stored_key, _) = map.buckets.get(pos)?.inner.as_ref()?;
+        let key = stored_key.clone();
+        // The predecessor in the chain is not known, so remember the hash instead.
+        let hash = self.get_hash_value(&key);
+
+        Some(OccupiedEntry {
+            _key: key,
+            bucket_pos: pos as u32,
+            prev_pos: PrevPos::Unknown(hash),
+            map,
+        })
+    }
+
+    /// Returns the total number of buckets (occupied or not) in the table.
+    pub fn get_num_buckets(&self) -> usize {
+        let shared = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        shared.read().get_num_buckets()
+    }
+
+    /// Return a read guard for the key and value stored in the bucket at index `pos`.
+    /// This can be used to iterate through the hash map.
+    ///
+    /// Returns `None` if `pos` is out of range or the bucket is empty.
+    // TODO: An Iterator might be nicer. The communicator's clock algorithm needs to
+    // _slowly_ iterate through all buckets with its clock hand,  without holding a lock.
+    // If we switch to an Iterator, it must not hold the lock.
+    pub fn get_at_bucket(&self, pos: usize) -> Option<ValueReadGuard<(K, V)>> {
+        let guard = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
+        // `get` covers the bounds check; an empty bucket yields `None` as well.
+        RwLockReadGuard::try_map(guard, |m| m.buckets.get(pos)?.inner.as_ref()).ok()
+    }
+
+    /// Returns the index of the bucket that the value pointed to by `val_ptr` lives in.
+    ///
+    /// # Panics
+    /// Panics if the computed index is not within the bucket array.
+    pub fn get_bucket_for_value(&self, val_ptr: *const V) -> usize {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
+
+        // Byte distance from the start of the bucket array, divided by the bucket size.
+        let base_addr = map.buckets.as_ptr() as usize;
+        let byte_offset = val_ptr as usize - base_addr;
+        let idx = byte_offset / size_of::<Bucket<K, V>>();
+        assert!(idx < map.buckets.len());
+
+        idx
+    }
+
+    /// Returns how many buckets currently hold a key-value pair.
+    pub fn get_num_buckets_in_use(&self) -> usize {
+        let shared = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let in_use = shared.read().buckets_in_use;
+        in_use as usize
+    }
+
+    /// Clears all entries in a table. Does not reset any shrinking operations.
+    pub fn clear(&self) {
+        // A shared reference to the lock is enough to take the write guard; avoid creating
+        // a `&mut` to memory that other accessors also point at.
+        let mut map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
+        map.clear();
+    }
+
+    /// Perform an in-place rehash of some region (0..`rehash_buckets`) of the table and reset
+    /// the `buckets` and `dictionary` slices to be as long as `num_buckets`. Resets the freelist
+    /// in the process.
+    ///
+    /// The dictionary is placed directly after `num_buckets` buckets and extends up to
+    /// `end_ptr`. Free buckets inside the rehashed region are put on a new freelist whose
+    /// head is left in `inner.free_head`; buckets at or beyond `rehash_buckets` are not touched.
+    ///
+    /// # Panics
+    /// Panics if there is no room for the dictionary between the buckets and `end_ptr`.
+    fn rehash_dict(
+        &self,
+        inner: &mut CoreHashMap<'a, K, V>,
+        buckets_ptr: *mut core::Bucket<K, V>,
+        end_ptr: *mut u8,
+        num_buckets: u32,
+        rehash_buckets: u32,
+    ) {
+        inner.free_head = INVALID_POS;
+
+        let buckets;
+        let dictionary;
+        unsafe {
+            // Do the alignment arithmetic on a byte pointer: `align_offset` counts in
+            // elements of the pointee type, so it only yields bytes for `*mut u8`.
+            let buckets_end_ptr: *mut u8 = buckets_ptr.add(num_buckets as usize).cast();
+            let dictionary_ptr: *mut u32 = buckets_end_ptr
+                .add(buckets_end_ptr.align_offset(align_of::<u32>()))
+                .cast();
+            assert!(dictionary_ptr.addr() < end_ptr.addr());
+            // Measure from the aligned start so the slice can never run past `end_ptr`.
+            let dictionary_size: usize =
+                end_ptr.byte_offset_from(dictionary_ptr) as usize / size_of::<u32>();
+
+            buckets = std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize);
+            dictionary = std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size);
+        }
+        dictionary.fill(INVALID_POS);
+
+        for (i, bucket) in buckets.iter_mut().enumerate().take(rehash_buckets as usize) {
+            let Some((key, _)) = bucket.inner.as_ref() else {
+                // Empty bucket: push it onto the freelist.
+                bucket.next = inner.free_head;
+                inner.free_head = i as u32;
+                continue;
+            };
+
+            // Use the same slot computation as the lookup paths so both always agree.
+            let hash = self.hasher.hash_one(key);
+            let pos = hash as usize % dictionary.len();
+            // Push the bucket onto the front of its hash chain.
+            bucket.next = dictionary[pos];
+            dictionary[pos] = i as u32;
+        }
+
+        inner.dictionary = dictionary;
+        inner.buckets = buckets;
+    }
+
+    /// Rehash the map without growing or shrinking.
+    ///
+    /// The bucket array and the dictionary keep their current location and extent; only the
+    /// hash chains and the freelist are rebuilt.
+    pub fn shuffle(&self) {
+        let mut map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
+        let num_buckets = map.get_num_buckets() as u32;
+        // Reuse the current end of the dictionary as the end of the area. Deriving it from
+        // `shared_ptr + estimate_size` overshoots, because the estimate is measured from the
+        // start of the area (before the lock), and is wrong for fixed areas of other sizes.
+        let end_ptr: *mut u8 = map.dictionary.as_mut_ptr_range().end.cast();
+        let buckets_ptr = map.buckets.as_mut_ptr();
+        self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, num_buckets);
+    }
+
+    /// Grow the number of buckets within the table.
+    ///
+    /// 1. Grows the underlying shared memory area
+    /// 2. Initializes new buckets and overwrites the current dictionary
+    /// 3. Rehashes the dictionary
+    /// 4. Links the new buckets in front of the rebuilt freelist
+    ///
+    /// Calling this with the current number of buckets is a no-op.
+    ///
+    /// # Panics
+    /// Panics if `num_buckets` is smaller than the current number of buckets, or if the
+    /// table actually has to grow and was initialized with [`HashMapInit::with_fixed`].
+    ///
+    /// # Errors
+    /// Returns an [`shmem::Error`] if any errors occur resizing the memory region.
+    pub fn grow(&self, num_buckets: u32) -> Result<(), shmem::Error> {
+        let mut map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
+        let old_num_buckets = map.buckets.len() as u32;
+
+        assert!(
+            num_buckets >= old_num_buckets,
+            "grow called with a smaller number of buckets"
+        );
+        if num_buckets == old_num_buckets {
+            return Ok(());
+        }
+        let shmem_handle = self
+            .shmem_handle
+            .as_ref()
+            .expect("grow called on a fixed-size hash table");
+
+        let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
+        shmem_handle.set_size(size_bytes)?;
+        let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
+
+        // Initialize new buckets and chain them to each other. The tail is connected to
+        // the rest of the freelist after the rehash below.
+        // NB: This overwrites the dictionary!
+        let buckets_ptr = map.buckets.as_mut_ptr();
+        unsafe {
+            for i in old_num_buckets..num_buckets {
+                let bucket = buckets_ptr.add(i as usize);
+                bucket.write(core::Bucket {
+                    next: if i < num_buckets - 1 {
+                        i + 1
+                    } else {
+                        INVALID_POS
+                    },
+                    inner: None,
+                });
+            }
+        }
+
+        self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, old_num_buckets);
+
+        // `rehash_dict` rebuilt the freelist of the old region with a new head. Splice the
+        // new buckets in front of *that* list; linking to the pre-rehash head would drop
+        // every free bucket that now precedes it in the rebuilt chain.
+        let old_region_free_head = map.free_head;
+        let last_new = num_buckets as usize - 1;
+        map.buckets[last_new].next = old_region_free_head;
+        map.free_head = old_num_buckets;
+
+        Ok(())
+    }
+
+    /// Begin a shrink, limiting all new allocations to be in buckets with index below `num_buckets`.
+    ///
+    /// The buckets are not moved or released until [`HashMapAccess::finish_shrink`] is called.
+    ///
+    /// # Panics
+    /// Panics if called on a map initialized with [`HashMapInit::with_fixed`] or if `num_buckets` is
+    /// greater than the number of buckets in the map.
+    pub fn begin_shrink(&mut self, num_buckets: u32) {
+        // A shared reference to the lock is enough to take the write guard; avoid creating
+        // a `&mut` to memory that other accessors also point at.
+        let mut map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
+        assert!(
+            num_buckets <= map.get_num_buckets() as u32,
+            "shrink called with a larger number of buckets"
+        );
+        // Only checked for its panic: fixed-size tables cannot be shrunk.
+        _ = self
+            .shmem_handle
+            .as_ref()
+            .expect("shrink called on a fixed-size hash table");
+        map.alloc_limit = num_buckets;
+    }
+
+    /// If a shrink operation is underway, returns the target size of the map. Otherwise, returns None.
+    pub fn shrink_goal(&self) -> Option<usize> {
+        // Reading only needs a shared reference to the lock, not a `&mut`.
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
+        let goal = map.alloc_limit;
+        // `INVALID_POS` is the marker for "no allocation limit", i.e. no shrink in progress.
+        (goal != INVALID_POS).then_some(goal as usize)
+    }
+
+    /// Complete a shrink after caller has evicted entries, removing the unused buckets and rehashing.
+    ///
+    /// Entries still stored at or beyond the shrink goal are moved into free buckets below
+    /// it, the shared memory area is resized, and the dictionary is rebuilt. On success the
+    /// allocation limit is cleared, so [`HashMapAccess::shrink_goal`] returns `None` again.
+    ///
+    /// # Panics
+    /// The following cases result in a panic:
+    /// - Calling this function on a map initialized with [`HashMapInit::with_fixed`].
+    /// - Calling this function on a map when no shrink operation is in progress.
+    /// - Calling this function while there are more buckets in use than the value returned
+    ///   by [`HashMapAccess::shrink_goal`].
+    ///
+    /// # Errors
+    /// Returns an [`shmem::Error`] if any errors occur resizing the memory region.
+    pub fn finish_shrink(&self) -> Result<(), shmem::Error> {
+        let mut map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
+        assert!(
+            map.alloc_limit != INVALID_POS,
+            "called finish_shrink when no shrink is in progress"
+        );
+
+        let num_buckets = map.alloc_limit;
+
+        if map.get_num_buckets() == num_buckets as usize {
+            // Nothing to release, but the shrink is over: lift the allocation limit so the
+            // map does not stay in "shrinking" state forever.
+            map.alloc_limit = INVALID_POS;
+            return Ok(());
+        }
+
+        assert!(
+            map.buckets_in_use <= num_buckets,
+            "called finish_shrink before enough entries were removed"
+        );
+
+        // Check this before touching any entry so that a panic cannot leave moved entries
+        // unreachable from the dictionary.
+        let shmem_handle = self
+            .shmem_handle
+            .as_ref()
+            .expect("shrink called on a fixed-size hash table");
+
+        // Relocate entries out of the region that is about to be cut off. Their hash chains
+        // are repaired by the rehash below.
+        for i in (num_buckets as usize)..map.buckets.len() {
+            if let Some((k, v)) = map.buckets[i].inner.take() {
+                // alloc_bucket increases count, so need to decrease since we're just moving
+                map.buckets_in_use -= 1;
+                map.alloc_bucket(k, v).unwrap();
+            }
+        }
+
+        let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
+        shmem_handle.set_size(size_bytes)?;
+        let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
+        let buckets_ptr = map.buckets.as_mut_ptr();
+        self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, num_buckets);
+        map.alloc_limit = INVALID_POS;
+
+        Ok(())
+    }
+}
--- a/libs/neon-shmem/src/hash/core.rs
+++ b/libs/neon-shmem/src/hash/core.rs
@@ -0,0 +1,208 @@
+//! Simple hash table with chaining.
+
+use std::fmt::Debug;
+use std::hash::Hash;
+use std::mem::MaybeUninit;
+
+use crate::hash::entry::*;
+
+/// Invalid position within the map (either within the dictionary or bucket array).
+pub(crate) const INVALID_POS: u32 = u32::MAX;
+
+/// Fundamental storage unit within the hash table. Either empty or contains a key-value pair.
+/// Always part of a chain of some kind (either a freelist if empty or a hash chain if full).
+pub(crate) struct Bucket<K, V> {
+    /// Index of next bucket in the chain, or [`INVALID_POS`] at the end of the chain.
+    pub(crate) next: u32,
+    /// Key-value pair contained within bucket; `None` while the bucket is unused.
+    pub(crate) inner: Option<(K, V)>,
+}
+
+impl<K, V> Debug for Bucket<K, V>
+where
+    K: Debug,
+    V: Debug,
+{
+    /// Formats the chain link and the stored key-value pair.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Self { next, inner } = self;
+        let mut out = f.debug_struct("Bucket");
+        out.field("next", next);
+        out.field("inner", inner);
+        out.finish()
+    }
+}
+
+/// Core hash table implementation.
+///
+/// Holds the two slices the table is made of plus the bookkeeping for the freelist.
+/// It does no locking itself.
+pub(crate) struct CoreHashMap<'a, K, V> {
+    /// Dictionary used to map hashes to bucket indices. A slot holds the index of the first
+    /// bucket of its hash chain, or [`INVALID_POS`] if the chain is empty.
+    pub(crate) dictionary: &'a mut [u32],
+    /// Buckets containing key-value pairs.
+    pub(crate) buckets: &'a mut [Bucket<K, V>],
+    /// Head of the freelist, or [`INVALID_POS`] if there are no free buckets.
+    pub(crate) free_head: u32,
+    /// Exclusive upper bound on the index of buckets that may be allocated: `alloc_bucket`
+    /// skips free buckets whose index is at or above it. [`INVALID_POS`] if no limit.
+    pub(crate) alloc_limit: u32,
+    /// The number of currently occupied buckets.
+    pub(crate) buckets_in_use: u32,
+    // pub(crate) lock: libc::pthread_mutex_t,
+    // Unclear what the purpose of this is. It is initialized to `INVALID_POS` in `new`.
+    pub(crate) _user_list_head: u32,
+}
+
+impl<'a, K, V> Debug for CoreHashMap<'a, K, V>
+where
+    K: Debug,
+    V: Debug,
+{
+    /// Formats the dictionary, the buckets and the freelist bookkeeping.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut out = f.debug_struct("CoreHashMap");
+        out.field("dictionary", &self.dictionary);
+        out.field("buckets", &self.buckets);
+        out.field("free_head", &self.free_head);
+        out.field("alloc_limit", &self.alloc_limit);
+        out.field("buckets_in_use", &self.buckets_in_use);
+        out.finish()
+    }
+}
+
+/// Error for when there are no empty buckets left but one is needed.
+///
+/// Returned by `alloc_bucket` when the freelist holds no bucket below the allocation limit.
+#[derive(Debug, PartialEq)]
+pub struct FullError();
+
+impl<'a, K: Clone + Hash + Eq, V> CoreHashMap<'a, K, V> {
+    const FILL_FACTOR: f32 = 0.60;
+
+    /// Estimate the number of bytes taken up by the buckets and the dictionary of a map
+    /// with `num_buckets` buckets.
+    pub fn estimate_size(num_buckets: u32) -> usize {
+        let n = num_buckets as usize;
+
+        // buckets
+        let bucket_bytes = size_of::<Bucket<K, V>>() * n;
+
+        // dictionary: one `u32` per bucket, scaled up by 1 / FILL_FACTOR
+        let dict_bytes = f32::ceil((size_of::<u32>() * n) as f32 / Self::FILL_FACTOR) as usize;
+
+        bucket_bytes + dict_bytes
+    }
+
+    /// Initialize an empty map on top of uninitialized bucket and dictionary storage.
+    ///
+    /// Every bucket is written as empty and chained to its successor, forming the initial
+    /// freelist; every dictionary slot is set to [`INVALID_POS`].
+    pub fn new(
+        buckets: &'a mut [MaybeUninit<Bucket<K, V>>],
+        dictionary: &'a mut [MaybeUninit<u32>],
+    ) -> Self {
+        // Initialize the buckets
+        let num_buckets = buckets.len();
+        for (i, slot) in buckets.iter_mut().enumerate() {
+            slot.write(Bucket {
+                next: if i + 1 < num_buckets {
+                    i as u32 + 1
+                } else {
+                    INVALID_POS
+                },
+                inner: None,
+            });
+        }
+
+        // Initialize the dictionary
+        for e in dictionary.iter_mut() {
+            e.write(INVALID_POS);
+        }
+
+        // TODO: use std::slice::assume_init_mut() once it stabilizes
+        let buckets =
+            unsafe { std::slice::from_raw_parts_mut(buckets.as_mut_ptr().cast(), buckets.len()) };
+        let dictionary = unsafe {
+            std::slice::from_raw_parts_mut(dictionary.as_mut_ptr().cast(), dictionary.len())
+        };
+
+        Self {
+            dictionary,
+            buckets,
+            // With zero buckets there is no bucket 0 to point at; an empty freelist makes
+            // allocation fail with `FullError` instead of indexing out of bounds.
+            free_head: if num_buckets == 0 { INVALID_POS } else { 0 },
+            buckets_in_use: 0,
+            _user_list_head: INVALID_POS,
+            alloc_limit: INVALID_POS,
+        }
+    }
+
+    /// Look up `key`, whose hash must be `hash`, and return a reference to its value.
+    ///
+    /// Walks the hash chain that starts at the key's dictionary slot; returns `None` when the
+    /// chain ends without a match.
+    pub fn get_with_hash(&self, key: &K, hash: u64) -> Option<&V> {
+        let slot = hash as usize % self.dictionary.len();
+        let mut cur = self.dictionary[slot];
+        while cur != INVALID_POS {
+            let bucket = &self.buckets[cur as usize];
+            let (bucket_key, bucket_value) = bucket.inner.as_ref().expect("entry is in use");
+            if bucket_key == key {
+                return Some(bucket_value);
+            }
+            cur = bucket.next;
+        }
+        None
+    }
+
+    /// Returns the length of the bucket array.
+    pub fn get_num_buckets(&self) -> usize {
+        let Self { buckets, .. } = self;
+        buckets.len()
+    }
+
+    /// Clears all entries from the hashmap.
+    ///
+    /// Every bucket is emptied and re-chained into a fresh freelist, and every dictionary
+    /// slot is reset.
+    ///
+    /// Does not reset any allocation limits, but does clear any entries beyond them.
+    pub fn clear(&mut self) {
+        let num_buckets = self.buckets.len();
+        for (i, bucket) in self.buckets.iter_mut().enumerate() {
+            // Assigning drops whatever key-value pair was stored before.
+            *bucket = Bucket {
+                next: if i + 1 < num_buckets {
+                    i as u32 + 1
+                } else {
+                    INVALID_POS
+                },
+                inner: None,
+            };
+        }
+        self.dictionary.fill(INVALID_POS);
+
+        // With zero buckets there is no bucket 0 to point at; an empty freelist makes
+        // allocation fail with `FullError` instead of indexing out of bounds.
+        self.free_head = if num_buckets == 0 { INVALID_POS } else { 0 };
+        self.buckets_in_use = 0;
+    }
+
+    /// Take a bucket off the freelist, store `(key, value)` in it and return its index.
+    ///
+    /// Free buckets at or beyond `alloc_limit` are skipped. The returned bucket is not linked
+    /// into any hash chain; its `next` is set to [`INVALID_POS`].
+    ///
+    /// # Errors
+    /// Returns [`FullError`] if the freelist contains no bucket below `alloc_limit`.
+    pub(crate) fn alloc_bucket(&mut self, key: K, value: V) -> Result<u32, FullError> {
+        // Find the first bucket we're *allowed* to use, remembering its predecessor.
+        let mut prev = PrevPos::First(self.free_head);
+        let mut pos = self.free_head;
+        loop {
+            if pos == INVALID_POS {
+                return Err(FullError());
+            }
+            if pos < self.alloc_limit {
+                break;
+            }
+            prev = PrevPos::Chained(pos);
+            pos = self.buckets[pos as usize].next;
+        }
+
+        // Repair the freelist by unlinking the chosen bucket from its predecessor.
+        let successor = self.buckets[pos as usize].next;
+        match prev {
+            PrevPos::First(_) => self.free_head = successor,
+            PrevPos::Chained(p) if p != INVALID_POS => self.buckets[p as usize].next = successor,
+            PrevPos::Chained(_) => {}
+            PrevPos::Unknown(_) => unreachable!(),
+        }
+
+        // Initialize the bucket.
+        self.buckets_in_use += 1;
+        let chosen = &mut self.buckets[pos as usize];
+        chosen.next = INVALID_POS;
+        chosen.inner = Some((key, value));
+
+        Ok(pos)
+    }
+}
--- a/libs/neon-shmem/src/hash/entry.rs
+++ b/libs/neon-shmem/src/hash/entry.rs
@@ -0,0 +1,138 @@
+//! Equivalent of [`std::collections::hash_map::Entry`] for this hashmap.
+
+use crate::hash::core::{CoreHashMap, FullError, INVALID_POS};
+use crate::sync::{RwLockWriteGuard, ValueWriteGuard};
+
+use std::hash::Hash;
+use std::mem;
+
+/// A view into a single key's slot in the map, which is either occupied or vacant.
+///
+/// Both variants own the map's write guard (the `map` field of the wrapped entry type).
+pub enum Entry<'a, 'b, K, V> {
+    /// An entry that already holds a value.
+    Occupied(OccupiedEntry<'a, 'b, K, V>),
+    /// An entry with no value yet; see [`VacantEntry::insert`].
+    Vacant(VacantEntry<'a, 'b, K, V>),
+}
+
+/// Enum representing the previous position within a chain.
+#[derive(Clone, Copy)]
+pub(crate) enum PrevPos {
+    /// Starting index within the dictionary.
+    First(u32),
+    /// Regular index within the buckets.
+    Chained(u32),
+    /// Unknown - e.g. the associated entry was retrieved by index instead of chain.
+    ///
+    /// The payload is the hash of the entry's key; [`OccupiedEntry::remove`] reduces it modulo
+    /// the dictionary length to find the chain that has to be walked.
+    Unknown(u64),
+}
+
+/// A view into an occupied bucket of the map, holding the map's write guard.
+pub struct OccupiedEntry<'a, 'b, K, V> {
+    /// Write guard on the map containing this entry.
+    pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
+    /// The key of the occupied entry
+    pub(crate) _key: K,
+    /// The index of the previous entry in the chain.
+    pub(crate) prev_pos: PrevPos,
+    /// The position of the bucket in the [`CoreHashMap`] bucket array.
+    pub(crate) bucket_pos: u32,
+}
+
+impl<K, V> OccupiedEntry<'_, '_, K, V> {
+    /// Returns a shared reference to the value stored in this entry's bucket.
+    ///
+    /// # Panics
+    /// Panics if the bucket holds no key/value pair.
+    pub fn get(&self) -> &V {
+        &self.map.buckets[self.bucket_pos as usize]
+            .inner
+            .as_ref()
+            .unwrap()
+            .1
+    }
+
+    /// Returns a mutable reference to the value stored in this entry's bucket.
+    ///
+    /// # Panics
+    /// Panics if the bucket holds no key/value pair.
+    pub fn get_mut(&mut self) -> &mut V {
+        &mut self.map.buckets[self.bucket_pos as usize]
+            .inner
+            .as_mut()
+            .unwrap()
+            .1
+    }
+
+    /// Inserts a value into the entry, replacing (and returning) the existing value.
+    pub fn insert(&mut self, value: V) -> V {
+        let bucket = &mut self.map.buckets[self.bucket_pos as usize];
+        // This assumes inner is Some, which it must be for an OccupiedEntry
+        mem::replace(&mut bucket.inner.as_mut().unwrap().1, value)
+    }
+
+    /// Removes the entry from the hash map, returning the value originally stored within it.
+    ///
+    /// This may result in multiple bucket accesses if the entry was obtained by index as the
+    /// previous chain entry needs to be discovered in this case: when `prev_pos` is
+    /// [`PrevPos::Unknown`], the chain for the carried hash is walked from its dictionary slot
+    /// until this bucket is found.
+    ///
+    /// # Panics
+    /// Panics if that chain walk reaches the end of the chain without finding this bucket, or
+    /// if the bucket holds no key/value pair.
+    pub fn remove(mut self) -> V {
+        // If this bucket was queried by index, go ahead and follow its chain from the start.
+        let prev = if let PrevPos::Unknown(hash) = self.prev_pos {
+            let dict_idx = hash as usize % self.map.dictionary.len();
+            let mut prev = PrevPos::First(dict_idx as u32);
+            let mut curr = self.map.dictionary[dict_idx];
+            while curr != self.bucket_pos {
+                // Running off the end of the chain means this bucket is not linked into it.
+                assert!(curr != INVALID_POS);
+                prev = PrevPos::Chained(curr);
+                curr = self.map.buckets[curr as usize].next;
+            }
+            prev
+        } else {
+            self.prev_pos
+        };
+
+        // The bucket being removed; its `next` link is handed to whatever pointed at it.
+        let bucket = &mut self.map.buckets[self.bucket_pos as usize];
+
+        // unlink it from the chain
+        match prev {
+            PrevPos::First(dict_pos) => {
+                self.map.dictionary[dict_pos as usize] = bucket.next;
+            }
+            PrevPos::Chained(bucket_pos) => {
+                self.map.buckets[bucket_pos as usize].next = bucket.next;
+            }
+            // `Unknown` was resolved to `First`/`Chained` by the walk above.
+            _ => unreachable!(),
+        }
+
+        // and add it to the freelist
+        let free = self.map.free_head;
+        let bucket = &mut self.map.buckets[self.bucket_pos as usize];
+        let old_value = bucket.inner.take();
+        bucket.next = free;
+        self.map.free_head = self.bucket_pos;
+        self.map.buckets_in_use -= 1;
+
+        old_value.unwrap().1
+    }
+}
+
+/// An abstract view into a vacant entry within the map.
+///
+/// Holds the map's write guard until the entry is consumed or dropped.
+pub struct VacantEntry<'a, 'b, K, V> {
+    /// Write guard on the map containing this entry.
+    pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
+    /// The key to be inserted into this entry.
+    pub(crate) key: K,
+    /// The position within the dictionary corresponding to the key's hash.
+    pub(crate) dict_pos: u32,
+}
+
+impl<'b, K: Clone + Hash + Eq, V> VacantEntry<'_, 'b, K, V> {
+    /// Insert a value into the vacant entry, finding and populating an empty bucket in the process.
+    ///
+    /// The new bucket becomes the head of the chain for this key's dictionary slot. The returned
+    /// guard is derived from the map's write guard and points at the freshly stored value.
+    ///
+    /// # Errors
+    /// Will return [`FullError`] if there are no unoccupied buckets available in the map.
+    pub fn insert(mut self, value: V) -> Result<ValueWriteGuard<'b, V>, FullError> {
+        // `alloc_bucket` reports a full map through `Err`, so on success `pos` is always a real
+        // bucket index and needs no further validation.
+        let pos = self.map.alloc_bucket(self.key, value)?;
+
+        // Push the new bucket onto the front of the chain for this dictionary slot.
+        self.map.buckets[pos as usize].next = self.map.dictionary[self.dict_pos as usize];
+        self.map.dictionary[self.dict_pos as usize] = pos;
+
+        Ok(RwLockWriteGuard::map(self.map, |m| {
+            &mut m.buckets[pos as usize].inner.as_mut().unwrap().1
+        }))
+    }
+}
--- a/libs/neon-shmem/src/hash/tests.rs
+++ b/libs/neon-shmem/src/hash/tests.rs
@@ -0,0 +1,429 @@
+use std::collections::BTreeMap;
+use std::collections::HashSet;
+use std::fmt::Debug;
+use std::mem::MaybeUninit;
+
+use crate::hash::Entry;
+use crate::hash::HashMapAccess;
+use crate::hash::HashMapInit;
+use crate::hash::core::FullError;
+
+use rand::seq::SliceRandom;
+use rand::{Rng, RngCore};
+use rand_distr::Zipf;
+
+/// Width in bytes of a [`TestKey`]; 16 bytes so that a key round-trips through a `u128`.
+const TEST_KEY_LEN: usize = 16;
+
+/// Fixed-width byte-array key used by the tests in this module.
+#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+struct TestKey([u8; TEST_KEY_LEN]);
+
+/// Interprets the key bytes as a big-endian `u128`.
+impl From<&TestKey> for u128 {
+    fn from(val: &TestKey) -> u128 {
+        u128::from_be_bytes(val.0)
+    }
+}
+
+/// Stores the `u128` as its big-endian bytes.
+impl From<u128> for TestKey {
+    fn from(val: u128) -> TestKey {
+        TestKey(val.to_be_bytes())
+    }
+}
+
+/// Copies a byte slice into a key; panics if the slice is not exactly `TEST_KEY_LEN` bytes.
+impl<'a> From<&'a [u8]> for TestKey {
+    fn from(bytes: &'a [u8]) -> TestKey {
+        TestKey(bytes.try_into().unwrap())
+    }
+}
+
+/// Inserts every key into a fresh map with its slice index as the value, then reads each key
+/// back and checks that it maps to that index.
+fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
+    let w = HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, "test_inserts")
+        .attach_writer();
+
+    // Populate: overwrite if the key is already present, otherwise claim a new bucket.
+    for (idx, &k) in keys.iter().enumerate() {
+        match w.entry(k.into()) {
+            Entry::Occupied(mut occupied) => {
+                occupied.insert(idx);
+            }
+            Entry::Vacant(vacant) => assert!(vacant.insert(idx).is_ok()),
+        }
+    }
+
+    // Verify: every key must now resolve to the index it was inserted with.
+    for (idx, &k) in keys.iter().enumerate() {
+        let stored = w.get(&k.into()).as_deref().copied();
+        assert_eq!(stored, Some(idx));
+    }
+}
+
+/// Inserts small, densely packed integer keys, first in order and then in shuffled orders.
+#[test]
+fn dense() {
+    // A handful of small keys plus one further out.
+    // NOTE(review): this used to say it "exercises splitting a node with prefix", which reads
+    // like a leftover from a tree-based map test; confirm and drop this note.
+    let keys: &[u128] = &[0, 1, 2, 3, 256];
+    test_inserts(keys);
+
+    // Dense keys
+    let mut keys: Vec<u128> = (0..10000).collect();
+    test_inserts(&keys);
+
+    // Do the same in random orders (the range `1..10` gives nine shuffled rounds)
+    for _ in 1..10 {
+        keys.shuffle(&mut rand::rng());
+        test_inserts(&keys);
+    }
+}
+
+/// Inserts 10000 distinct, uniformly random `u128` keys.
+#[test]
+fn sparse() {
+    let mut keys: Vec<TestKey> = Vec::new();
+    let mut seen = HashSet::new();
+    // Keep drawing until we have collected enough keys; `HashSet::insert` returns `false` for a
+    // value that was already drawn, in which case we simply draw again.
+    while keys.len() < 10000 {
+        let candidate = rand::random::<u128>();
+        if seen.insert(candidate) {
+            keys.push(candidate.into());
+        }
+    }
+    test_inserts(&keys);
+}
+
+/// One operation on a key: `Some(v)` inserts or overwrites with `v`, `None` removes the key
+/// (this is how `apply_op` interprets it).
+#[derive(Clone, Debug)]
+struct TestOp(TestKey, Option<usize>);
+
+/// Applies `op` to both the shadow `BTreeMap` and the hash map, and checks that both report the
+/// same previous value for the key.
+fn apply_op(
+    op: &TestOp,
+    map: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    // The shadow tree goes first; it is the reference the hash map is compared against.
+    let shadow_existing = match op.1 {
+        Some(v) => shadow.insert(op.0, v),
+        None => shadow.remove(&op.0),
+    };
+
+    // Decide on the requested operation and the key's current state in one match.
+    let hash_existing = match (op.1, map.entry(op.0)) {
+        (Some(new), Entry::Occupied(mut occupied)) => Some(occupied.insert(new)),
+        (Some(new), Entry::Vacant(vacant)) => {
+            _ = vacant.insert(new).unwrap();
+            None
+        }
+        (None, Entry::Occupied(occupied)) => Some(occupied.remove()),
+        (None, Entry::Vacant(_)) => None,
+    };
+
+    assert_eq!(shadow_existing, hash_existing);
+}
+
+/// Applies `num_ops` random operations, on keys drawn uniformly from `0..size`, to both the
+/// hash map and the shadow map.
+///
+/// NOTE: despite its name, `del_prob` is the probability that an operation is an
+/// insert/overwrite (`Some(i)`); a removal (`None`) is generated with probability
+/// `1 - del_prob`.
+fn do_random_ops(
+    num_ops: usize,
+    size: u32,
+    del_prob: f64,
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+    rng: &mut rand::rngs::ThreadRng,
+) {
+    for i in 0..num_ops {
+        let key: TestKey = ((rng.next_u32() % size) as u128).into();
+        let op = TestOp(
+            key,
+            // `true` selects the insert branch; the operation index doubles as the value.
+            if rng.random_bool(del_prob) {
+                Some(i)
+            } else {
+                None
+            },
+        );
+        apply_op(&op, writer, shadow);
+    }
+}
+
+/// Removes `num_ops` entries from both maps, taking the smallest shadow key each time.
+///
+/// Panics if the shadow map runs out of entries before `num_ops` removals are done.
+fn do_deletes(
+    num_ops: usize,
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    for _ in 0..num_ops {
+        let (k, _) = shadow.pop_first().unwrap();
+        writer.remove(&k);
+    }
+}
+
+/// Shrinks the map to `to` buckets: begins the shrink, removes entries (smallest shadow key
+/// first) until no more than `to` buckets are in use, then finishes the shrink.
+///
+/// Asserts that no shrink is pending before and after, and that finishing the shrink does not
+/// change the number of buckets in use.
+fn do_shrink(
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+    to: u32,
+) {
+    assert!(writer.shrink_goal().is_none());
+    writer.begin_shrink(to);
+    assert_eq!(writer.shrink_goal(), Some(to as usize));
+    // Evict until usage fits the goal; panics if the shadow map empties first.
+    while writer.get_num_buckets_in_use() > to as usize {
+        let (k, _) = shadow.pop_first().unwrap();
+        let entry = writer.entry(k);
+        if let Entry::Occupied(e) = entry {
+            e.remove();
+        }
+    }
+    let old_usage = writer.get_num_buckets_in_use();
+    writer.finish_shrink().unwrap();
+    assert!(writer.shrink_goal().is_none());
+    assert_eq!(writer.get_num_buckets_in_use(), old_usage);
+}
+
+/// Runs 100000 random operations on keys sampled from a Zipf distribution, checking the hash
+/// map against a shadow `BTreeMap` after every operation.
+#[test]
+fn random_ops() {
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, "test_random")
+            .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+
+    let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
+    let mut rng = rand::rng();
+    for i in 0..100000 {
+        let key: TestKey = (rng.sample(distribution) as u128).into();
+
+        // 75% of operations insert/overwrite (`Some`), the rest remove (`None`).
+        let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
+
+        apply_op(&op, &mut writer, &mut shadow);
+    }
+}
+
+/// Checks that the map still agrees with the shadow map after `shuffle()` is called between two
+/// rounds of random operations.
+#[test]
+fn test_shuffle() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, "test_shuf")
+        .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
+    writer.shuffle();
+    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
+}
+
+/// Grows the map from 1000 to 1500 buckets and checks that usage is unchanged, the bucket count
+/// is updated, and the map keeps agreeing with the shadow map afterwards.
+#[test]
+fn test_grow() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 2000, "test_grow")
+        .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
+    let old_usage = writer.get_num_buckets_in_use();
+    writer.grow(1500).unwrap();
+    assert_eq!(writer.get_num_buckets_in_use(), old_usage);
+    assert_eq!(writer.get_num_buckets(), 1500);
+    // The larger key range now fits into the grown map.
+    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+}
+
+/// Checks that `clear()` empties the map without changing its bucket count, and that a cleared
+/// map can be refilled to capacity and cleared again.
+#[test]
+fn test_clear() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_clear")
+        .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+    do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+    writer.clear();
+    assert_eq!(writer.get_num_buckets_in_use(), 0);
+    assert_eq!(writer.get_num_buckets(), 1500);
+    // None of the previously inserted keys may survive the clear; this also empties `shadow`.
+    while let Some((key, _)) = shadow.pop_first() {
+        assert!(writer.get(&key).is_none());
+    }
+    do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+    // Fill the remaining buckets with keys >= 1500, which the random ops above never use.
+    for i in 0..(1500 - writer.get_num_buckets_in_use()) {
+        writer.insert((1500 + i as u128).into(), 0).unwrap();
+    }
+    // The map is full now, so one more insert must fail...
+    assert_eq!(writer.insert(5000.into(), 0), Err(FullError {}));
+    // ...and succeed again once the map has been cleared.
+    writer.clear();
+    assert!(writer.insert(5000.into(), 0).is_ok());
+}
+
+/// Removes entries looked up by bucket index and checks that every entry still tracked by the
+/// shadow map remains readable with its expected value.
+#[test]
+fn test_idx_remove() {
+    // Name the shared memory area after this test rather than reusing "test_clear".
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_idx_remove")
+            .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+    do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
+    // Pick random bucket indexes; wherever one is occupied, remove it from both maps.
+    for _ in 0..100 {
+        let idx = (rng.next_u32() % 1500) as usize;
+        if let Some(e) = writer.entry_at_bucket(idx) {
+            shadow.remove(&e._key);
+            e.remove();
+        }
+    }
+    // Everything the shadow map still holds must be intact in the hash map.
+    while let Some((key, val)) = shadow.pop_first() {
+        assert_eq!(*writer.get(&key).unwrap(), val);
+    }
+}
+
+/// Checks that a value obtained through `get_at_bucket(idx)` maps back to `idx` via
+/// `get_bucket_for_value`.
+#[test]
+fn test_idx_get() {
+    // Name the shared memory area after this test rather than reusing "test_clear".
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_idx_get")
+        .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+    do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
+    for _ in 0..100 {
+        let idx = (rng.next_u32() % 1500) as usize;
+        // Only occupied buckets yield a pair. The original ran this identical check twice in a
+        // row; once is sufficient.
+        if let Some(pair) = writer.get_at_bucket(idx) {
+            let v: *const usize = &pair.1;
+            assert_eq!(writer.get_bucket_for_value(v), idx);
+        }
+    }
+}
+
+/// Shrinks a populated map from 1500 to 1000 buckets and checks that later operations stay
+/// within the new size.
+#[test]
+fn test_shrink() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink")
+        .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+    do_shrink(&mut writer, &mut shadow, 1000);
+    assert_eq!(writer.get_num_buckets(), 1000);
+    // Free up room, then keep operating on a key range smaller than the shrunken map.
+    do_deletes(500, &mut writer, &mut shadow);
+    do_random_ops(10000, 500, 0.75, &mut writer, &mut shadow, &mut rng);
+    assert!(writer.get_num_buckets_in_use() <= 1000);
+}
+
+/// Alternates shrinking and growing (1000 -> 750 -> 1500 -> 200 -> 10000 buckets) with random
+/// operations in between, checking against the shadow map throughout.
+#[test]
+fn test_shrink_grow_seq() {
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 20000, "test_grow_seq")
+            .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(500, 1000, 0.1, &mut writer, &mut shadow, &mut rng);
+    eprintln!("Shrinking to 750");
+    do_shrink(&mut writer, &mut shadow, 750);
+    do_random_ops(200, 1000, 0.5, &mut writer, &mut shadow, &mut rng);
+    eprintln!("Growing to 1500");
+    writer.grow(1500).unwrap();
+    do_random_ops(600, 1500, 0.1, &mut writer, &mut shadow, &mut rng);
+    eprintln!("Shrinking to 200");
+    // Trim both maps down to at most 100 entries before shrinking.
+    while shadow.len() > 100 {
+        do_deletes(1, &mut writer, &mut shadow);
+    }
+    do_shrink(&mut writer, &mut shadow, 200);
+    do_random_ops(50, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
+    eprintln!("Growing to 10k");
+    writer.grow(10000).unwrap();
+    do_random_ops(10000, 5000, 0.25, &mut writer, &mut shadow, &mut rng);
+}
+
+/// Exercises the bucket-index accessors on a map that holds a single entry.
+#[test]
+fn test_bucket_ops() {
+    let writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, "test_bucket_ops")
+        .attach_writer();
+
+    // Store 1 -> 2, whichever state the entry happens to be in.
+    match writer.entry(1.into()) {
+        Entry::Occupied(mut occupied) => {
+            occupied.insert(2);
+        }
+        Entry::Vacant(vacant) => {
+            _ = vacant.insert(2).unwrap();
+        }
+    }
+    assert_eq!(writer.get_num_buckets_in_use(), 1);
+    assert_eq!(writer.get_num_buckets(), 1000);
+    assert_eq!(*writer.get(&1.into()).unwrap(), 2);
+
+    // Look the key up again to learn which bucket it landed in.
+    let pos = match writer.entry(1.into()) {
+        Entry::Occupied(occupied) => {
+            assert_eq!(occupied._key, 1.into());
+            occupied.bucket_pos as usize
+        }
+        Entry::Vacant(_) => panic!("Insert didn't affect entry"),
+    };
+
+    // The same entry must be reachable by bucket index and by value pointer.
+    assert_eq!(writer.entry_at_bucket(pos).unwrap()._key, 1.into());
+    assert_eq!(*writer.get_at_bucket(pos).unwrap(), (1.into(), 2));
+    {
+        let ptr: *const usize = &*writer.get(&1.into()).unwrap();
+        assert_eq!(writer.get_bucket_for_value(ptr), pos);
+    }
+
+    // Once removed, the key must no longer be found.
+    writer.remove(&1.into());
+    assert!(writer.get(&1.into()).is_none());
+}
+
+/// Shrinks a map to zero buckets, checks that inserts then fail, and that they succeed again
+/// after growing.
+#[test]
+fn test_shrink_zero() {
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink_zero")
+            .attach_writer();
+    writer.begin_shrink(0);
+    // Remove whatever occupies each bucket before finishing the shrink.
+    for i in 0..1500 {
+        if let Some(occupied) = writer.entry_at_bucket(i) {
+            occupied.remove();
+        }
+    }
+    writer.finish_shrink().unwrap();
+    assert_eq!(writer.get_num_buckets_in_use(), 0);
+
+    // With zero buckets there is nowhere to put a new entry.
+    match writer.entry(1.into()) {
+        Entry::Vacant(v) => assert!(v.insert(2).is_err()),
+        Entry::Occupied(_) => panic!("Somehow got non-vacant entry in empty map."),
+    }
+
+    // After growing, the same insert must go through.
+    writer.grow(50).unwrap();
+    match writer.entry(1.into()) {
+        Entry::Vacant(v) => assert!(v.insert(2).is_ok()),
+        Entry::Occupied(_) => panic!("Somehow got non-vacant entry in empty map."),
+    }
+    assert_eq!(writer.get_num_buckets_in_use(), 1);
+}
+
+/// Growing to 20000 buckets in a map created with a maximum of 2000 must panic (either inside
+/// `grow` or when unwrapping its result).
+#[test]
+#[should_panic]
+fn test_grow_oom() {
+    let writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_grow_oom")
+        .attach_writer();
+    writer.grow(20000).unwrap();
+}
+
+/// Beginning a "shrink" to 2000 buckets on a map that has only 1500 must panic.
+#[test]
+#[should_panic]
+fn test_shrink_bigger() {
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2500, "test_shrink_bigger")
+            .attach_writer();
+    writer.begin_shrink(2000);
+}
+
+/// Finishing a shrink that was never begun must panic (either inside `finish_shrink` or when
+/// unwrapping its result).
+#[test]
+#[should_panic]
+fn test_shrink_early_finish() {
+    let writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2500, "test_shrink_early_finish")
+            .attach_writer();
+    writer.finish_shrink().unwrap();
+}
+
+/// Beginning a shrink on a map built with `with_fixed` over a caller-provided buffer must panic.
+#[test]
+#[should_panic]
+fn test_shrink_fixed_size() {
+    // Caller-provided backing storage for the fixed-size map.
+    let mut area = [MaybeUninit::uninit(); 10000];
+    let init_struct = HashMapInit::<TestKey, usize>::with_fixed(3, &mut area);
+    let mut writer = init_struct.attach_writer();
+    writer.begin_shrink(1);
+}
--- a/libs/neon-shmem/src/lib.rs
+++ b/libs/neon-shmem/src/lib.rs
@@ -1,418 +1,5 @@
 //! Shared memory utilities for neon communicator

-use std::num::NonZeroUsize;
-use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
-use std::ptr::NonNull;
-use std::sync::atomic::{AtomicUsize, Ordering};
-
-use nix::errno::Errno;
-use nix::sys::mman::MapFlags;
-use nix::sys::mman::ProtFlags;
-use nix::sys::mman::mmap as nix_mmap;
-use nix::sys::mman::munmap as nix_munmap;
-use nix::unistd::ftruncate as nix_ftruncate;
-
-/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
-/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
-/// specified at creation.
-///
-/// The area is backed by an anonymous file created with memfd_create(). The full address space for
-/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
-/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
-/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
-/// future.
-pub struct ShmemHandle {
-    /// memfd file descriptor
-    fd: OwnedFd,
-
-    max_size: usize,
-
-    // Pointer to the beginning of the shared memory area. The header is stored there.
-    shared_ptr: NonNull<SharedStruct>,
-
-    // Pointer to the beginning of the user data
-    pub data_ptr: NonNull<u8>,
-}
-
-/// This is stored at the beginning in the shared memory area.
-struct SharedStruct {
-    max_size: usize,
-
-    /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
-    current_size: AtomicUsize,
-}
-
-const RESIZE_IN_PROGRESS: usize = 1 << 63;
-
-const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
-
-/// Error type returned by the ShmemHandle functions.
-#[derive(thiserror::Error, Debug)]
-#[error("{msg}: {errno}")]
-pub struct Error {
-    pub msg: String,
-    pub errno: Errno,
-}
-
-impl Error {
-    fn new(msg: &str, errno: Errno) -> Error {
-        Error {
-            msg: msg.to_string(),
-            errno,
-        }
-    }
-}
-
-impl ShmemHandle {
-    /// Create a new shared memory area. To communicate between processes, the processes need to be
-    /// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
-    ///
-    /// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
-    /// processes can continue using it, however.
-    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
-        // create the backing anonymous file.
-        let fd = create_backing_file(name)?;
-
-        Self::new_with_fd(fd, initial_size, max_size)
-    }
-
-    fn new_with_fd(
-        fd: OwnedFd,
-        initial_size: usize,
-        max_size: usize,
-    ) -> Result<ShmemHandle, Error> {
-        // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
-        // is a little larger than this because of the SharedStruct header. Make the upper limit
-        // somewhat smaller than that, because with anything close to that, you'll run out of
-        // memory anyway.
-        if max_size >= 1 << 48 {
-            panic!("max size {max_size} too large");
-        }
-        if initial_size > max_size {
-            panic!("initial size {initial_size} larger than max size {max_size}");
-        }
-
-        // The actual initial / max size is the one given by the caller, plus the size of
-        // 'SharedStruct'.
-        let initial_size = HEADER_SIZE + initial_size;
-        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
-
-        // Reserve address space for it with mmap
-        //
-        // TODO: Use MAP_HUGETLB if possible
-        let start_ptr = unsafe {
-            nix_mmap(
-                None,
-                max_size,
-                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
-                MapFlags::MAP_SHARED,
-                &fd,
-                0,
-            )
-        }
-        .map_err(|e| Error::new("mmap failed: {e}", e))?;
-
-        // Reserve space for the initial size
-        enlarge_file(fd.as_fd(), initial_size as u64)?;
-
-        // Initialize the header
-        let shared: NonNull<SharedStruct> = start_ptr.cast();
-        unsafe {
-            shared.write(SharedStruct {
-                max_size: max_size.into(),
-                current_size: AtomicUsize::new(initial_size),
-            })
-        };
-
-        // The user data begins after the header
-        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
-
-        Ok(ShmemHandle {
-            fd,
-            max_size: max_size.into(),
-            shared_ptr: shared,
-            data_ptr,
-        })
-    }
-
-    // return reference to the header
-    fn shared(&self) -> &SharedStruct {
-        unsafe { self.shared_ptr.as_ref() }
-    }
-
-    /// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
-    /// when creating the area.
-    ///
-    /// This may only be called from one process/thread concurrently. We detect that case
-    /// and return an Error.
-    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
-        let new_size = new_size + HEADER_SIZE;
-        let shared = self.shared();
-
-        if new_size > self.max_size {
-            panic!(
-                "new size ({} is greater than max size ({})",
-                new_size, self.max_size
-            );
-        }
-        assert_eq!(self.max_size, shared.max_size);
-
-        // Lock the area by setting the bit in 'current_size'
-        //
-        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
-        // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
-        // since this is not performance-critical, better safe than sorry .
-        let mut old_size = shared.current_size.load(Ordering::Acquire);
-        loop {
-            if (old_size & RESIZE_IN_PROGRESS) != 0 {
-                return Err(Error::new(
-                    "concurrent resize detected",
-                    Errno::UnknownErrno,
-                ));
-            }
-            match shared.current_size.compare_exchange(
-                old_size,
-                new_size,
-                Ordering::Acquire,
-                Ordering::Relaxed,
-            ) {
-                Ok(_) => break,
-                Err(x) => old_size = x,
-            }
-        }
-
-        // Ok, we got the lock.
-        //
-        // NB: If anything goes wrong, we *must* clear the bit!
-        let result = {
-            use std::cmp::Ordering::{Equal, Greater, Less};
-            match new_size.cmp(&old_size) {
-                Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
-                    Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
-                }),
-                Equal => Ok(()),
-                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
-            }
-        };
-
-        // Unlock
-        shared.current_size.store(
-            if result.is_ok() { new_size } else { old_size },
-            Ordering::Release,
-        );
-
-        result
-    }
-
-    /// Returns the current user-visible size of the shared memory segment.
-    ///
-    /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
-    /// responsibility not to access the area beyond the current size.
-    pub fn current_size(&self) -> usize {
-        let total_current_size =
-            self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
-        total_current_size - HEADER_SIZE
-    }
-}
-
-impl Drop for ShmemHandle {
-    fn drop(&mut self) {
-        // SAFETY: The pointer was obtained from mmap() with the given size.
-        // We unmap the entire region.
-        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
-        // The fd is dropped automatically by OwnedFd.
-    }
-}
-
-/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
-/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
-/// development and testing, but in production we want the file to stay in memory.
-///
-/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
-#[allow(unused_variables)]
-fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
-    #[cfg(not(target_os = "macos"))]
-    {
-        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
-            .map_err(|e| Error::new("memfd_create failed: {e}", e))
-    }
-    #[cfg(target_os = "macos")]
-    {
-        let file = tempfile::tempfile().map_err(|e| {
-            Error::new(
-                "could not create temporary file to back shmem area: {e}",
-                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
-            )
-        })?;
-        Ok(OwnedFd::from(file))
-    }
-}
-
-fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
-    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
-    // we don't get a segfault later when trying to actually use it.
-    #[cfg(not(target_os = "macos"))]
-    {
-        nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
-            Error::new(
-                "could not grow shmem segment, posix_fallocate failed: {e}",
-                e,
-            )
-        })
-    }
-    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
-    #[cfg(target_os = "macos")]
-    {
-        nix::unistd::ftruncate(fd, size as i64)
-            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    use nix::unistd::ForkResult;
-    use std::ops::Range;
-
-    /// check that all bytes in given range have the expected value.
-    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
-        for i in range {
-            let b = unsafe { *(ptr.add(i)) };
-            assert_eq!(expected, b, "unexpected byte at offset {i}");
-        }
-    }
-
-    /// Write 'b' to all bytes in the given range
-    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
-        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
-    }
-
-    // simple single-process test of growing and shrinking
-    #[test]
-    fn test_shmem_resize() -> Result<(), Error> {
-        let max_size = 1024 * 1024;
-        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
-
-        assert_eq!(init_struct.current_size(), 0);
-
-        // Initial grow
-        let size1 = 10000;
-        init_struct.set_size(size1).unwrap();
-        assert_eq!(init_struct.current_size(), size1);
-
-        // Write some data
-        let data_ptr = init_struct.data_ptr.as_ptr();
-        write_range(data_ptr, 0xAA, 0..size1);
-        assert_range(data_ptr, 0xAA, 0..size1);
-
-        // Shrink
-        let size2 = 5000;
-        init_struct.set_size(size2).unwrap();
-        assert_eq!(init_struct.current_size(), size2);
-
-        // Grow again
-        let size3 = 20000;
-        init_struct.set_size(size3).unwrap();
-        assert_eq!(init_struct.current_size(), size3);
-
-        // Try to read it. The area that was shrunk and grown again should read as all zeros now
-        assert_range(data_ptr, 0xAA, 0..5000);
-        assert_range(data_ptr, 0, 5000..size1);
-
-        // Try to grow beyond max_size
-        //let size4 = max_size + 1;
-        //assert!(init_struct.set_size(size4).is_err());
-
-        // Dropping init_struct should unmap the memory
-        drop(init_struct);
-
-        Ok(())
-    }
-
-    /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
-    /// but is stored in the shared memory area and works across processes. It's implemented by
-    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
-    struct SimpleBarrier {
-        num_procs: usize,
-        count: AtomicUsize,
-    }
-
-    impl SimpleBarrier {
-        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
-            unsafe {
-                *ptr = SimpleBarrier {
-                    num_procs,
-                    count: AtomicUsize::new(0),
-                }
-            }
-        }
-
-        pub fn wait(&self) {
-            let old = self.count.fetch_add(1, Ordering::Relaxed);
-
-            let generation = old / self.num_procs;
-
-            let mut current = old + 1;
-            while current < (generation + 1) * self.num_procs {
-                std::thread::sleep(std::time::Duration::from_millis(10));
-                current = self.count.load(Ordering::Relaxed);
-            }
-        }
-    }
-
-    #[test]
-    fn test_multi_process() {
-        // Initialize
-        let max_size = 1_000_000_000_000;
-        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
-        let ptr = init_struct.data_ptr.as_ptr();
-
-        // Store the SimpleBarrier in the first 1k of the area.
-        init_struct.set_size(10000).unwrap();
-        let barrier_ptr: *mut SimpleBarrier = unsafe {
-            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
-                .cast()
-        };
-        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
-        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
-
-        // Fork another test process. The code after this runs in both processes concurrently.
-        let fork_result = unsafe { nix::unistd::fork().unwrap() };
-
-        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
-        if fork_result.is_parent() {
-            write_range(ptr, 0xAA, 1000..2000);
-        } else {
-            write_range(ptr, 0xBB, 2000..3000);
-        }
-        barrier.wait();
-        // Verify the contents. (in both processes)
-        assert_range(ptr, 0xAA, 1000..2000);
-        assert_range(ptr, 0xBB, 2000..3000);
-
-        // Grow, from the child this time
-        let size = 10_000_000;
-        if !fork_result.is_parent() {
-            init_struct.set_size(size).unwrap();
-        }
-        barrier.wait();
-
-        // make some writes at the end
-        if fork_result.is_parent() {
-            write_range(ptr, 0xAA, (size - 10)..size);
-        } else {
-            write_range(ptr, 0xBB, (size - 20)..(size - 10));
-        }
-        barrier.wait();
-
-        // Verify the contents. (This runs in both processes)
-        assert_range(ptr, 0, (size - 1000)..(size - 20));
-        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
-        assert_range(ptr, 0xAA, (size - 10)..size);
-
-        if let ForkResult::Parent { child } = fork_result {
-            nix::sys::wait::waitpid(child, None).unwrap();
-        }
-    }
-}
+pub mod hash;
+pub mod shmem;
+pub mod sync;
--- a/libs/neon-shmem/src/shmem.rs
+++ b/libs/neon-shmem/src/shmem.rs
@@ -0,0 +1,411 @@
+//! Dynamically resizable contiguous chunk of shared memory
+
+use std::num::NonZeroUsize;
+use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use nix::errno::Errno;
+use nix::sys::mman::MapFlags;
+use nix::sys::mman::ProtFlags;
+use nix::sys::mman::mmap as nix_mmap;
+use nix::sys::mman::munmap as nix_munmap;
+use nix::unistd::ftruncate as nix_ftruncate;
+
+/// `ShmemHandle` represents a shared memory area that can be shared by processes over `fork()`.
+/// Unlike shared memory allocated by Postgres, this area is resizable, up to `max_size` that's
+/// specified at creation.
+///
+/// The area is backed by an anonymous file created with `memfd_create()`. The full address space for
+/// `max_size` is reserved up-front with `mmap()`, but whenever you call [`ShmemHandle::set_size`],
+/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
+/// raises `SIGBUS` for pages that lie entirely past the end of the file (it does not grow the
+/// file), but we might use `mprotect()` etc. to enforce that explicitly in the future.
+#[derive(Debug)]
+pub struct ShmemHandle {
+    /// memfd file descriptor
+    fd: OwnedFd,
+
+    max_size: usize,
+
+    // Pointer to the beginning of the shared memory area. The header is stored there.
+    shared_ptr: NonNull<SharedStruct>,
+
+    // Pointer to the beginning of the user data
+    pub data_ptr: NonNull<u8>,
+}
+
+/// This is stored at the beginning in the shared memory area.
+#[derive(Debug)]
+struct SharedStruct {
+    max_size: usize,
+
+    /// Current size of the backing file. The high-order bit is used for the [`RESIZE_IN_PROGRESS`] flag.
+    current_size: AtomicUsize,
+}
+
+const RESIZE_IN_PROGRESS: usize = 1 << 63;
+
+const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
+
+/// Error type returned by the [`ShmemHandle`] functions.
+#[derive(thiserror::Error, Debug)]
+#[error("{msg}: {errno}")]
+pub struct Error {
+    pub msg: String,
+    pub errno: Errno,
+}
+
+impl Error {
+    fn new(msg: &str, errno: Errno) -> Self {
+        Self {
+            msg: msg.to_string(),
+            errno,
+        }
+    }
+}
+
+impl ShmemHandle {
+    /// Create a new shared memory area. To communicate between processes, the processes need to be
+    /// `fork()`'d after calling this, so that the `ShmemHandle` is inherited by all processes.
+    ///
+    /// If the `ShmemHandle` is dropped, the memory is unmapped from the current process. Other
+    /// processes can continue using it, however.
+    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<Self, Error> {
+        // create the backing anonymous file.
+        let fd = create_backing_file(name)?;
+
+        Self::new_with_fd(fd, initial_size, max_size)
+    }
+
+    /// Size the backing file, map it, and write the header. Both sizes exclude the header.
+    fn new_with_fd(fd: OwnedFd, initial_size: usize, max_size: usize) -> Result<Self, Error> {
+        // We reserve the high-order bit for the `RESIZE_IN_PROGRESS` flag, and the actual size
+        // is a little larger than this because of the SharedStruct header. Make the upper limit
+        // somewhat smaller than that, because with anything close to that, you'll run out of
+        // memory anyway.
+        assert!(max_size < 1 << 48, "max size {max_size} too large");
+
+        assert!(
+            initial_size <= max_size,
+            "initial size {initial_size} larger than max size {max_size}"
+        );
+
+        // The actual sizes are the ones given by the caller, plus the size of 'SharedStruct'.
+        let initial_size = HEADER_SIZE + initial_size;
+        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
+
+        // Reserve space for the initial size first: if this fails, nothing has been mapped yet.
+        enlarge_file(fd.as_fd(), initial_size as u64)?;
+
+        // Reserve address space for it with mmap
+        //
+        // TODO: Use MAP_HUGETLB if possible
+        let start_ptr = unsafe {
+            nix_mmap(
+                None,
+                max_size,
+                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
+                MapFlags::MAP_SHARED,
+                &fd,
+                0,
+            )
+        }
+        .map_err(|e| Error::new("mmap failed", e))?;
+
+        // Initialize the header
+        let shared: NonNull<SharedStruct> = start_ptr.cast();
+        unsafe {
+            shared.write(SharedStruct {
+                max_size: max_size.into(),
+                current_size: AtomicUsize::new(initial_size),
+            });
+        }
+
+        // The user data begins after the header
+        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
+
+        Ok(Self {
+            fd,
+            max_size: max_size.into(),
+            shared_ptr: shared,
+            data_ptr,
+        })
+    }
+
+    // return reference to the header
+    fn shared(&self) -> &SharedStruct {
+        unsafe { self.shared_ptr.as_ref() }
+    }
+
+    /// Resize the shared memory area. `new_size` must not be larger than the `max_size` specified
+    /// when creating the area.
+    ///
+    /// This may only be called from one process/thread at a time. A concurrent call is detected
+    /// (via the `RESIZE_IN_PROGRESS` bit) and fails with an [`shmem::Error`](Error).
+    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
+        let new_size = new_size + HEADER_SIZE;
+        let shared = self.shared();
+
+        assert!(
+            new_size <= self.max_size,
+            "new size ({new_size}) is greater than max size ({})",
+            self.max_size
+        );
+
+        assert_eq!(self.max_size, shared.max_size);
+
+        // Lock the area: set `RESIZE_IN_PROGRESS` in `current_size`, keeping the old size bits.
+        //
+        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
+        // and the `posix_fallocate`/`ftruncate` call is surely a synchronization point anyway. But
+        // since this is not performance-critical, better safe than sorry.
+        let mut old_size = shared.current_size.load(Ordering::Acquire);
+        loop {
+            if (old_size & RESIZE_IN_PROGRESS) != 0 {
+                return Err(Error::new(
+                    "concurrent resize detected",
+                    Errno::UnknownErrno,
+                ));
+            }
+            match shared.current_size.compare_exchange(
+                old_size,
+                old_size | RESIZE_IN_PROGRESS,
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => break,
+                Err(x) => old_size = x,
+            }
+        }
+
+        // Ok, we got the lock.
+        //
+        // NB: If anything goes wrong, we *must* clear the bit!
+        let result = {
+            use std::cmp::Ordering::{Equal, Greater, Less};
+            match new_size.cmp(&old_size) {
+                Less => nix_ftruncate(&self.fd, new_size as i64)
+                    .map_err(|e| Error::new("could not shrink shmem segment, ftruncate failed", e)),
+                Equal => Ok(()),
+                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
+            }
+        };
+
+        // Unlock
+        shared.current_size.store(
+            if result.is_ok() { new_size } else { old_size },
+            Ordering::Release,
+        );
+
+        result
+    }
+
+    /// Returns the current user-visible size of the shared memory segment.
+    ///
+    /// NOTE: a concurrent [`ShmemHandle::set_size()`] call can change the size at any time.
+    /// It is the caller's responsibility not to access the area beyond the current size.
+    pub fn current_size(&self) -> usize {
+        let total_current_size =
+            self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
+        total_current_size - HEADER_SIZE
+    }
+}
+
+impl Drop for ShmemHandle {
+    fn drop(&mut self) {
+        // SAFETY: The pointer was obtained from mmap() with the given size.
+        // We unmap the entire region.
+        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
+        // The fd is dropped automatically by OwnedFd.
+    }
+}
+
+/// Create a "backing file" for the shared memory area. On Linux, use `memfd_create()`, to create an
+/// anonymous in-memory file. On macOS, fall back to a regular file. That's good enough for
+/// development and testing, but in production we want the file to stay in memory.
+///
+/// Disable unused variables warnings because `name` is unused in the macos path.
+#[allow(unused_variables)]
+fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
+            .map_err(|e| Error::new("memfd_create failed", e))
+    }
+    #[cfg(target_os = "macos")]
+    {
+        let file = tempfile::tempfile().map_err(|e| {
+            Error::new(
+                "could not create temporary file to back shmem area",
+                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
+            )
+        })?;
+        Ok(OwnedFd::from(file))
+    }
+}
+
+fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
+    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
+    // we don't get a segfault later when trying to actually use it.
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::fcntl::posix_fallocate(fd, 0, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, posix_fallocate failed", e))
+    }
+    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'ftruncate'
+    #[cfg(target_os = "macos")]
+    {
+        nix::unistd::ftruncate(fd, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed", e))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use nix::unistd::ForkResult;
+    use std::ops::Range;
+
+    /// check that all bytes in given range have the expected value.
+    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
+        for i in range {
+            let b = unsafe { *(ptr.add(i)) };
+            assert_eq!(expected, b, "unexpected byte at offset {i}");
+        }
+    }
+
+    /// Write 'b' to all bytes in the given range
+    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
+        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
+    }
+
+    // simple single-process test of growing and shrinking
+    #[test]
+    fn test_shmem_resize() -> Result<(), Error> {
+        let max_size = 1024 * 1024;
+        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
+
+        assert_eq!(init_struct.current_size(), 0);
+
+        // Initial grow
+        let size1 = 10000;
+        init_struct.set_size(size1).unwrap();
+        assert_eq!(init_struct.current_size(), size1);
+
+        // Write some data
+        let data_ptr = init_struct.data_ptr.as_ptr();
+        write_range(data_ptr, 0xAA, 0..size1);
+        assert_range(data_ptr, 0xAA, 0..size1);
+
+        // Shrink
+        let size2 = 5000;
+        init_struct.set_size(size2).unwrap();
+        assert_eq!(init_struct.current_size(), size2);
+
+        // Grow again
+        let size3 = 20000;
+        init_struct.set_size(size3).unwrap();
+        assert_eq!(init_struct.current_size(), size3);
+
+        // Try to read it. The area that was shrunk and grown again should read as all zeros now
+        assert_range(data_ptr, 0xAA, 0..5000);
+        assert_range(data_ptr, 0, 5000..size1);
+
+        // Try to grow beyond max_size
+        //let size4 = max_size + 1;
+        //assert!(init_struct.set_size(size4).is_err());
+
+        // Dropping init_struct should unmap the memory
+        drop(init_struct);
+
+        Ok(())
+    }
+
+    /// This is used in tests to coordinate between test processes. It's like `std::sync::Barrier`,
+    /// but is stored in the shared memory area and works across processes. It's implemented by
+    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
+    struct SimpleBarrier {
+        num_procs: usize,
+        count: AtomicUsize,
+    }
+
+    impl SimpleBarrier {
+        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
+            unsafe {
+                ptr.write(SimpleBarrier {
+                    num_procs,
+                    count: AtomicUsize::new(0),
+                })
+            }
+        }
+
+        pub fn wait(&self) {
+            // AcqRel: publish our shared-memory writes, observe those of earlier arrivals.
+            let old = self.count.fetch_add(1, Ordering::AcqRel);
+            let generation = old / self.num_procs;
+            let mut current = old + 1;
+            while current < (generation + 1) * self.num_procs {
+                std::thread::sleep(std::time::Duration::from_millis(10));
+                // Acquire: pairs with the AcqRel increments of the other processes.
+                current = self.count.load(Ordering::Acquire);
+            }
+        }
+    }
+
+    #[test]
+    fn test_multi_process() {
+        // Initialize
+        let max_size = 1_000_000_000_000;
+        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
+        let ptr = init_struct.data_ptr.as_ptr();
+
+        // Store the SimpleBarrier in the first 1k of the area.
+        init_struct.set_size(10000).unwrap();
+        let barrier_ptr: *mut SimpleBarrier = unsafe {
+            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
+                .cast()
+        };
+        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
+        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
+
+        // Fork another test process. The code after this runs in both processes concurrently.
+        let fork_result = unsafe { nix::unistd::fork().unwrap() };
+
+        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, 1000..2000);
+        } else {
+            write_range(ptr, 0xBB, 2000..3000);
+        }
+        barrier.wait();
+        // Verify the contents. (in both processes)
+        assert_range(ptr, 0xAA, 1000..2000);
+        assert_range(ptr, 0xBB, 2000..3000);
+
+        // Grow, from the child this time
+        let size = 10_000_000;
+        if !fork_result.is_parent() {
+            init_struct.set_size(size).unwrap();
+        }
+        barrier.wait();
+
+        // make some writes at the end
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, (size - 10)..size);
+        } else {
+            write_range(ptr, 0xBB, (size - 20)..(size - 10));
+        }
+        barrier.wait();
+
+        // Verify the contents. (This runs in both processes)
+        assert_range(ptr, 0, (size - 1000)..(size - 20));
+        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
+        assert_range(ptr, 0xAA, (size - 10)..size);
+
+        if let ForkResult::Parent { child } = fork_result {
+            nix::sys::wait::waitpid(child, None).unwrap();
+        }
+    }
+}
--- a/libs/neon-shmem/src/sync.rs
+++ b/libs/neon-shmem/src/sync.rs
@@ -0,0 +1,104 @@
+//! Simple utilities akin to what's in [`std::sync`] but designed to work with shared memory.
+
+use std::mem::MaybeUninit;
+use std::ptr::NonNull;
+
+use nix::errno::Errno;
+
+pub type RwLock<T> = lock_api::RwLock<PthreadRwLock, T>;
+pub(crate) type RwLockReadGuard<'a, T> = lock_api::RwLockReadGuard<'a, PthreadRwLock, T>;
+pub type RwLockWriteGuard<'a, T> = lock_api::RwLockWriteGuard<'a, PthreadRwLock, T>;
+pub type ValueReadGuard<'a, T> = lock_api::MappedRwLockReadGuard<'a, PthreadRwLock, T>;
+pub type ValueWriteGuard<'a, T> = lock_api::MappedRwLockWriteGuard<'a, PthreadRwLock, T>;
+
+/// Shared memory read-write lock.
+pub struct PthreadRwLock(Option<NonNull<libc::pthread_rwlock_t>>);
+
+impl PthreadRwLock {
+    pub fn new(lock: *mut libc::pthread_rwlock_t) -> Self {
+        unsafe {
+            let mut attrs = MaybeUninit::uninit();
+            // Ignoring return value here - only possible error is OOM.
+            libc::pthread_rwlockattr_init(attrs.as_mut_ptr());
+            libc::pthread_rwlockattr_setpshared(attrs.as_mut_ptr(), libc::PTHREAD_PROCESS_SHARED);
+            // TODO(quantumish): worth making this function return Result?
+            libc::pthread_rwlock_init(lock, attrs.as_mut_ptr());
+            // Safety: POSIX specifies that "any function affecting the attributes
+            // object (including destruction) shall not affect any previously
+            // initialized read-write locks".
+            libc::pthread_rwlockattr_destroy(attrs.as_mut_ptr());
+            Self(Some(NonNull::new_unchecked(lock)))
+        }
+    }
+
+    /// Returns the pointer to the underlying pthread lock.
+    ///
+    /// # Panics
+    ///
+    /// Panics if built from [`lock_api::RawRwLock::INIT`] instead of [`PthreadRwLock::new`].
+    fn inner(&self) -> NonNull<libc::pthread_rwlock_t> {
+        self.0.expect("PthreadRwLock constructed badly - something likely used RawRwLock::INIT")
+    }
+}
+
+unsafe impl lock_api::RawRwLock for PthreadRwLock {
+    type GuardMarker = lock_api::GuardSend;
+    const INIT: Self = Self(None);
+
+    fn lock_shared(&self) {
+        unsafe {
+            let res = libc::pthread_rwlock_rdlock(self.inner().as_ptr());
+            if res != 0 {
+                panic!("rdlock failed with {}", Errno::from_raw(res));
+            }
+        }
+    }
+
+    fn try_lock_shared(&self) -> bool {
+        // Non-blocking attempt: returns `false` instead of waiting when the lock is unavailable.
+        let res = unsafe { libc::pthread_rwlock_tryrdlock(self.inner().as_ptr()) };
+        match res {
+            0 => true,
+            // EBUSY: held by / queued for a writer. EAGAIN: too many readers.
+            libc::EBUSY | libc::EAGAIN => false,
+            _ => panic!("try_rdlock failed with {}", Errno::from_raw(res)),
+        }
+    }
+
+    fn lock_exclusive(&self) {
+        unsafe {
+            let res = libc::pthread_rwlock_wrlock(self.inner().as_ptr());
+            if res != 0 {
+                panic!("wrlock failed with {}", Errno::from_raw(res));
+            }
+        }
+    }
+
+    fn try_lock_exclusive(&self) -> bool {
+        // Non-blocking attempt: returns `false` instead of waiting when the lock is unavailable.
+        let res = unsafe { libc::pthread_rwlock_trywrlock(self.inner().as_ptr()) };
+        match res {
+            0 => true,
+            // EBUSY: already held for reading or writing. EAGAIN kept for compatibility.
+            libc::EBUSY | libc::EAGAIN => false,
+            _ => panic!("try_wrlock failed with {}", Errno::from_raw(res)),
+        }
+    }
+
+    unsafe fn unlock_exclusive(&self) {
+        unsafe {
+            let res = libc::pthread_rwlock_unlock(self.inner().as_ptr());
+            if res != 0 {
+                panic!("unlock failed with {}", Errno::from_raw(res));
+            }
+        }
+    }
+    unsafe fn unlock_shared(&self) {
+        unsafe {
+            let res = libc::pthread_rwlock_unlock(self.inner().as_ptr());
+            if res != 0 {
+                panic!("unlock failed with {}", Errno::from_raw(res));
+            }
+        }
+    }
+}
--- a/libs/neonart/Cargo.toml
+++ b/libs/neonart/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "neonart"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+crossbeam-utils.workspace = true
+spin.workspace = true
+tracing.workspace = true
+
+[dev-dependencies]
+rand = "0.9.1"
+rand_distr = "0.5.1"
--- a/libs/neonart/src/algorithm.rs
+++ b/libs/neonart/src/algorithm.rs
@@ -0,0 +1,599 @@
+mod lock_and_version;
+pub(crate) mod node_ptr;
+mod node_ref;
+
+use std::vec::Vec;
+
+use crate::algorithm::lock_and_version::ConcurrentUpdateError;
+use crate::algorithm::node_ptr::MAX_PREFIX_LEN;
+use crate::algorithm::node_ref::{NewNodeRef, NodeRef, ReadLockedNodeRef, WriteLockedNodeRef};
+use crate::allocator::OutOfMemoryError;
+
+use crate::TreeWriteGuard;
+use crate::UpdateAction;
+use crate::allocator::ArtAllocator;
+use crate::epoch::EpochPin;
+use crate::{Key, Value};
+
+pub(crate) type RootPtr<V> = node_ptr::NodePtr<V>;
+
+#[derive(Debug)]
+pub enum ArtError {
+    ConcurrentUpdate, // need to retry
+    OutOfMemory,
+}
+
+impl From<ConcurrentUpdateError> for ArtError {
+    fn from(_: ConcurrentUpdateError) -> ArtError {
+        ArtError::ConcurrentUpdate
+    }
+}
+
+impl From<OutOfMemoryError> for ArtError {
+    fn from(_: OutOfMemoryError) -> ArtError {
+        ArtError::OutOfMemory
+    }
+}
+
+pub fn new_root<V: Value>(
+    allocator: &impl ArtAllocator<V>,
+) -> Result<RootPtr<V>, OutOfMemoryError> {
+    node_ptr::new_root(allocator)
+}
+
+pub(crate) fn search<'e, K: Key, V: Value>(
+    key: &K,
+    root: RootPtr<V>,
+    epoch_pin: &'e EpochPin,
+) -> Option<&'e V> {
+    loop {
+        let root_ref = NodeRef::from_root_ptr(root);
+        if let Ok(result) = lookup_recurse(key.as_bytes(), root_ref, None, epoch_pin) {
+            break result;
+        }
+        // retry
+    }
+}
+
+pub(crate) fn iter_next<'e, V: Value>(
+    key: &[u8],
+    root: RootPtr<V>,
+    epoch_pin: &'e EpochPin,
+) -> Option<(Vec<u8>, &'e V)> {
+    loop {
+        let mut path = Vec::new();
+        let root_ref = NodeRef::from_root_ptr(root);
+
+        match next_recurse(key, &mut path, root_ref, epoch_pin) {
+            Ok(Some(v)) => {
+                assert_eq!(path.len(), key.len());
+                break Some((path, v));
+            }
+            Ok(None) => break None,
+            Err(ConcurrentUpdateError()) => {
+                // retry
+                continue;
+            }
+        }
+    }
+}
+
+pub(crate) fn update_fn<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>, F>(
+    key: &K,
+    value_fn: F,
+    root: RootPtr<V>,
+    guard: &'g mut TreeWriteGuard<'e, K, V, A>,
+) -> Result<(), OutOfMemoryError>
+where
+    F: FnOnce(Option<&V>) -> UpdateAction<V>,
+{
+    let value_fn_cell = std::cell::Cell::new(Some(value_fn));
+    loop {
+        let root_ref = NodeRef::from_root_ptr(root);
+        let this_value_fn = |arg: Option<&V>| value_fn_cell.take().unwrap()(arg);
+        let key_bytes = key.as_bytes();
+
+        match update_recurse(
+            key_bytes,
+            this_value_fn,
+            root_ref,
+            None,
+            None,
+            guard,
+            0,
+            key_bytes,
+        ) {
+            Ok(()) => break Ok(()),
+            Err(ArtError::ConcurrentUpdate) => {
+                continue; // retry
+            }
+            Err(ArtError::OutOfMemory) => break Err(OutOfMemoryError()),
+        }
+    }
+}
+
+// `Err` means a concurrent update was detected and the caller must retry from the root.
+//
+// This corresponds to the 'lookupOpt' function in the paper
+#[allow(clippy::only_used_in_recursion)]
+fn lookup_recurse<'e, V: Value>(
+    key: &[u8],
+    node: NodeRef<'e, V>,
+    parent: Option<ReadLockedNodeRef<V>>,
+    epoch_pin: &'e EpochPin,
+) -> Result<Option<&'e V>, ConcurrentUpdateError> {
+    let rnode = node.read_lock_or_restart()?;
+    if let Some(parent) = parent {
+        parent.read_unlock_or_restart()?;
+    }
+
+    // check if the prefix matches; on a match, `prefix_len` bytes of the key are consumed below
+    let prefix_len = if let Some(prefix_len) = rnode.prefix_matches(key) {
+        prefix_len
+    } else {
+        rnode.read_unlock_or_restart()?;
+        return Ok(None);
+    };
+
+    if rnode.is_leaf() {
+        assert_eq!(key.len(), prefix_len);
+        let vptr = rnode.get_leaf_value_ptr()?;
+        // safety: It's OK to return a ref of the pointer because we checked the version
+        // and the lifetime of 'epoch_pin' enforces that the reference is only accessible
+        // as long as the epoch is pinned.
+        let v = unsafe { vptr.as_ref().unwrap() };
+        return Ok(Some(v));
+    }
+
+    let key = &key[prefix_len..];
+
+    // find child (or leaf value)
+    let next_node = rnode.find_child_or_restart(key[0])?;
+
+    match next_node {
+        None => Ok(None), // key not found
+        Some(child) => lookup_recurse(&key[1..], child, Some(rnode), epoch_pin),
+    }
+}
+
+#[allow(clippy::only_used_in_recursion)]
+fn next_recurse<'e, V: Value>(
+    min_key: &[u8],
+    path: &mut Vec<u8>,
+    node: NodeRef<'e, V>,
+    epoch_pin: &'e EpochPin,
+) -> Result<Option<&'e V>, ConcurrentUpdateError> {
+    let rnode = node.read_lock_or_restart()?;
+    let prefix = rnode.get_prefix();
+    if !prefix.is_empty() {
+        path.extend_from_slice(prefix);
+    }
+
+    use std::cmp::Ordering;
+    let comparison = path.as_slice().cmp(&min_key[0..path.len()]);
+    if comparison == Ordering::Less {
+        rnode.read_unlock_or_restart()?;
+        return Ok(None);
+    }
+
+    if rnode.is_leaf() {
+        assert_eq!(path.len(), min_key.len());
+        let vptr = rnode.get_leaf_value_ptr()?;
+        // safety: It's OK to return a ref of the pointer because we checked the version
+        // and the lifetime of 'epoch_pin' enforces that the reference is only accessible
+        // as long as the epoch is pinned.
+        let v = unsafe { vptr.as_ref().unwrap() };
+        return Ok(Some(v));
+    }
+
+    let mut min_key_byte = match comparison {
+        Ordering::Less => unreachable!(), // checked this above already
+        Ordering::Equal => min_key[path.len()],
+        Ordering::Greater => 0,
+    };
+
+    loop {
+        match rnode.find_next_child_or_restart(min_key_byte)? {
+            None => {
+                return Ok(None);
+            }
+            Some((key_byte, child_ref)) => {
+                let path_len = path.len();
+                path.push(key_byte);
+                let result = next_recurse(min_key, path, child_ref, epoch_pin)?;
+                if result.is_some() {
+                    return Ok(result);
+                }
+                if key_byte == u8::MAX {
+                    return Ok(None);
+                }
+                path.truncate(path_len);
+                min_key_byte = key_byte + 1;
+            }
+        }
+    }
+}
+
+// This corresponds to the 'insertOpt' function in the paper
+#[allow(clippy::only_used_in_recursion)]
+#[allow(clippy::too_many_arguments)]
+pub(crate) fn update_recurse<'e, K: Key, V: Value, A: ArtAllocator<V>, F>(
+    key: &[u8],
+    value_fn: F,
+    node: NodeRef<'e, V>,
+    rparent: Option<(ReadLockedNodeRef<V>, u8)>,
+    rgrandparent: Option<(ReadLockedNodeRef<V>, u8)>,
+    guard: &'_ mut TreeWriteGuard<'e, K, V, A>,
+    level: usize,
+    orig_key: &[u8],
+) -> Result<(), ArtError>
+where
+    F: FnOnce(Option<&V>) -> UpdateAction<V>,
+{
+    let rnode = node.read_lock_or_restart()?;
+
+    let prefix_match_len = rnode.prefix_matches(key);
+    if prefix_match_len.is_none() {
+        let (rparent, parent_key) = rparent.expect("direct children of the root have no prefix");
+        let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
+        let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
+
+        match value_fn(None) {
+            UpdateAction::Nothing => {}
+            UpdateAction::Insert(new_value) => {
+                insert_split_prefix(key, new_value, &mut wnode, &mut wparent, parent_key, guard)?;
+            }
+            UpdateAction::Remove => {
+                panic!("unexpected Remove action on insertion");
+            }
+        }
+        wnode.write_unlock();
+        wparent.write_unlock();
+        return Ok(());
+    }
+    let prefix_match_len = prefix_match_len.unwrap();
+    let key = &key[prefix_match_len..];
+    let level = level + prefix_match_len;
+
+    if rnode.is_leaf() {
+        assert_eq!(key.len(), 0);
+        let (rparent, parent_key) = rparent.expect("root cannot be leaf");
+        let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
+        let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
+
+        // safety: Now that we have acquired the write lock, we have exclusive access to the
+        // value. XXX: There might be concurrent reads though?
+        let value_mut = wnode.get_leaf_value_mut();
+
+        match value_fn(Some(value_mut)) {
+            UpdateAction::Nothing => {
+                wparent.write_unlock();
+                wnode.write_unlock();
+            }
+            UpdateAction::Insert(_) => panic!("cannot insert over existing value"),
+            UpdateAction::Remove => {
+                guard.remember_obsolete_node(wnode.as_ptr());
+                wparent.delete_child(parent_key);
+                wnode.write_unlock_obsolete();
+
+                if let Some(rgrandparent) = rgrandparent {
+                    // FIXME: Ignore concurrency error. It doesn't lead to
+                    // corruption, but it means we might leak something. Until
+                    // another update cleans it up.
+                    let _ = cleanup_parent(wparent, rgrandparent, guard);
+                }
+            }
+        }
+
+        return Ok(());
+    }
+
+    let next_node = rnode.find_child_or_restart(key[0])?;
+
+    if next_node.is_none() {
+        if rnode.is_full() {
+            let (rparent, parent_key) = rparent.expect("root node cannot become full");
+            let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
+            let wnode = rnode.upgrade_to_write_lock_or_restart()?;
+
+            match value_fn(None) {
+                UpdateAction::Nothing => {
+                    wnode.write_unlock();
+                    wparent.write_unlock();
+                }
+                UpdateAction::Insert(new_value) => {
+                    insert_and_grow(key, new_value, wnode, &mut wparent, parent_key, guard)?;
+                    wparent.write_unlock();
+                }
+                UpdateAction::Remove => {
+                    panic!("unexpected Remove action on insertion");
+                }
+            };
+        } else {
+            let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
+            if let Some((rparent, _)) = rparent {
+                rparent.read_unlock_or_restart()?;
+            }
+            match value_fn(None) {
+                UpdateAction::Nothing => {}
+                UpdateAction::Insert(new_value) => {
+                    insert_to_node(&mut wnode, key, new_value, guard)?;
+                }
+                UpdateAction::Remove => {
+                    panic!("unexpected Remove action on insertion");
+                }
+            };
+            wnode.write_unlock();
+        }
+        Ok(())
+    } else {
+        let next_child = next_node.unwrap(); // checked above it's not None
+        if let Some((ref rparent, _)) = rparent {
+            rparent.check_or_restart()?;
+        }
+
+        // recurse to next level
+        update_recurse(
+            &key[1..],
+            value_fn,
+            next_child,
+            Some((rnode, key[0])),
+            rparent,
+            guard,
+            level + 1,
+            orig_key,
+        )
+    }
+}
+
+#[derive(Clone)]
+enum PathElement {
+    Prefix(Vec<u8>),
+    KeyByte(u8),
+}
+
+impl std::fmt::Debug for PathElement {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        match self {
+            PathElement::Prefix(prefix) => write!(fmt, "{prefix:?}"),
+            PathElement::KeyByte(key_byte) => write!(fmt, "{key_byte}"),
+        }
+    }
+}
+
+pub(crate) fn dump_tree<V: Value + std::fmt::Debug>(
+    root: RootPtr<V>,
+    epoch_pin: &'_ EpochPin,
+    dst: &mut dyn std::io::Write,
+) {
+    let root_ref = NodeRef::from_root_ptr(root);
+
+    let _ = dump_recurse(&[], root_ref, epoch_pin, 0, dst);
+}
+
+/// Recursively prints the subtree rooted at `node` to `dst`, one line per leaf and one
+/// line per child pointer, indented by `level` spaces.
+///
+/// `path` is the list of prefixes and key bytes leading to `node`. The walk uses
+/// optimistic read locks only, so it returns `Err(ConcurrentUpdateError)` as soon as any
+/// visited node turns out to be obsolete or modified; output written so far stays in `dst`.
+///
+/// Panics if writing to `dst` fails (see the TODO below).
+// TODO: return an Err if writeln!() returns error, instead of unwrapping
+#[allow(clippy::only_used_in_recursion)]
+fn dump_recurse<'e, V: Value + std::fmt::Debug>(
+    path: &[PathElement],
+    node: NodeRef<'e, V>,
+    epoch_pin: &'e EpochPin,
+    level: usize,
+    dst: &mut dyn std::io::Write,
+) -> Result<(), ConcurrentUpdateError> {
+    let indent = str::repeat(" ", level);
+
+    let rnode = node.read_lock_or_restart()?;
+    // Work on an owned copy of the path so this node's prefix can be appended to it.
+    let mut path = Vec::from(path);
+    let prefix = rnode.get_prefix();
+    if !prefix.is_empty() {
+        path.push(PathElement::Prefix(Vec::from(prefix)));
+    }
+
+    if rnode.is_leaf() {
+        let vptr = rnode.get_leaf_value_ptr()?;
+        // safety: It's OK to return a ref of the pointer because we checked the version
+        // and the lifetime of 'epoch_pin' enforces that the reference is only accessible
+        // as long as the epoch is pinned.
+        let val = unsafe { vptr.as_ref().unwrap() };
+        writeln!(dst, "{indent} {path:?}: {val:?}").unwrap();
+        return Ok(());
+    }
+
+    // Internal node: probe every possible key byte and descend into each child found.
+    for key_byte in 0..=u8::MAX {
+        match rnode.find_child_or_restart(key_byte)? {
+            None => continue,
+            Some(child_ref) => {
+                // The child is read-locked here only to print its prefix on the parent's line.
+                let rchild = child_ref.read_lock_or_restart()?;
+                writeln!(
+                    dst,
+                    "{} {:?}, {}: prefix {:?}",
+                    indent,
+                    &path,
+                    key_byte,
+                    rchild.get_prefix()
+                )
+                .unwrap();
+
+                let mut child_path = path.clone();
+                child_path.push(PathElement::KeyByte(key_byte));
+
+                dump_recurse(&child_path, child_ref, epoch_pin, level + 1, dst)?;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Inserts `value` at a node whose prefix only partially matches `key`, by splitting the
+/// prefix: a new internal node takes over the common part, and the old node and a new
+/// value node become its two children.
+///
+///```text
+///        [fooba]r -> value
+///
+/// [foo]b -> [a]r  -> value
+///      e -> [ls]e -> value
+///```
+///
+/// `node` is the existing node and `parent` its parent; both are passed write-locked.
+/// `parent_key` is the key byte under which `node` hangs in `parent`. `key` is compared
+/// against the node's prefix from its first byte.
+///
+/// Returns `OutOfMemoryError` if either new node cannot be allocated; the tree has not
+/// been modified at that point.
+fn insert_split_prefix<K: Key, V: Value, A: ArtAllocator<V>>(
+    key: &[u8],
+    value: V,
+    node: &mut WriteLockedNodeRef<V>,
+    parent: &mut WriteLockedNodeRef<V>,
+    parent_key: u8,
+    guard: &'_ TreeWriteGuard<K, V, A>,
+) -> Result<(), OutOfMemoryError> {
+    let old_node = node;
+    let old_prefix = old_node.get_prefix();
+    let common_prefix_len = common_prefix(key, old_prefix);
+
+    // Allocate a node for the new value.
+    // The byte at 'common_prefix_len' becomes the child key byte, so the value node only
+    // stores what follows it.
+    let new_value_node = allocate_node_for_value(
+        &key[common_prefix_len + 1..],
+        value,
+        guard.tree_writer.allocator,
+    )?;
+
+    // Allocate a new internal node with the common prefix
+    // FIXME: deallocate 'new_value_node' on OOM
+    // NOTE(review): NewNodeRef's Drop impl appears to free the node when `?` returns
+    // early, so this FIXME may be stale -- confirm.
+    let mut prefix_node =
+        node_ref::new_internal(&key[..common_prefix_len], guard.tree_writer.allocator)?;
+
+    // Add the old node and the new nodes to the new internal node
+    prefix_node.insert_old_child(old_prefix[common_prefix_len], old_node);
+    prefix_node.insert_new_child(key[common_prefix_len], new_value_node);
+
+    // Modify the prefix of the old child in place
+    // (the common part and the one key byte now live in 'prefix_node'; presumably the
+    // trailing bytes are the ones kept, as in the diagram above)
+    old_node.truncate_prefix(old_prefix.len() - common_prefix_len - 1);
+
+    // replace the pointer in the parent
+    parent.replace_child(parent_key, prefix_node.into_ptr());
+
+    Ok(())
+}
+
+/// Adds `value` below `wnode`: `key[0]` becomes the child's key byte and the remaining
+/// bytes of `key` are stored in a freshly allocated subtree holding the value.
+///
+/// `wnode` is passed write-locked. Fails with `OutOfMemoryError` if the new subtree
+/// cannot be allocated; `wnode` is not touched in that case.
+fn insert_to_node<K: Key, V: Value, A: ArtAllocator<V>>(
+    wnode: &mut WriteLockedNodeRef<V>,
+    key: &[u8],
+    value: V,
+    guard: &'_ TreeWriteGuard<K, V, A>,
+) -> Result<(), OutOfMemoryError> {
+    let allocator = guard.tree_writer.allocator;
+    let rest_of_key = &key[1..];
+    let new_subtree = allocate_node_for_value(rest_of_key, value, allocator)?;
+
+    let key_byte = key[0];
+    wnode.insert_child(key_byte, new_subtree.into_ptr());
+    Ok(())
+}
+
+/// Inserts `value` into a node by replacing the node with a grown copy (via `grow()`)
+/// that holds the old children plus the new one.
+///
+/// `key[0]` is the key byte of the new child and `key[1..]` is stored in the new value
+/// subtree. `parent_key_byte` is the key byte under which `wnode` hangs in `parent`.
+/// On success the old node is handed to the guard as obsolete and unlocked as obsolete.
+// On entry: 'parent' and 'node' are locked
+fn insert_and_grow<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
+    key: &[u8],
+    value: V,
+    wnode: WriteLockedNodeRef<V>,
+    parent: &mut WriteLockedNodeRef<V>,
+    parent_key_byte: u8,
+    guard: &'g mut TreeWriteGuard<'e, K, V, A>,
+) -> Result<(), ArtError> {
+    let mut bigger_node = wnode.grow(guard.tree_writer.allocator)?;
+
+    // FIXME: deallocate 'bigger_node' on OOM
+    // NOTE(review): NewNodeRef's Drop impl appears to free 'bigger_node' when `?` returns
+    // early, so this FIXME may be stale -- confirm.
+    let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
+    bigger_node.insert_new_child(key[0], value_child);
+
+    // Replace the pointer in the parent
+    parent.replace_child(parent_key_byte, bigger_node.into_ptr());
+
+    // The old node is unreachable from the parent now: record it with the guard and
+    // release its lock with the obsolete flag set.
+    guard.remember_obsolete_node(wnode.as_ptr());
+    wnode.write_unlock_obsolete();
+
+    Ok(())
+}
+
+/// Tidies up `wparent` after a child has been removed from it.
+///
+/// `wparent` is passed write-locked. `rgrandparent` is the read-locked grandparent together
+/// with the key byte under which the parent hangs in it. Three outcomes are possible:
+/// the parent is replaced by its single remaining child, the parent is replaced by a
+/// smaller node, or the parent is just unlocked.
+///
+/// Returns an error if a lock cannot be taken or upgraded, or if allocating the smaller
+/// node fails. Locks held in `WriteLockedNodeRef`s are released when those are dropped.
+fn cleanup_parent<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
+    wparent: WriteLockedNodeRef<V>,
+    rgrandparent: (ReadLockedNodeRef<V>, u8),
+    guard: &'g mut TreeWriteGuard<'e, K, V, A>,
+) -> Result<(), ArtError> {
+    let (rgrandparent, grandparent_key_byte) = rgrandparent;
+
+    // If the parent becomes completely empty after the deletion, remove the parent from the
+    // grandparent. (This case is possible because we reserve only 8 bytes for the prefix.)
+    // TODO: not implemented.
+
+    // If the parent has only one child, replace the parent with the remaining child. (This is not
+    // possible if the child's prefix field cannot absorb the parent's)
+    if wparent.num_children() == 1 {
+        // Try to lock the remaining child. This can fail if the child is updated
+        // concurrently.
+        let (key_byte, remaining_child) = wparent.find_remaining_child();
+
+        let mut wremaining_child = remaining_child.write_lock_or_restart()?;
+
+        // The merged prefix is: parent's prefix + the child's key byte + child's prefix.
+        if 1 + wremaining_child.get_prefix().len() + wparent.get_prefix().len() <= MAX_PREFIX_LEN {
+            let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
+
+            // Ok, we have locked the leaf, the parent, the grandparent, and the parent's only
+            // remaining leaf. Proceed with the updates.
+
+            // Update the prefix on the remaining leaf
+            wremaining_child.prepend_prefix(wparent.get_prefix(), key_byte);
+
+            // Replace the pointer in the grandparent to point directly to the remaining leaf
+            wgrandparent.replace_child(grandparent_key_byte, wremaining_child.as_ptr());
+
+            // Mark the parent as deleted.
+            guard.remember_obsolete_node(wparent.as_ptr());
+            wparent.write_unlock_obsolete();
+            return Ok(());
+        }
+        // Prefix does not fit: fall through to the shrink check below.
+    }
+
+    // If the parent's children would fit on a smaller node type after the deletion, replace it with
+    // a smaller node.
+    if wparent.can_shrink() {
+        let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
+        let smaller_node = wparent.shrink(guard.tree_writer.allocator)?;
+
+        // Replace the pointer in the grandparent
+        wgrandparent.replace_child(grandparent_key_byte, smaller_node.into_ptr());
+
+        // The old parent is no longer referenced by the grandparent.
+        guard.remember_obsolete_node(wparent.as_ptr());
+        wparent.write_unlock_obsolete();
+        return Ok(());
+    }
+
+    // nothing to do
+    wparent.write_unlock();
+    Ok(())
+}
+
+// Build the subtree that stores 'value' under the (remaining) 'key'.
+//
+// The leaf keeps the last MAX_PREFIX_LEN bytes of the key as its prefix. If the key is
+// longer than that, internal nodes are stacked on top of the leaf, bottom-up: each one
+// uses one key byte to point at the subtree below it and keeps up to MAX_PREFIX_LEN of
+// the bytes before that as its own prefix.
+//
+// Returns the topmost new node, or OutOfMemoryError if an allocation fails.
+fn allocate_node_for_value<'a, V: Value, A: ArtAllocator<V>>(
+    key: &[u8],
+    value: V,
+    allocator: &'a A,
+) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError> {
+    let leaf_start = key.len().saturating_sub(MAX_PREFIX_LEN);
+    let mut subtree = node_ref::new_leaf(&key[leaf_start..], value, allocator)?;
+
+    // Key bytes in front of the leaf's prefix that no node covers yet.
+    let mut unconsumed = &key[..leaf_start];
+    while let Some((&child_key_byte, before)) = unconsumed.split_last() {
+        // Need another internal node
+        let prefix_start = before.len().saturating_sub(MAX_PREFIX_LEN);
+        let mut parent = node_ref::new_internal(&before[prefix_start..], allocator)?;
+        parent.insert_new_child(child_key_byte, subtree);
+
+        subtree = parent;
+        unconsumed = &before[..prefix_start];
+    }
+
+    Ok(subtree)
+}
+
+/// Returns the index of the first byte at which `a` and `b` differ, looking at no more
+/// than the first `MAX_PREFIX_LEN` bytes.
+///
+/// Only the part both slices have in common is compared, so slices shorter than
+/// `MAX_PREFIX_LEN` never cause an out-of-bounds access.
+///
+/// Panics with "prefixes are equal" if no differing byte is found in the compared range.
+fn common_prefix(a: &[u8], b: &[u8]) -> usize {
+    a.iter()
+        .zip(b)
+        .take(MAX_PREFIX_LEN)
+        .position(|(byte_a, byte_b)| byte_a != byte_b)
+        .unwrap_or_else(|| panic!("prefixes are equal"))
+}
--- a/libs/neonart/src/algorithm/lock_and_version.rs
+++ b/libs/neonart/src/algorithm/lock_and_version.rs
@@ -0,0 +1,117 @@
+//! Each node in the tree contains one atomic word that stores three things:
+//!
+//! Bit 0: set if the node is "obsolete". An obsolete node has been removed from the tree,
+//!        but might still be accessed by concurrent readers until the epoch expires.
+//! Bit 1: set if the node is currently write-locked. Used as a spinlock.
+//! Bits 2-63: Version number, incremented every time the node is modified.
+//!
+//! AtomicLockAndVersion represents that.
+
+use std::sync::atomic::{AtomicU64, Ordering};
+
+/// Returned by the `*_or_restart` operations when the node is obsolete, is locked by
+/// someone else, or has changed since its version was read. Carries no further details.
+pub(crate) struct ConcurrentUpdateError();
+
+/// The per-node atomic word: obsolete flag, write-lock flag and version counter packed
+/// into a single `u64`.
+pub(crate) struct AtomicLockAndVersion {
+    inner: AtomicU64,
+}
+
+impl AtomicLockAndVersion {
+    /// Creates a word that is unlocked, not obsolete, and at version zero.
+    pub(crate) fn new() -> Self {
+        let inner = AtomicU64::new(0);
+        Self { inner }
+    }
+}
+
+/// Optimistic-locking operations on the lock-and-version word.
+impl AtomicLockAndVersion {
+    /// Waits until the node is not write-locked and returns the word as read at that point.
+    ///
+    /// Fails if the node is marked obsolete. Nothing is written to the word; the returned
+    /// value is meant to be passed to the validation/upgrade functions later.
+    pub(crate) fn read_lock_or_restart(&self) -> Result<u64, ConcurrentUpdateError> {
+        let version = self.await_node_unlocked();
+        if is_obsolete(version) {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(version)
+    }
+
+    /// Same check as `read_unlock_or_restart`: fails if the word no longer equals `version`.
+    pub(crate) fn check_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
+        self.read_unlock_or_restart(version)
+    }
+
+    /// Validates an optimistic read: fails if the word differs from `version`, i.e. the
+    /// node has been locked, modified or made obsolete in the meantime.
+    pub(crate) fn read_unlock_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
+        // NOTE(review): an Acquire load only orders accesses that come after it. Confirm
+        // whether an acquire fence is needed before this load so that the preceding
+        // optimistic reads of the node cannot be reordered past the check.
+        if self.inner.load(Ordering::Acquire) != version {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(())
+    }
+
+    /// Takes the write lock, but only if the word still equals `version`.
+    ///
+    /// `version` is expected to come from `read_lock_or_restart`, so its lock bit is clear.
+    pub(crate) fn upgrade_to_write_lock_or_restart(
+        &self,
+        version: u64,
+    ) -> Result<(), ConcurrentUpdateError> {
+        if self
+            .inner
+            .compare_exchange(
+                version,
+                set_locked_bit(version),
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            )
+            .is_err()
+        {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(())
+    }
+
+    /// Tries once to take the write lock. Does not wait: fails immediately if the node is
+    /// obsolete, already locked, or changes between the load and the compare-exchange.
+    pub(crate) fn write_lock_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
+        let old = self.inner.load(Ordering::Relaxed);
+        if is_obsolete(old) || is_locked(old) {
+            return Err(ConcurrentUpdateError());
+        }
+        if self
+            .inner
+            .compare_exchange(
+                old,
+                set_locked_bit(old),
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            )
+            .is_err()
+        {
+            return Err(ConcurrentUpdateError());
+        }
+        Ok(())
+    }
+
+    /// Releases the write lock and bumps the version. Must only be called while locked.
+    pub(crate) fn write_unlock(&self) {
+        // reset locked bit and overflow into version
+        // (the lock bit is bit 1, so adding 2 clears it and carries into bit 2)
+        self.inner.fetch_add(2, Ordering::Release);
+    }
+
+    /// Releases the write lock, bumps the version and marks the node obsolete in one step.
+    /// Must only be called while locked and not yet obsolete.
+    pub(crate) fn write_unlock_obsolete(&self) {
+        // set obsolete, reset locked, overflow into version
+        self.inner.fetch_add(3, Ordering::Release);
+    }
+
+    // Helper functions
+
+    /// Spins, yielding the thread between polls, until the lock bit is clear; returns the
+    /// word that was observed unlocked.
+    fn await_node_unlocked(&self) -> u64 {
+        let mut version = self.inner.load(Ordering::Acquire);
+        while is_locked(version) {
+            // spinlock
+            std::thread::yield_now();
+            version = self.inner.load(Ordering::Acquire)
+        }
+        version
+    }
+}
+
+/// Returns `version` with the write-lock bit (bit 1) switched on.
+///
+/// This is an addition, not a bitwise OR: if the bit were already set, the addition
+/// would carry into the version bits instead.
+fn set_locked_bit(version: u64) -> u64 {
+    const LOCKED_BIT: u64 = 0b10;
+    version + LOCKED_BIT
+}
+
+/// Tells whether the obsolete flag (bit 0) is set in `version`.
+fn is_obsolete(version: u64) -> bool {
+    const OBSOLETE_BIT: u64 = 0b01;
+    version & OBSOLETE_BIT != 0
+}
+
+/// Tells whether the write-lock flag (bit 1) is set in `version`.
+fn is_locked(version: u64) -> bool {
+    const LOCKED_BIT: u64 = 0b10;
+    version & LOCKED_BIT != 0
+}
--- a/libs/neonart/src/algorithm/node_ptr.rs
+++ b/libs/neonart/src/algorithm/node_ptr.rs
--- a/libs/neonart/src/algorithm/node_ref.rs
+++ b/libs/neonart/src/algorithm/node_ref.rs
@@ -0,0 +1,349 @@
+use std::fmt::Debug;
+use std::marker::PhantomData;
+
+use super::node_ptr;
+use super::node_ptr::NodePtr;
+use crate::EpochPin;
+use crate::Value;
+use crate::algorithm::lock_and_version::AtomicLockAndVersion;
+use crate::algorithm::lock_and_version::ConcurrentUpdateError;
+use crate::allocator::ArtAllocator;
+use crate::allocator::OutOfMemoryError;
+
+/// An unlocked reference to a tree node. It must be read- or write-locked before the
+/// node's contents can be accessed through this module.
+pub struct NodeRef<'e, V> {
+    ptr: NodePtr<V>,
+
+    // Ties the reference to the lifetime 'e of an epoch pin.
+    phantom: PhantomData<&'e EpochPin<'e>>,
+}
+
+/// Debug output is just the Debug output of the underlying node pointer.
+impl<'e, V> Debug for NodeRef<'e, V> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_fmt(format_args!("{:?}", self.ptr))
+    }
+}
+
+impl<'e, V: Value> NodeRef<'e, V> {
+    /// Wraps the tree's root pointer in a `NodeRef`.
+    pub(crate) fn from_root_ptr(root_ptr: NodePtr<V>) -> NodeRef<'e, V> {
+        let ptr = root_ptr;
+        NodeRef {
+            ptr,
+            phantom: PhantomData,
+        }
+    }
+
+    /// Takes an optimistic read lock: remembers the node's current version so that later
+    /// reads can be validated against it. Fails if the node is obsolete.
+    pub(crate) fn read_lock_or_restart(
+        &self,
+    ) -> Result<ReadLockedNodeRef<'e, V>, ConcurrentUpdateError> {
+        self.lockword()
+            .read_lock_or_restart()
+            .map(|version| ReadLockedNodeRef {
+                ptr: self.ptr,
+                version,
+                phantom: self.phantom,
+            })
+    }
+
+    /// Tries to take the write lock. Fails if the node is obsolete or cannot be locked
+    /// right now.
+    pub(crate) fn write_lock_or_restart(
+        &self,
+    ) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
+        self.lockword()
+            .write_lock_or_restart()
+            .map(|()| WriteLockedNodeRef {
+                ptr: self.ptr,
+                phantom: self.phantom,
+            })
+    }
+
+    /// The node's lock-and-version word.
+    fn lockword(&self) -> &AtomicLockAndVersion {
+        self.ptr.lockword()
+    }
+}
+
+/// A reference to a node that has been optimistically read-locked. The `*_or_restart`
+/// functions re-check the version after each read; the plain accessors do not, so their
+/// results must be validated by the caller.
+pub struct ReadLockedNodeRef<'e, V> {
+    ptr: NodePtr<V>,
+    // The lock word's value observed when the read lock was taken.
+    version: u64,
+
+    // Ties the reference to the lifetime 'e of an epoch pin.
+    phantom: PhantomData<&'e EpochPin<'e>>,
+}
+
+impl<'e, V: Value> ReadLockedNodeRef<'e, V> {
+    /// Whether the node is a leaf. Not validated against the version.
+    pub(crate) fn is_leaf(&self) -> bool {
+        self.ptr.is_leaf()
+    }
+
+    /// Whether the node is full. Not validated against the version.
+    pub(crate) fn is_full(&self) -> bool {
+        self.ptr.is_full()
+    }
+
+    /// The node's prefix bytes. Not validated against the version.
+    pub(crate) fn get_prefix(&self) -> &[u8] {
+        self.ptr.get_prefix()
+    }
+
+    /// Checks `key` against the node's prefix.
+    ///
+    /// Only a read lock is held, so the prefix may be changing underneath us: the answer
+    /// is trustworthy only if a later version check succeeds, and the caller must restart
+    /// otherwise.
+    ///
+    /// Returns the length of the prefix, or None if it's not a match
+    pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
+        self.ptr.prefix_matches(key)
+    }
+
+    /// Looks up the child stored under `key_byte`, then validates the version.
+    ///
+    /// `Ok(None)` means the node has no such child.
+    pub(crate) fn find_child_or_restart(
+        &self,
+        key_byte: u8,
+    ) -> Result<Option<NodeRef<'e, V>>, ConcurrentUpdateError> {
+        // Read first, validate afterwards: the result is only used if the check passes.
+        let found = self.ptr.find_child(key_byte);
+        self.check_or_restart()?;
+
+        let phantom = self.phantom;
+        Ok(found.map(|ptr| NodeRef { ptr, phantom }))
+    }
+
+    /// Looks up the next child starting from `min_key_byte`, then validates the version.
+    ///
+    /// On success returns the child's key byte together with the child, or `None` if
+    /// there is no such child.
+    pub(crate) fn find_next_child_or_restart(
+        &self,
+        min_key_byte: u8,
+    ) -> Result<Option<(u8, NodeRef<'e, V>)>, ConcurrentUpdateError> {
+        let found = self.ptr.find_next_child(min_key_byte);
+        self.check_or_restart()?;
+
+        let phantom = self.phantom;
+        Ok(found.map(|(k, ptr)| (k, NodeRef { ptr, phantom })))
+    }
+
+    /// Returns a raw pointer to the leaf's value, after validating the version.
+    pub(crate) fn get_leaf_value_ptr(&self) -> Result<*const V, ConcurrentUpdateError> {
+        let value_ref = self.ptr.get_leaf_value();
+        self.check_or_restart()?;
+
+        // Hand it out as a raw pointer, which is not bound to the borrow of `self`.
+        Ok(std::ptr::from_ref(value_ref))
+    }
+
+    /// Converts the read lock into a write lock. Fails if the node has changed since the
+    /// read lock was taken.
+    pub(crate) fn upgrade_to_write_lock_or_restart(
+        self,
+    ) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
+        let Self {
+            ptr,
+            version,
+            phantom,
+        } = self;
+        ptr.lockword().upgrade_to_write_lock_or_restart(version)?;
+
+        Ok(WriteLockedNodeRef { ptr, phantom })
+    }
+
+    /// Gives up the read lock, reporting whether the node stayed unchanged while it was held.
+    pub(crate) fn read_unlock_or_restart(self) -> Result<(), ConcurrentUpdateError> {
+        self.check_or_restart()
+    }
+
+    /// Validates that the node still has the version seen when the read lock was taken.
+    pub(crate) fn check_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
+        self.ptr.lockword().check_or_restart(self.version)
+    }
+}
+
+/// A reference to a node that has been write-locked. The lock is released by
+/// `write_unlock()` / `write_unlock_obsolete()`, or automatically when the reference is
+/// dropped.
+pub struct WriteLockedNodeRef<'e, V> {
+    // Set to null once the lock has been released explicitly.
+    ptr: NodePtr<V>,
+    // Ties the reference to the lifetime 'e of an epoch pin.
+    phantom: PhantomData<&'e EpochPin<'e>>,
+}
+
+/// Operations on a write-locked node. Most of them forward directly to `NodePtr`.
+impl<'e, V: Value> WriteLockedNodeRef<'e, V> {
+    /// Whether the node's children would fit in a smaller node type.
+    pub(crate) fn can_shrink(&self) -> bool {
+        self.ptr.can_shrink()
+    }
+
+    /// Number of children currently in the node.
+    pub(crate) fn num_children(&self) -> usize {
+        self.ptr.num_children()
+    }
+
+    /// Releases the write lock, bumping the node's version.
+    pub(crate) fn write_unlock(mut self) {
+        self.ptr.lockword().write_unlock();
+        // Null out the pointer so that the Drop impl does not unlock a second time.
+        self.ptr = NodePtr::null();
+    }
+
+    /// Releases the write lock and marks the node obsolete.
+    pub(crate) fn write_unlock_obsolete(mut self) {
+        self.ptr.lockword().write_unlock_obsolete();
+        // Null out the pointer so that the Drop impl does not unlock a second time.
+        self.ptr = NodePtr::null();
+    }
+
+    /// The node's prefix bytes.
+    pub(crate) fn get_prefix(&self) -> &[u8] {
+        self.ptr.get_prefix()
+    }
+
+    /// Shortens the node's prefix in place to `new_prefix_len` bytes.
+    pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
+        self.ptr.truncate_prefix(new_prefix_len)
+    }
+
+    /// Puts `prefix` and `prefix_byte` in front of the node's current prefix.
+    pub(crate) fn prepend_prefix(&mut self, prefix: &[u8], prefix_byte: u8) {
+        self.ptr.prepend_prefix(prefix, prefix_byte)
+    }
+
+    /// Adds `child` to the node under `key_byte`.
+    pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
+        self.ptr.insert_child(key_byte, child)
+    }
+
+    /// Mutable access to the value stored in the node (presumably a leaf -- confirm).
+    pub(crate) fn get_leaf_value_mut(&mut self) -> &mut V {
+        self.ptr.get_leaf_value_mut()
+    }
+
+    /// Allocates a grown replacement for this node via `NodePtr::grow`. This node itself
+    /// is left as it is; the caller has to link the new node into the tree.
+    pub(crate) fn grow<'a, A>(
+        &self,
+        allocator: &'a A,
+    ) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+    where
+        A: ArtAllocator<V>,
+    {
+        let new_node = self.ptr.grow(allocator)?;
+        Ok(NewNodeRef {
+            ptr: new_node,
+            allocator,
+            // The children are shared with this node, not newly allocated.
+            extra_nodes: Vec::new(),
+        })
+    }
+
+    /// Allocates a shrunk replacement for this node via `NodePtr::shrink`. This node
+    /// itself is left as it is; the caller has to link the new node into the tree.
+    pub(crate) fn shrink<'a, A>(
+        &self,
+        allocator: &'a A,
+    ) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+    where
+        A: ArtAllocator<V>,
+    {
+        let new_node = self.ptr.shrink(allocator)?;
+        Ok(NewNodeRef {
+            ptr: new_node,
+            allocator,
+            // The children are shared with this node, not newly allocated.
+            extra_nodes: Vec::new(),
+        })
+    }
+
+    /// The raw pointer to the node. The lock stays with `self`.
+    pub(crate) fn as_ptr(&self) -> NodePtr<V> {
+        self.ptr
+    }
+
+    /// Points the child slot for `key_byte` at `replacement`.
+    pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
+        self.ptr.replace_child(key_byte, replacement);
+    }
+
+    /// Removes the child stored under `key_byte`.
+    pub(crate) fn delete_child(&mut self, key_byte: u8) {
+        self.ptr.delete_child(key_byte);
+    }
+
+    /// Returns the key byte and an (unlocked) reference to the node's only child.
+    ///
+    /// Panics if the node does not have exactly one child.
+    pub(crate) fn find_remaining_child(&self) -> (u8, NodeRef<'e, V>) {
+        assert_eq!(self.num_children(), 1);
+        // Searching from key byte 0 finds the single child wherever it is.
+        let child_or_value = self.ptr.find_next_child(0);
+
+        match child_or_value {
+            None => panic!("could not find only child in node"),
+            Some((k, child_ptr)) => (
+                k,
+                NodeRef {
+                    ptr: child_ptr,
+                    phantom: self.phantom,
+                },
+            ),
+        }
+    }
+}
+
+/// Releases the write lock when the reference goes out of scope, unless it was already
+/// released explicitly (in which case the pointer has been nulled out).
+impl<'e, V> Drop for WriteLockedNodeRef<'e, V> {
+    fn drop(&mut self) {
+        if self.ptr.is_null() {
+            return;
+        }
+        self.ptr.lockword().write_unlock();
+    }
+}
+
+/// A newly allocated node that has not been linked into the tree yet.
+///
+/// If it is dropped before `into_ptr()` is called, the node and the nodes listed in
+/// `extra_nodes` are handed back to the allocator.
+pub(crate) struct NewNodeRef<'a, V, A>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    // Null after into_ptr() has handed the node over to the caller.
+    ptr: NodePtr<V>,
+    allocator: &'a A,
+
+    // Other newly allocated nodes that were inserted as children of this one.
+    extra_nodes: Vec<NodePtr<V>>,
+}
+
+impl<'a, V, A> NewNodeRef<'a, V, A>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    /// Adds an already existing node as a child under `key_byte`.
+    ///
+    /// The child is not owned by this new node: it is not deallocated if the new node is
+    /// dropped.
+    pub(crate) fn insert_old_child(&mut self, key_byte: u8, child: &WriteLockedNodeRef<V>) {
+        self.ptr.insert_child(key_byte, child.as_ptr())
+    }
+
+    /// Hands the node over to the caller as a raw pointer. After this, dropping the
+    /// `NewNodeRef` deallocates nothing.
+    pub(crate) fn into_ptr(mut self) -> NodePtr<V> {
+        let ptr = self.ptr;
+        self.ptr = NodePtr::null();
+        ptr
+    }
+
+    /// Adds another newly allocated node as a child under `key_byte` and takes over
+    /// responsibility for it.
+    ///
+    /// The child's own `extra_nodes` are taken over as well. Otherwise nodes more than
+    /// one level below this one would be forgotten and never deallocated if this node is
+    /// dropped without `into_ptr()` being called.
+    pub(crate) fn insert_new_child(&mut self, key_byte: u8, mut child: NewNodeRef<'a, V, A>) {
+        // Must be taken before into_ptr() consumes the child.
+        let inherited = std::mem::take(&mut child.extra_nodes);
+        let child_ptr = child.into_ptr();
+        self.ptr.insert_child(key_byte, child_ptr);
+        self.extra_nodes.push(child_ptr);
+        self.extra_nodes.extend(inherited);
+    }
+}
+
+impl<'a, V, A> Drop for NewNodeRef<'a, V, A>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    /// Gives the new node, and the extra nodes recorded with it, back to the allocator.
+    /// Does nothing if the node was already handed out with into_ptr().
+    fn drop(&mut self) {
+        if self.ptr.is_null() {
+            return;
+        }
+
+        self.ptr.deallocate(self.allocator);
+        for extra in &self.extra_nodes {
+            extra.deallocate(self.allocator);
+        }
+    }
+}
+
+/// Allocates a new internal node with the given `prefix`, wrapped in a `NewNodeRef` so
+/// that it is freed again if it never gets linked into the tree.
+///
+/// Fails with `OutOfMemoryError` if the allocation fails.
+pub(crate) fn new_internal<'a, V, A>(
+    prefix: &[u8],
+    allocator: &'a A,
+) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    let ptr = node_ptr::new_internal(prefix, allocator)?;
+    let extra_nodes = Vec::new();
+    Ok(NewNodeRef {
+        ptr,
+        allocator,
+        extra_nodes,
+    })
+}
+
+/// Allocates a new leaf node holding `value` with the given `prefix`, wrapped in a
+/// `NewNodeRef` so that it is freed again if it never gets linked into the tree.
+///
+/// Fails with `OutOfMemoryError` if the allocation fails.
+pub(crate) fn new_leaf<'a, V, A>(
+    prefix: &[u8],
+    value: V,
+    allocator: &'a A,
+) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
+where
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    let ptr = node_ptr::new_leaf(prefix, value, allocator)?;
+    let extra_nodes = Vec::new();
+    Ok(NewNodeRef {
+        ptr,
+        allocator,
+        extra_nodes,
+    })
+}
--- a/libs/neonart/src/allocator.rs
+++ b/libs/neonart/src/allocator.rs
@@ -0,0 +1,156 @@
+pub mod block;
+mod multislab;
+mod slab;
+pub mod r#static;
+
+use std::alloc::Layout;
+use std::marker::PhantomData;
+use std::mem::MaybeUninit;
+use std::sync::atomic::Ordering;
+
+use crate::allocator::multislab::MultiSlabAllocator;
+use crate::allocator::r#static::alloc_from_slice;
+
+use spin;
+
+use crate::Tree;
+pub use crate::algorithm::node_ptr::{
+    NodeInternal4, NodeInternal16, NodeInternal48, NodeInternal256, NodeLeaf,
+};
+
+/// Error returned when a tree node cannot be allocated. Carries no further details.
+#[derive(Debug)]
+pub struct OutOfMemoryError();
+
+/// Memory allocation interface used by the tree: one allocation function for the tree
+/// header, and one alloc/dealloc pair for each node type.
+///
+/// All functions take `&self` and deal in raw pointers.
+/// NOTE(review): how an implementation signals allocation failure (e.g. a null pointer)
+/// is not visible from this trait -- confirm against the implementations and callers.
+pub trait ArtAllocator<V: crate::Value> {
+    /// Returns memory for the `Tree` struct itself.
+    fn alloc_tree(&self) -> *mut Tree<V>;
+
+    // One allocation function per node type
+    fn alloc_node_internal4(&self) -> *mut NodeInternal4<V>;
+    fn alloc_node_internal16(&self) -> *mut NodeInternal16<V>;
+    fn alloc_node_internal48(&self) -> *mut NodeInternal48<V>;
+    fn alloc_node_internal256(&self) -> *mut NodeInternal256<V>;
+    fn alloc_node_leaf(&self) -> *mut NodeLeaf<V>;
+
+    // The matching deallocation functions
+    fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>);
+    fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>);
+    fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>);
+    fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>);
+    fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>);
+}
+
+/// An `ArtAllocator` that carves everything out of one caller-provided memory area: the
+/// allocator struct itself, space for one `Tree`, and a multi-slab allocator with one
+/// slab per node type.
+pub struct ArtMultiSlabAllocator<'t, V>
+where
+    V: crate::Value,
+{
+    // Space reserved for the Tree struct; None once alloc_tree() has handed it out.
+    tree_area: spin::Mutex<Option<&'t mut MaybeUninit<Tree<V>>>>,
+
+    // Slab allocator with five slabs, one per node type (see LAYOUTS for the order).
+    pub(crate) inner: MultiSlabAllocator<'t, 5>,
+
+    phantom_val: PhantomData<V>,
+}
+
+impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
+    /// Memory layouts of the node types, in slab order: slab 0 holds `NodeInternal4`,
+    /// 1 `NodeInternal16`, 2 `NodeInternal48`, 3 `NodeInternal256` and 4 `NodeLeaf`.
+    const LAYOUTS: [Layout; 5] = [
+        Layout::new::<NodeInternal4<V>>(),
+        Layout::new::<NodeInternal16<V>>(),
+        Layout::new::<NodeInternal48<V>>(),
+        Layout::new::<NodeInternal256<V>>(),
+        Layout::new::<NodeLeaf<V>>(),
+    ];
+
+    /// Sets up an allocator inside `area` and returns a reference to it.
+    ///
+    /// The allocator struct is placed at the start of `area`, followed by the space
+    /// reserved for the `Tree`; whatever remains is given to the slab allocator.
+    pub fn new(area: &'t mut [MaybeUninit<u8>]) -> &'t mut ArtMultiSlabAllocator<'t, V> {
+        let (self_slot, rest) = alloc_from_slice::<ArtMultiSlabAllocator<V>>(area);
+        let (tree_slot, rest) = alloc_from_slice::<Tree<V>>(rest);
+
+        let tree_area = spin::Mutex::new(Some(tree_slot));
+        let inner = MultiSlabAllocator::new(rest, &Self::LAYOUTS);
+        self_slot.write(ArtMultiSlabAllocator {
+            tree_area,
+            inner,
+            phantom_val: PhantomData,
+        })
+    }
+}
+
+/// Each node type is served from its own slab; the slab indexes follow the order of
+/// `LAYOUTS`.
+impl<'t, V: crate::Value> ArtAllocator<V> for ArtMultiSlabAllocator<'t, V> {
+    /// Hands out the space reserved for the tree. This works only once.
+    ///
+    /// Panics if called a second time.
+    fn alloc_tree(&self) -> *mut Tree<V> {
+        match self.tree_area.lock().take() {
+            Some(tree_area) => tree_area.as_mut_ptr().cast(),
+            None => panic!("cannot allocate more than one tree"),
+        }
+    }
+
+    // Allocation: slab 0 = internal4, 1 = internal16, 2 = internal48, 3 = internal256, 4 = leaf
+
+    fn alloc_node_internal4(&self) -> *mut NodeInternal4<V> {
+        let chunk = self.inner.alloc_slab(0);
+        chunk.cast()
+    }
+    fn alloc_node_internal16(&self) -> *mut NodeInternal16<V> {
+        let chunk = self.inner.alloc_slab(1);
+        chunk.cast()
+    }
+    fn alloc_node_internal48(&self) -> *mut NodeInternal48<V> {
+        let chunk = self.inner.alloc_slab(2);
+        chunk.cast()
+    }
+    fn alloc_node_internal256(&self) -> *mut NodeInternal256<V> {
+        let chunk = self.inner.alloc_slab(3);
+        chunk.cast()
+    }
+    fn alloc_node_leaf(&self) -> *mut NodeLeaf<V> {
+        let chunk = self.inner.alloc_slab(4);
+        chunk.cast()
+    }
+
+    // Deallocation: each chunk goes back to the slab it came from
+
+    fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>) {
+        let chunk = ptr.cast();
+        self.inner.dealloc_slab(0, chunk)
+    }
+    fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>) {
+        let chunk = ptr.cast();
+        self.inner.dealloc_slab(1, chunk)
+    }
+    fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>) {
+        let chunk = ptr.cast();
+        self.inner.dealloc_slab(2, chunk)
+    }
+    fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>) {
+        let chunk = ptr.cast();
+        self.inner.dealloc_slab(3, chunk)
+    }
+    fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>) {
+        let chunk = ptr.cast();
+        self.inner.dealloc_slab(4, chunk)
+    }
+}
+
+impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
+    /// Collects the per-node-type counters of the slab allocator.
+    ///
+    /// Each counter is read with its own relaxed load, so the numbers are not a
+    /// consistent snapshot if allocations are going on at the same time.
+    pub(crate) fn get_statistics(&self) -> ArtMultiSlabStats {
+        let slabs = &self.inner.slab_descs;
+        let allocated = |slab_idx: usize| slabs[slab_idx].num_allocated.load(Ordering::Relaxed);
+        let blocks = |slab_idx: usize| slabs[slab_idx].num_blocks.load(Ordering::Relaxed);
+
+        ArtMultiSlabStats {
+            num_internal4: allocated(0),
+            num_internal16: allocated(1),
+            num_internal48: allocated(2),
+            num_internal256: allocated(3),
+            num_leaf: allocated(4),
+
+            num_blocks_internal4: blocks(0),
+            num_blocks_internal16: blocks(1),
+            num_blocks_internal48: blocks(2),
+            num_blocks_internal256: blocks(3),
+            num_blocks_leaf: blocks(4),
+        }
+    }
+}
+
+/// Counters of `ArtMultiSlabAllocator`, one set per node type.
+#[derive(Clone, Debug)]
+pub struct ArtMultiSlabStats {
+    // Values of each slab's 'num_allocated' counter
+    pub num_internal4: u64,
+    pub num_internal16: u64,
+    pub num_internal48: u64,
+    pub num_internal256: u64,
+    pub num_leaf: u64,
+
+    // Values of each slab's 'num_blocks' counter
+    pub num_blocks_internal4: u64,
+    pub num_blocks_internal16: u64,
+    pub num_blocks_internal48: u64,
+    pub num_blocks_internal256: u64,
+    pub num_blocks_leaf: u64,
+}
--- a/libs/neonart/src/allocator/block.rs
+++ b/libs/neonart/src/allocator/block.rs
@@ -0,0 +1,191 @@
+//! Simple allocator of fixed-size blocks
+
+use std::mem::MaybeUninit;
+use std::sync::atomic::{AtomicU64, Ordering};
+
+use spin;
+
+/// Size, in bytes, of every block handed out by `BlockAllocator`.
+pub const BLOCK_SIZE: usize = 16 * 1024;
+
+/// Block number that stands for "no block": marks the end of the free list and is
+/// returned internally when no block is available.
+const INVALID_BLOCK: u64 = u64::MAX;
+
+/// Allocator of fixed-size (`BLOCK_SIZE`) blocks from one contiguous memory area.
+///
+/// Blocks are identified by block number. Released blocks are tracked in a linked list of
+/// "freelist blocks" that are stored in free blocks themselves.
+pub(crate) struct BlockAllocator<'t> {
+    // The memory area, starting at the first BLOCK_SIZE-aligned address.
+    blocks_ptr: &'t [MaybeUninit<u8>],
+    // Number of whole blocks that fit in the area.
+    num_blocks: u64,
+    // Number of blocks handed out at least once; blocks at or above this number have
+    // never been used.
+    num_initialized: AtomicU64,
+
+    // Block number of the first freelist block, or INVALID_BLOCK if there is none.
+    freelist_head: spin::Mutex<u64>,
+}
+
+/// Contents of a block that serves as a node of the free list. The mutex protects the
+/// list entries stored in the block.
+struct FreeListBlock {
+    inner: spin::Mutex<FreeListBlockInner>,
+}
+
+/// The lock-protected part of a freelist block.
+struct FreeListBlockInner {
+    // Block number of the next freelist block, or INVALID_BLOCK at the end of the list.
+    next: u64,
+
+    // Number of valid entries at the start of 'free_blocks'.
+    num_free_blocks: u64,
+    // Block numbers of free blocks.
+    free_blocks: [u64; 100], // FIXME: fill the rest of the block
+}
+
+impl<'t> BlockAllocator<'t> {
+    /// Creates an allocator over `area`.
+    ///
+    /// The bytes before the first `BLOCK_SIZE`-aligned address, and any partial block at
+    /// the end, are left unused.
+    pub(crate) fn new(area: &'t mut [MaybeUninit<u8>]) -> Self {
+        // Use all the space for the blocks
+        let padding = area.as_ptr().align_offset(BLOCK_SIZE);
+        let remain = &mut area[padding..];
+
+        let num_blocks = (remain.len() / BLOCK_SIZE) as u64;
+
+        BlockAllocator {
+            blocks_ptr: remain,
+            num_blocks,
+            num_initialized: AtomicU64::new(0),
+            freelist_head: spin::Mutex::new(INVALID_BLOCK),
+        }
+    }
+
+    /// Interprets block `blkno` as a freelist block.
+    ///
+    /// safety: you must hold a lock on the pointer to this block, otherwise it might get
+    /// reused for another kind of block
+    fn read_freelist_block(&self, blkno: u64) -> &FreeListBlock {
+        let ptr: *const FreeListBlock = self.get_block_ptr(blkno).cast();
+        unsafe { ptr.as_ref().unwrap() }
+    }
+
+    /// Returns a pointer to the start of block `blkno`.
+    ///
+    /// Panics if `blkno` is not a valid block number.
+    fn get_block_ptr(&self, blkno: u64) -> *mut u8 {
+        assert!(blkno < self.num_blocks);
+        unsafe {
+            self.blocks_ptr
+                .as_ptr()
+                .byte_offset(blkno as isize * BLOCK_SIZE as isize)
+        }
+        .cast_mut()
+        .cast()
+    }
+
+    /// Allocates one block and returns it as a slice of `BLOCK_SIZE` bytes.
+    ///
+    /// Panics if no block is available.
+    #[allow(clippy::mut_from_ref)]
+    pub(crate) fn alloc_block(&self) -> &mut [MaybeUninit<u8>] {
+        // FIXME: handle OOM
+        let blkno = self.alloc_block_internal();
+        if blkno == INVALID_BLOCK {
+            panic!("out of memory");
+        }
+
+        let ptr: *mut MaybeUninit<u8> = self.get_block_ptr(blkno).cast();
+        unsafe { std::slice::from_raw_parts_mut(ptr, BLOCK_SIZE) }
+    }
+
+    /// Picks a block to hand out and returns its block number, or `INVALID_BLOCK` if
+    /// there is none. Previously released blocks are preferred over never-used ones.
+    fn alloc_block_internal(&self) -> u64 {
+        //  check the free list.
+        {
+            let mut freelist_head = self.freelist_head.lock();
+            if *freelist_head != INVALID_BLOCK {
+                let freelist_block = self.read_freelist_block(*freelist_head);
+
+                // acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
+                let mut g = freelist_block.inner.lock();
+
+                if g.num_free_blocks > 0 {
+                    // Take the last entry recorded in the first freelist block.
+                    g.num_free_blocks -= 1;
+                    let result = g.free_blocks[g.num_free_blocks as usize];
+                    return result;
+                } else {
+                    // consume the freelist block itself
+                    let result = *freelist_head;
+                    *freelist_head = g.next;
+                    // This freelist block is now unlinked and can be repurposed
+                    drop(g);
+                    return result;
+                }
+            }
+        }
+
+        // If there are some blocks left that we've never used, pick next such block
+        // (compare-and-swap loop: retry with the fresh value if another thread got ahead)
+        let mut next_uninitialized = self.num_initialized.load(Ordering::Relaxed);
+        while next_uninitialized < self.num_blocks {
+            match self.num_initialized.compare_exchange(
+                next_uninitialized,
+                next_uninitialized + 1,
+                Ordering::Relaxed,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => {
+                    return next_uninitialized;
+                }
+                Err(old) => {
+                    next_uninitialized = old;
+                    continue;
+                }
+            }
+        }
+
+        // out of blocks
+        INVALID_BLOCK
+    }
+
+    /// Gives the block starting at `block_ptr` back to the allocator.
+    // TODO: this is currently unused. The slab allocator never releases blocks
+    #[allow(dead_code)]
+    pub(crate) fn release_block(&self, block_ptr: *mut u8) {
+        // Convert the pointer back into a block number.
+        let blockno = unsafe { block_ptr.byte_offset_from(self.blocks_ptr) / BLOCK_SIZE as isize };
+        self.release_block_internal(blockno as u64);
+    }
+
+    /// Puts block `blockno` on the free list: either as an entry in the first freelist
+    /// block, or, if there is no freelist block or the first one is full, by turning the
+    /// block itself into the new head of the list.
+    fn release_block_internal(&self, blockno: u64) {
+        let mut freelist_head = self.freelist_head.lock();
+        if *freelist_head != INVALID_BLOCK {
+            let freelist_block = self.read_freelist_block(*freelist_head);
+
+            // acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
+            let mut g = freelist_block.inner.lock();
+
+            let num_free_blocks = g.num_free_blocks;
+            if num_free_blocks < g.free_blocks.len() as u64 {
+                g.free_blocks[num_free_blocks as usize] = blockno;
+                g.num_free_blocks += 1;
+                return;
+            }
+        }
+
+        // Convert the block into a new freelist block
+        let block_ptr: *mut FreeListBlock = self.get_block_ptr(blockno).cast();
+        let init = FreeListBlock {
+            inner: spin::Mutex::new(FreeListBlockInner {
+                next: *freelist_head,
+                num_free_blocks: 0,
+                free_blocks: [INVALID_BLOCK; 100],
+            }),
+        };
+        unsafe { (*block_ptr) = init };
+        *freelist_head = blockno;
+    }
+
+    /// Walks the free list and returns the allocator's counters.
+    ///
+    /// `num_free_blocks` is the sum of the entry counts of all freelist blocks; the
+    /// freelist blocks themselves are not included in it.
+    // for debugging
+    pub(crate) fn get_statistics(&self) -> BlockAllocatorStats {
+        let mut num_free_blocks = 0;
+
+        // The walk always keeps the lock of the previous list element (first the head,
+        // then the previous freelist block) until the next one has been locked.
+        let mut _prev_lock = None;
+        let head_lock = self.freelist_head.lock();
+        let mut next_blk = *head_lock;
+        let mut _head_lock = Some(head_lock);
+        while next_blk != INVALID_BLOCK {
+            let freelist_block = self.read_freelist_block(next_blk);
+            let lock = freelist_block.inner.lock();
+            num_free_blocks += lock.num_free_blocks;
+            next_blk = lock.next;
+            _prev_lock = Some(lock); // hold the lock until we've read the next block
+            _head_lock = None;
+        }
+
+        BlockAllocatorStats {
+            num_blocks: self.num_blocks,
+            num_initialized: self.num_initialized.load(Ordering::Relaxed),
+            num_free_blocks,
+        }
+    }
+}
+
+/// Counters reported by `BlockAllocator::get_statistics`.
+#[derive(Clone, Debug)]
+pub struct BlockAllocatorStats {
+    // Total number of blocks in the area
+    pub num_blocks: u64,
+    // Number of blocks that have been handed out at least once
+    pub num_initialized: u64,
+    // Number of block entries recorded in the freelist blocks
+    pub num_free_blocks: u64,
+}
--- a/libs/neonart/src/allocator/multislab.rs
+++ b/libs/neonart/src/allocator/multislab.rs
@@ -0,0 +1,33 @@
+use std::alloc::Layout;
+use std::mem::MaybeUninit;
+
+use crate::allocator::block::BlockAllocator;
+use crate::allocator::slab::SlabDesc;
+
+pub struct MultiSlabAllocator<'t, const N: usize> {
+    pub(crate) block_allocator: BlockAllocator<'t>, // passed to every slab on alloc / dealloc
+    // One slab descriptor per layout given to `new()`, indexed by `slab_idx`.
+    pub(crate) slab_descs: [SlabDesc; N],
+}
+
+impl<'t, const N: usize> MultiSlabAllocator<'t, N> {
+    /// Set up a block allocator over `area` plus one slab descriptor per entry of `layouts`.
+    pub(crate) fn new(
+        area: &'t mut [MaybeUninit<u8>],
+        layouts: &[Layout; N],
+    ) -> MultiSlabAllocator<'t, N> {
+        let block_allocator = BlockAllocator::new(area);
+        let slab_descs = std::array::from_fn(|idx| SlabDesc::new(&layouts[idx]));
+        Self { block_allocator, slab_descs }
+    }
+
+    /// Allocate one chunk from the slab at `slab_idx`.
+    pub(crate) fn alloc_slab(&self, slab_idx: usize) -> *mut u8 {
+        SlabDesc::alloc_chunk(&self.slab_descs[slab_idx], &self.block_allocator)
+    }
+
+    /// Hand the chunk at `ptr` back to the slab at `slab_idx`.
+    pub(crate) fn dealloc_slab(&self, slab_idx: usize, ptr: *mut u8) {
+        SlabDesc::dealloc_chunk(&self.slab_descs[slab_idx], ptr, &self.block_allocator)
+    }
+}
--- a/libs/neonart/src/allocator/slab.rs
+++ b/libs/neonart/src/allocator/slab.rs
@@ -0,0 +1,433 @@
+//! A slab allocator that carves out fixed-size chunks from larger blocks.
+//!
+//!
+
+use std::alloc::Layout;
+use std::mem::MaybeUninit;
+use std::ops::Deref;
+use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
+
+use spin;
+
+use super::alloc_from_slice;
+use super::block::BlockAllocator;
+
+use crate::allocator::block::BLOCK_SIZE;
+
+pub(crate) struct SlabDesc {
+    pub(crate) layout: Layout, // size and alignment of every chunk carved out by this slab
+    // This slab's blocks, split into a "full" and a "nonfull" list.
+    block_lists: spin::RwLock<BlockLists>,
+    // Statistics, updated with relaxed atomics.
+    pub(crate) num_blocks: AtomicU64, // bumped when alloc_chunk() allocates a new block
+    pub(crate) num_allocated: AtomicU64, // +1 per alloc_chunk(), -1 per dealloc_chunk()
+}
+
+// FIXME: Not sure if SlabDesc is really Sync or Send. It probably is when it's empty, but
+// 'block_lists' contains pointers when it's not empty. In the current use as part of
+// the art tree, SlabDescs are only moved during initialization.
+unsafe impl Sync for SlabDesc {}
+unsafe impl Send for SlabDesc {}
+
+#[derive(Default, Debug)]
+struct BlockLists {
+    full_blocks: BlockList, // blocks that alloc_chunk() found to have no free chunk
+    nonfull_blocks: BlockList, // new blocks, and blocks that had a chunk freed
+}
+
+impl BlockLists {
+    /// Unlink `elem` from whichever of the two lists currently holds it.
+    ///
+    /// # Safety
+    /// `elem` is dereferenced; it must be a valid node linked into one of the two lists.
+    unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
+        let (is_last, is_first) = unsafe { ((*elem).next.is_null(), (*elem).prev.is_null()) };
+        // Only an end node needs its owning list; compare it against the full list's ends.
+        let in_full = if is_last {
+            Some(self.full_blocks.tail == elem)
+        } else if is_first {
+            Some(self.full_blocks.head == elem)
+        } else {
+            None
+        };
+        let list = match in_full {
+            Some(true) => Some(&mut self.full_blocks),
+            Some(false) => Some(&mut self.nonfull_blocks),
+            None => None,
+        };
+        unsafe { unlink_slab_block(list, elem) };
+    }
+}
+
+unsafe fn unlink_slab_block(mut list: Option<&mut BlockList>, elem: *mut SlabBlockHeader) {
+    unsafe { // dereferences `elem` and whichever of its `prev` / `next` pointers are non-null
+        if (*elem).next.is_null() {
+            assert_eq!(list.as_ref().unwrap().tail, elem); // last node: `list` must be Some
+            list.as_mut().unwrap().tail = (*elem).prev;
+        } else {
+            assert_eq!((*(*elem).next).prev, elem);
+            (*(*elem).next).prev = (*elem).prev;
+        }
+        if (*elem).prev.is_null() {
+            assert_eq!(list.as_ref().unwrap().head, elem); // first node: `list` must be Some
+            list.as_mut().unwrap().head = (*elem).next;
+        } else {
+            assert_eq!((*(*elem).prev).next, elem);
+            (*(*elem).prev).next = (*elem).next;
+        }
+    } // note: `elem`'s own `prev` / `next` fields are left as they were
+}
+
+#[derive(Debug)]
+struct BlockList {
+    head: *mut SlabBlockHeader, // first block; null when the list is empty
+    tail: *mut SlabBlockHeader, // last block; null in a default (empty) list
+}
+
+impl Default for BlockList {
+    /// The default list is empty: `head` and `tail` are both null, which is exactly what
+    /// `is_empty()` checks for.
+    fn default() -> Self {
+        let null = std::ptr::null_mut();
+        Self { head: null, tail: null }
+    }
+}
+
+impl BlockList {
+    /// Insert `elem` at the front of the list. Unsafe because `elem` and the current head (if
+    /// there is one) are dereferenced and must therefore be valid.
+    unsafe fn push_head(&mut self, elem: *mut SlabBlockHeader) {
+        let old_head = self.head;
+        unsafe {
+            (*elem).next = old_head;
+            if old_head.is_null() {
+                self.tail = elem;
+            } else {
+                (*old_head).prev = elem;
+            }
+            (*elem).prev = std::ptr::null_mut();
+        }
+        self.head = elem;
+    }
+
+    /// True when the list has no nodes.
+    fn is_empty(&self) -> bool {
+        self.head.is_null()
+    }
+
+    /// Remove `elem` from this list. Unsafe because `elem` and its neighbours are
+    /// dereferenced; `elem` must be linked into this very list.
+    unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
+        unsafe { unlink_slab_block(Some(self), elem) }
+    }
+
+    /// Print every block in the list with its free / total chunk counts (test helper).
+    #[cfg(test)]
+    fn dump(&self) {
+        let mut cursor = self.head;
+        while let Some(blk) = unsafe { cursor.as_ref() } {
+            let free = blk.num_free_chunks.load(Ordering::Relaxed);
+            eprintln!("  blk {:?} (free {}/{})", cursor, free, blk.num_chunks);
+            cursor = blk.next;
+        }
+    }
+}
+
+impl SlabDesc {
+    pub(crate) fn new(layout: &Layout) -> SlabDesc {
+        Self {
+            layout: *layout,
+            block_lists: spin::RwLock::new(BlockLists::default()), // both lists start empty
+            num_blocks: AtomicU64::default(), // both statistics counters start at zero
+            num_allocated: AtomicU64::default(),
+        }
+    }
+}
+
+#[derive(Debug)]
+struct SlabBlockHeader {
+    free_chunks_head: spin::Mutex<*mut FreeChunk>, // head of this block's free-chunk list
+    num_free_chunks: AtomicU32, // number of chunks on that list
+    num_chunks: u32, // this is really a constant for a given Layout
+    // Links for the SlabDesc's full / nonfull block lists:
+    // these fields are protected by the lock on the BlockLists
+    prev: *mut SlabBlockHeader,
+    next: *mut SlabBlockHeader,
+}
+
+struct FreeChunk {
+    next: *mut FreeChunk, // next free chunk of the block, or null; written into the chunk itself
+}
+
+enum ReadOrWriteGuard<'a, T> { // lets alloc_chunk() keep either kind of guard in one variable
+    Read(spin::RwLockReadGuard<'a, T>),
+    Write(spin::RwLockWriteGuard<'a, T>),
+}
+
+impl<'a, T> Deref for ReadOrWriteGuard<'a, T> {
+    type Target = T;
+    /// Borrow the protected value, whichever kind of guard is held.
+    fn deref(&self) -> &T {
+        match self {
+            Self::Read(guard) => &**guard,
+            Self::Write(guard) => &**guard,
+        }
+    }
+}
+
+impl SlabDesc {
+    /// Allocate one chunk: pop a free chunk from the block at the head of the nonfull list, or
+    /// allocate a whole new block when that list is empty. Returns a pointer to the chunk.
+    pub fn alloc_chunk(&self, block_allocator: &BlockAllocator) -> *mut u8 {
+        // Are there any free chunks? Start with the read lock on the lists; the write lock is
+        // only taken after the head block turned out to be full.
+        let mut acquire_write = false;
+        'outer: loop {
+            let mut block_lists_guard = if acquire_write {
+                ReadOrWriteGuard::Write(self.block_lists.write())
+            } else {
+                ReadOrWriteGuard::Read(self.block_lists.read())
+            };
+            'inner: loop {
+                let block_ptr = block_lists_guard.nonfull_blocks.head;
+                if block_ptr.is_null() {
+                    break 'outer; // nonfull list is empty: fall through and allocate a block
+                }
+                unsafe {
+                    let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
+                    if !(*free_chunks_head).is_null() {
+                        let result = *free_chunks_head;
+                        (*free_chunks_head) = (*result).next;
+                        let _old = (*block_ptr).num_free_chunks.fetch_sub(1, Ordering::Relaxed);
+                        self.num_allocated.fetch_add(1, Ordering::Relaxed);
+                        return result.cast();
+                    }
+                }
+
+                // The block at the head of the list was full. Grab write lock and retry
+                match block_lists_guard {
+                    ReadOrWriteGuard::Read(_) => {
+                        acquire_write = true;
+                        continue 'outer; // the read guard is dropped before the write lock is taken
+                    }
+                    ReadOrWriteGuard::Write(ref mut g) => {
+                        // move the node to the list of full blocks
+                        unsafe {
+                            g.nonfull_blocks.unlink(block_ptr);
+                            g.full_blocks.push_head(block_ptr);
+                        };
+                        continue 'inner; // look at the new head of the nonfull list
+                    }
+                }
+            }
+        }
+
+        // no free chunks. Allocate a new block (and the chunk from that)
+        let (new_block, new_chunk) = self.alloc_block_and_chunk(block_allocator);
+        self.num_blocks.fetch_add(1, Ordering::Relaxed);
+        // Add the block to the list in the SlabDesc
+        unsafe {
+            let mut block_lists_guard = self.block_lists.write();
+            block_lists_guard.nonfull_blocks.push_head(new_block);
+        }
+        self.num_allocated.fetch_add(1, Ordering::Relaxed);
+        new_chunk
+    }
+
+    /// Return `chunk_ptr` to the free list of the block it was carved from. `_block_allocator`
+    /// is currently unused: completely empty blocks are not released yet (see the TODO below).
+    pub fn dealloc_chunk(&self, chunk_ptr: *mut u8, _block_allocator: &BlockAllocator) {
+        // Find the block it belongs to. You can find the block from the address. (And knowing the
+        // layout, you could calculate the chunk number too.)
+        let block_ptr: *mut SlabBlockHeader = {
+            let block_addr = (chunk_ptr.addr() / BLOCK_SIZE) * BLOCK_SIZE; // round down
+            chunk_ptr.with_addr(block_addr).cast()
+        };
+        let chunk_ptr: *mut FreeChunk = chunk_ptr.cast();
+
+        // Mark the chunk as free in 'freechunks' list
+        let num_chunks;
+        let num_free_chunks;
+        unsafe {
+            let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
+            (*chunk_ptr).next = *free_chunks_head;
+            *free_chunks_head = chunk_ptr;
+            num_free_chunks = (*block_ptr).num_free_chunks.fetch_add(1, Ordering::Relaxed) + 1;
+            num_chunks = (*block_ptr).num_chunks;
+        }
+
+        if num_free_chunks == 1 {
+            // If the block was full previously, add it to the nonfull blocks list. Note that
+            // we're not holding the lock anymore, so it can immediately become full again.
+            // That's harmless, it will be moved back to the full list again when a call
+            // to alloc_chunk() sees it.
+            let mut block_lists = self.block_lists.write();
+            unsafe {
+                block_lists.unlink(block_ptr); // it may be on either list at this point
+                block_lists.nonfull_blocks.push_head(block_ptr);
+            };
+        } else if num_free_chunks == num_chunks {
+            // If the block became completely empty, move it to the free list
+            // TODO
+            // FIXME: we're still holding the spinlock. It's not exactly safe to return it to
+            // the free blocks list, is it? Defer it as garbage to wait out concurrent updates?
+            //block_allocator.release_block()
+        }
+        // update stats
+        self.num_allocated.fetch_sub(1, Ordering::Relaxed);
+    }
+
+    /// Allocate a new block, thread all of its chunks into a free list and take the first chunk.
+    /// Returns the block header (not yet linked into any list) and that chunk.
+    fn alloc_block_and_chunk(
+        &self,
+        block_allocator: &BlockAllocator,
+    ) -> (*mut SlabBlockHeader, *mut u8) {
+        // Free chunks store a `FreeChunk` link in place, so every chunk must be able to hold one.
+        assert!(self.layout.size() >= std::mem::size_of::<FreeChunk>());
+        // fixme: handle OOM
+        let block_slice: &mut [MaybeUninit<u8>] = block_allocator.alloc_block();
+        let (block_header, remain) = alloc_from_slice::<SlabBlockHeader>(block_slice);
+        let padding = remain.as_ptr().align_offset(self.layout.align());
+        let num_chunks = remain.len().saturating_sub(padding) / self.layout.size();
+        // Without this, `num_chunks - 1` below would wrap in release builds and the loop would
+        // write far past the end of the block.
+        assert!(num_chunks > 0, "slab layout does not fit in a block");
+        let first_chunk_ptr: *mut FreeChunk = remain[padding..].as_mut_ptr().cast();
+        unsafe {
+            let mut chunk_ptr = first_chunk_ptr;
+            for _ in 0..num_chunks - 1 {
+                let next_chunk_ptr = chunk_ptr.byte_add(self.layout.size());
+                (*chunk_ptr).next = next_chunk_ptr;
+                chunk_ptr = next_chunk_ptr;
+            }
+            (*chunk_ptr).next = std::ptr::null_mut();
+            // The first chunk is handed to the caller, so the free list starts at the second.
+            let block_header = block_header.write(SlabBlockHeader {
+                free_chunks_head: spin::Mutex::new((*first_chunk_ptr).next),
+                prev: std::ptr::null_mut(),
+                next: std::ptr::null_mut(),
+                num_chunks: num_chunks as u32,
+                num_free_chunks: AtomicU32::new(num_chunks as u32 - 1),
+            });
+            (block_header, first_chunk_ptr.cast())
+        }
+    }
+
+    /// Print the slab's counters to stderr, followed by the contents of the nonfull and the
+    /// full block lists (test helper).
+    #[cfg(test)]
+    fn dump(&self) {
+        let blocks = self.num_blocks.load(Ordering::Relaxed);
+        let allocated = self.num_allocated.load(Ordering::Relaxed);
+        eprintln!("slab dump ({} blocks, {} allocated chunks)", blocks, allocated);
+
+        let guard = self.block_lists.read();
+        eprintln!("nonfull blocks:");
+        guard.nonfull_blocks.dump();
+        eprintln!("full blocks:");
+        guard.full_blocks.dump();
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use rand::Rng;
+    use rand_distr::Zipf;
+
+    struct TestObject {
+        val: usize, // payload that the test writes and later checks
+        _dummy: [u8; BLOCK_SIZE / 4], // makes each object a bit larger than a quarter BLOCK_SIZE
+    }
+
+    struct TestObjectSlab<'a>(SlabDesc, BlockAllocator<'a>); // a slab plus its block source
+    impl<'a> TestObjectSlab<'a> {
+        fn new(block_allocator: BlockAllocator) -> TestObjectSlab {
+            let desc = SlabDesc::new(&Layout::new::<TestObject>());
+            TestObjectSlab(desc, block_allocator)
+        }
+        fn alloc(&self, val: usize) -> *mut TestObject { // allocate and store `val`
+            let obj = self.0.alloc_chunk(&self.1).cast::<TestObject>();
+            unsafe { (*obj).val = val };
+            obj
+        }
+
+        fn dealloc(&self, obj: *mut TestObject) { // give `obj` back to the slab
+            SlabDesc::dealloc_chunk(&self.0, obj.cast(), &self.1)
+        }
+    }
+
+    /// Allocate eleven objects, then randomly free / re-allocate them while checking contents.
+    #[test]
+    fn test_slab_alloc() {
+        const MEM_SIZE: usize = 100000000;
+        let mut area = Box::new_uninit_slice(MEM_SIZE);
+        let block_allocator = BlockAllocator::new(&mut area);
+        let slab = TestObjectSlab::new(block_allocator);
+
+        let mut all: Vec<*mut TestObject> = Vec::new();
+        for i in 0..11 {
+            all.push(slab.alloc(i));
+        }
+        #[allow(clippy::needless_range_loop)]
+        for i in 0..11 {
+            assert!(unsafe { (*all[i]).val == i });
+        }
+        // NOTE(review): presumably Zipf(10, 1.1) yields 1..=10, so all[0] stays allocated — confirm
+        let distribution = Zipf::new(10.0, 1.1).unwrap();
+        let mut rng = rand::rng();
+        for _ in 0..100000 {
+            slab.0.dump();
+            let idx = rng.sample(distribution) as usize;
+            let ptr: *mut TestObject = all[idx];
+            if !ptr.is_null() {
+                assert_eq!(unsafe { (*ptr).val }, idx); // still holds what alloc() stored
+                slab.dealloc(ptr);
+                all[idx] = std::ptr::null_mut(); // null marks the slot as currently freed
+            } else {
+                all[idx] = slab.alloc(idx);
+            }
+        }
+    }
+
+    fn new_test_blk(i: u32) -> *mut SlabBlockHeader { // heap-allocate a detached header
+        Box::into_raw(Box::new(SlabBlockHeader {
+            free_chunks_head: spin::Mutex::new(std::ptr::null_mut()),
+            num_free_chunks: AtomicU32::new(0),
+            num_chunks: i, // the only field that differs between test blocks
+            prev: std::ptr::null_mut(),
+            next: std::ptr::null_mut(),
+        })) // into_raw: the box is never freed
+    }
+
+    /// Exercise push_head() / unlink() on a list of one and then two blocks.
+    #[test]
+    fn test_block_linked_list() {
+        // note: these are leaked, but that's OK for tests
+        let a = new_test_blk(0);
+        let b = new_test_blk(1);
+        let mut list = BlockList::default();
+        assert!(list.is_empty());
+        // One element: pushing and unlinking it leaves the list empty again.
+        unsafe {
+            list.push_head(a);
+            assert!(!list.is_empty());
+            list.unlink(a);
+        }
+        assert!(list.is_empty());
+        // Two elements: `a` is pushed last, so it becomes the head and `b` stays the tail.
+        unsafe {
+            list.push_head(b);
+            list.push_head(a);
+            assert_eq!(list.head, a);
+            assert_eq!((*a).next, b);
+            assert_eq!((*b).prev, a);
+            assert_eq!(list.tail, b);
+
+            list.unlink(a);
+            list.unlink(b);
+            assert!(list.is_empty());
+        }
+    }
+}
--- a/libs/neonart/src/allocator/static.rs
+++ b/libs/neonart/src/allocator/static.rs
@@ -0,0 +1,44 @@
+use std::mem::MaybeUninit;
+
+/// Carve space for one `T` off the front of `area`, skipping whatever padding `T`'s alignment
+/// requires. Returns the uninitialized slot and the unused tail of `area`.
+///
+/// Panics with "out of memory" if the padded value does not fit in `area`.
+pub fn alloc_from_slice<T>(
+    area: &mut [MaybeUninit<u8>],
+) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
+    let (size, align) = (std::mem::size_of::<T>(), std::mem::align_of::<T>());
+    let padding = area.as_mut_ptr().align_offset(align);
+    let fits = padding + size <= area.len();
+    if !fits {
+        panic!("out of memory");
+    }
+    let (slot_bytes, remain) = area[padding..].split_at_mut(size);
+    let slot_ptr = slot_bytes.as_mut_ptr().cast::<MaybeUninit<T>>();
+    // The pointer comes from a live slice, so `as_mut()` never yields `None` here.
+    let result = unsafe { slot_ptr.as_mut() }.unwrap();
+
+    (result, remain)
+}
+
+/// Carve `len` uninitialized `T`s off the front of `area`, padding for alignment; returns the
+/// array and the unused tail. Panics with "out of memory" if `area` is too small.
+pub fn alloc_array_from_slice<T>(
+    area: &mut [MaybeUninit<u8>],
+    len: usize,
+) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
+    let layout = std::alloc::Layout::new::<T>();
+    // pad to satisfy alignment requirements
+    let padding = area.as_mut_ptr().align_offset(layout.align());
+    // Checked arithmetic: a wrapped size could otherwise pass the bounds check.
+    let nbytes = layout.size().checked_mul(len).expect("out of memory");
+    if padding.checked_add(nbytes).expect("out of memory") > area.len() {
+        panic!("out of memory");
+    }
+    let (result_area, remain) = area[padding..].split_at_mut(nbytes);
+    let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
+    // SAFETY: `result_area` is exclusively borrowed, aligned for `T` and exactly `len` elements
+    // long. Build the slice from the raw pointer so its provenance covers the whole array.
+    let result = unsafe { std::slice::from_raw_parts_mut(result_ptr, len) };
+    (result, remain)
+}
--- a/libs/neonart/src/epoch.rs
+++ b/libs/neonart/src/epoch.rs
@@ -0,0 +1,142 @@
+//! This is similar to crossbeam_epoch crate, but works in shared memory
+
+use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
+
+use crossbeam_utils::CachePadded;
+
+const NUM_SLOTS: usize = 1000; // number of participant slots in EpochShared
+
+/// This is the struct that is stored in shmem
+///
+/// Each participant slot holds an epoch value: bit 0 says whether the slot is pinned or not,
+/// the rest of the bits are the epoch counter.
+pub struct EpochShared {
+    global_epoch: AtomicU64, // starts at 2 and advances in steps of 2, so bit 0 stays clear
+    participants: [CachePadded<AtomicU64>; NUM_SLOTS],
+    // Taken with try_lock() by broadcast(), so that only one caller at a time runs it.
+    broadcast_lock: spin::Mutex<()>,
+}
+
+impl EpochShared {
+    /// Create the shared state with the global epoch and every slot at 2 (unpinned).
+    pub fn new() -> EpochShared {
+        EpochShared {
+            global_epoch: AtomicU64::new(2),
+            participants: [const { CachePadded::new(AtomicU64::new(2)) }; NUM_SLOTS],
+            broadcast_lock: spin::Mutex::new(()),
+        }
+    }
+    /// Create a handle for pinning. No slot is claimed until the handle's `pin()` is called.
+    pub fn register(&self) -> LocalHandle {
+        LocalHandle {
+            global: self,
+            last_slot: AtomicUsize::new(0), // todo: choose more intelligently
+        }
+    }
+    /// Unpin `slot` by storing the current global epoch (bit 0 clear) into it.
+    fn release_pin(&self, slot: usize, _epoch: u64) {
+        let global_epoch = self.global_epoch.load(Ordering::Relaxed);
+        self.participants[slot].store(global_epoch, Ordering::Relaxed);
+    }
+    /// Claim an unpinned slot, trying `slot_hint` first. Returns (slot, the slot's old value).
+    fn pin_internal(&self, slot_hint: usize) -> (usize, u64) {
+        // pick a slot
+        let mut slot = slot_hint;
+        let epoch = loop {
+            let old = self.participants[slot].fetch_or(1, Ordering::Relaxed);
+            if old & 1 == 0 {
+                // Got this slot
+                break old;
+            }
+            // the slot was busy by another thread / process. try a different slot
+            slot += 1;
+            if slot == NUM_SLOTS {
+                slot = 0; // wrap around; this keeps looping until some slot is unpinned
+            }
+            continue;
+        };
+        (slot, epoch)
+    }
+    /// Bump the global epoch by 2 and return the new value.
+    pub(crate) fn advance(&self) -> u64 {
+        // Advance the global epoch
+        let old_epoch = self.global_epoch.fetch_add(2, Ordering::Relaxed);
+        // Anyone that releases their pin after this will update their slot.
+        old_epoch + 2
+    }
+    /// Bump unpinned slots that are exactly one step behind to the current global epoch.
+    pub(crate) fn broadcast(&self) {
+        let Some(_guard) = self.broadcast_lock.try_lock() else {
+            return; // somebody else is broadcasting right now
+        };
+
+        let epoch = self.global_epoch.load(Ordering::Relaxed);
+        let old_epoch = epoch.wrapping_sub(2);
+
+        // Update all free slots.
+        for i in 0..NUM_SLOTS {
+            // TODO: check result, as a sanity check. It should either be the old epoch, or pinned
+            let _ = self.participants[i].compare_exchange(
+                old_epoch,
+                epoch,
+                Ordering::Relaxed,
+                Ordering::Relaxed,
+            );
+        }
+
+        // FIXME: memory fence here, since we used Relaxed?
+    }
+    /// Return the slot value that lies furthest behind the global epoch (or the global epoch).
+    pub(crate) fn get_oldest(&self) -> u64 {
+        // Read all slots. Pinned slots have bit 0 set, so the returned value can be odd.
+        let now = self.global_epoch.load(Ordering::Relaxed);
+        let mut oldest = now;
+        for i in 0..NUM_SLOTS {
+            let this_epoch = self.participants[i].load(Ordering::Relaxed);
+            let delta = now.wrapping_sub(this_epoch);
+            if delta > u64::MAX / 2 {
+                // this is very recent
+            } else if delta > now.wrapping_sub(oldest) {
+                oldest = this_epoch;
+            }
+        }
+        oldest
+    }
+    /// Read the current global epoch.
+    pub(crate) fn get_current(&self) -> u64 {
+        self.global_epoch.load(Ordering::Relaxed)
+    }
+}
+
+pub(crate) struct EpochPin<'e> {
+    slot: usize, // participant slot claimed by this pin; released again on drop
+    pub(crate) epoch: u64, // the slot's value at the moment it was claimed
+    // The handle that created this pin; used on drop to reach the shared epoch state.
+    handle: &'e LocalHandle<'e>,
+}
+
+impl Drop for EpochPin<'_> {
+    fn drop(&mut self) { // unpin: hand the slot back to the shared epoch state
+        EpochShared::release_pin(self.handle.global, self.slot, self.epoch);
+    }
+}
+
+pub struct LocalHandle<'g> {
+    global: &'g EpochShared,
+    // Slot used by the most recent pin(); the next pin() starts its search there.
+    last_slot: AtomicUsize,
+}
+
+impl<'g> LocalHandle<'g> {
+    /// Claim a participant slot and return a guard that releases it again when dropped.
+    pub fn pin(&self) -> EpochPin {
+        let hint = self.last_slot.load(Ordering::Relaxed);
+        let (slot, epoch) = self.global.pin_internal(hint);
+        self.last_slot.store(slot, Ordering::Relaxed);
+        EpochPin {
+            slot,
+            epoch,
+            handle: self,
+        }
+    }
+}
--- a/libs/neonart/src/lib.rs
+++ b/libs/neonart/src/lib.rs
@@ -0,0 +1,583 @@
+//! Adaptive Radix Tree (ART) implementation, with Optimistic Lock Coupling.
+//!
+//! The data structure is described in these two papers:
+//!
+//! [1] Leis, V. & Kemper, Alfons & Neumann, Thomas. (2013).
+//!     The adaptive radix tree: ARTful indexing for main-memory databases.
+//!     Proceedings - International Conference on Data Engineering. 38-49. 10.1109/ICDE.2013.6544812.
+//!     https://db.in.tum.de/~leis/papers/ART.pdf
+//!
+//! [2] Leis, Viktor & Scheibner, Florian & Kemper, Alfons & Neumann, Thomas. (2016).
+//!     The ART of practical synchronization.
+//!     1-8. 10.1145/2933349.2933352.
+//!     https://db.in.tum.de/~leis/papers/artsync.pdf
+//!
+//! [1] describes the base data structure, and [2] describes the Optimistic Lock Coupling that we
+//! use.
+//!
+//! The papers mention a few different variants. We have made the following choices in this
+//! implementation:
+//!
+//! - All keys have the same length
+//!
+//! - Single-value leaves.
+//!
+//! - For collapsing inner nodes, we use the Pessimistic approach, where each inner node stores a
+//!   variable length "prefix", which stores the keys of all the one-way nodes which have been
+//!   removed. However, similar to the "hybrid" approach described in the paper, each node only has
+//!   space for a constant-size prefix of 8 bytes. If a node would have a longer prefix, then we
+//!   create one-way nodes to store them. (There was no particular reason for this choice,
+//!   the "hybrid" approach described in the paper might be better.)
+//!
+//! - For concurrency, we use Optimistic Lock Coupling. The paper [2] also describes another method,
+//!   ROWEX, which generally performs better when there is contention, but that is not important
+//!   for us, and Optimistic Lock Coupling is simpler to implement.
+//!
+//! ## Requirements
+//!
+//! This data structure is currently used for the integrated LFC, relsize and last-written LSN cache
+//! in the compute communicator, part of the 'neon' Postgres extension. We have some unique
+//! requirements, which is why we had to write our own. Namely:
+//!
+//! - The data structure has to live in a fixed-size shared memory segment. That rules out any
+//!   built-in Rust collections and most crates. (Except possibly with the 'allocator_api' rust
+//!   feature, which is still nightly-only and experimental as of this writing).
+//!
+//! - The data structure is accessed from multiple processes. Only one process updates the data
+//!   structure, but other processes perform reads. That rules out using built-in Rust locking
+//!   primitives like Mutex and RwLock, and most crates too.
+//!
+//! - Within the one process with write-access, multiple threads can perform updates concurrently.
+//!   That rules out using PostgreSQL LWLocks for the locking.
+//!
+//! The implementation is generic, and doesn't depend on any PostgreSQL specifics, but it has been
+//! written with that usage and the above constraints in mind. Some noteworthy assumptions:
+//!
+//! - Contention is assumed to be rare. In the integrated cache in PostgreSQL, there's higher level
+//!   locking in the PostgreSQL buffer manager, which ensures that two backends should not try to
+//!   read / write the same page at the same time. (Prefetching can conflict with actual reads,
+//!   however.)
+//!
+//!  - The keys in the integrated cache are 17 bytes long.
+//!
+//! ## Usage
+//!
+//! Because this is designed to be used as a Postgres shared memory data structure, initialization
+//! happens in three stages:
+//!
+//! 0. A fixed area of shared memory is allocated at postmaster startup.
+//!
+//! 1. TreeInitStruct::new() is called to initialize it, still in Postmaster process, before any
+//!    other process or thread is running. It returns a TreeInitStruct, which is inherited by all
+//!    the processes through fork().
+//!
+//! 2. One process may have write-access to the struct, by calling
+//!    [TreeInitStruct::attach_writer]. (That process is the communicator process.)
+//!
+//! 3. Other processes get read-access to the struct, by calling [TreeInitStruct::attach_reader]
+//!
+//! "Write access" means that you can insert / update / delete values in the tree.
+//!
+//! NOTE: The Values stored in the tree are sometimes moved, when a leaf node fills up and a new
+//! larger node needs to be allocated. The versioning and epoch-based allocator ensure that the data
+//! structure stays consistent, but if the Value has interior mutability, like atomic fields,
+//! updates to such fields might be lost if the leaf node is concurrently moved! If that becomes a
+//! problem, the version check could be passed up to the caller, so that the caller could detect the
+//! lost updates and retry the operation.
+//!
+//! ## Implementation
+//!
+//! node_ptr.rs: Provides low-level implementations of the four different node types (eight actually,
+//! since there is an Internal and Leaf variant of each)
+//!
+//! lock_and_version.rs: Provides an abstraction for the combined lock and version counter on each
+//! node.
+//!
+//! node_ref.rs: The code in node_ptr.rs deals with raw pointers. node_ref.rs provides more type-safe
+//!   abstractions on top.
+//!
+//! algorithm.rs: Contains the functions to implement lookups and updates in the tree
+//!
+//! allocator.rs: Provides a facility to allocate memory for the tree nodes. (We must provide our
+//!   own abstraction for that because we need the data structure to live in a pre-allocated shared
+//!   memory segment).
+//!
+//! epoch.rs: The data structure requires that when a node is removed from the tree, it is not
+//!   immediately deallocated, but stays around for as long as concurrent readers might still have
+//!   pointers to them. This is enforced by an epoch system. This is similar to
+//!   e.g. crossbeam_epoch, but we couldn't use that either because it has to work across processes
+//!   communicating over the shared memory segment.
+//!
+//! ## See also
+//!
+//! There are some existing Rust ART implementations out there, but none of them filled all
+//! the requirements:
+//!
+//! - https://github.com/XiangpengHao/congee
+//! - https://github.com/declanvk/blart
+//!
+//! ## TODO
+//!
+//! - Removing values has not been implemented
+
+mod algorithm;
+pub mod allocator;
+mod epoch;
+
+use algorithm::RootPtr;
+use algorithm::node_ptr::NodePtr;
+
+use std::collections::VecDeque;
+use std::fmt::Debug;
+use std::marker::PhantomData;
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicBool, Ordering};
+
+use crate::epoch::EpochPin;
+
+#[cfg(test)]
+mod tests;
+
+use allocator::ArtAllocator;
+pub use allocator::ArtMultiSlabAllocator;
+pub use allocator::OutOfMemoryError;
+
+/// Fixed-length key type.
+///
+pub trait Key: Debug {
+    const KEY_LEN: usize; // presumably the length in bytes of every key — confirm in algorithm.rs
+    // The key as raw bytes. NOTE(review): presumably always KEY_LEN long; not enforced here.
+    fn as_bytes(&self) -> &[u8];
+}
+
+/// Values stored in the tree
+///
+/// This is a marker trait: it has no methods and no supertraits. In particular, values do not
+/// need to be Clone (an earlier version of this comment said they did).
+// When a node "grows", the old one sticks around until all readers that might see it are gone.
+pub trait Value {}
+
+const MAX_GARBAGE: usize = 1024; // initial capacity reserved by GarbageQueue::new()
+
+/// The root of the tree, plus other tree-wide data. This is stored in the shared memory.
+pub struct Tree<V: Value> {
+    /// For simplicity, so that we never need to grow or shrink the root, the root node is always an
+    /// Internal256 node. Also, it never has a prefix (that's actually a bit wasteful, incurring one
+    /// indirection to every lookup)
+    root: RootPtr<V>,
+    // Set by attach_writer(), which panics if a writer had been attached already.
+    writer_attached: AtomicBool,
+    // Epoch state that read and write guards pin while they are alive.
+    epoch: epoch::EpochShared,
+}
+// NOTE(review): no SAFETY justification is given for these impls — confirm they are sound.
+unsafe impl<V: Value + Sync> Sync for Tree<V> {}
+unsafe impl<V: Value + Send> Send for Tree<V> {}
+
+struct GarbageQueue<V>(VecDeque<(NodePtr<V>, u64)>);
+// Entries are (obsolete node, epoch) pairs: pushed at the front, popped from the back.
+unsafe impl<V: Value + Sync> Sync for GarbageQueue<V> {}
+unsafe impl<V: Value + Send> Send for GarbageQueue<V> {}
+
+impl<V> GarbageQueue<V> {
+    /// Create an empty queue with room for `MAX_GARBAGE` entries reserved up front.
+    fn new() -> GarbageQueue<V> {
+        Self(VecDeque::with_capacity(MAX_GARBAGE))
+    }
+
+    /// Add `ptr`, tagged with `epoch`, at the front of the queue.
+    fn remember_obsolete_node(&mut self, ptr: NodePtr<V>, epoch: u64) {
+        self.0.push_front((ptr, epoch));
+    }
+
+    /// Pop and return the node at the back of the queue if its epoch is strictly below
+    /// `cutoff_epoch`; otherwise (or when the queue is empty) return `None`.
+    fn next_obsolete(&mut self, cutoff_epoch: u64) -> Option<NodePtr<V>> {
+        let &(_, epoch) = self.0.back()?;
+        (epoch < cutoff_epoch).then(|| self.0.pop_back().unwrap().0)
+    }
+}
+
+/// Struct created at postmaster startup
+pub struct TreeInitStruct<'t, K: Key, V: Value, A: ArtAllocator<V>> {
+    tree: &'t Tree<V>, // the tree header that new() allocated and initialized
+    // The allocator the tree was created with; attach_writer() hands it on to the writer.
+    allocator: &'t A,
+    // K is only a type-level marker here.
+    phantom_key: PhantomData<K>,
+}
+
+/// The worker process has a reference to this. The write operations are only safe
+/// from the worker process
+pub struct TreeWriteAccess<'t, K: Key, V: Value, A: ArtAllocator<V>>
+where
+    K: Key,
+    V: Value,
+{
+    tree: &'t Tree<V>,
+    // The allocator the tree was created with; public, so callers can reach it directly.
+    pub allocator: &'t A,
+    // Used by start_write() / start_read() to pin the epoch for the lifetime of a guard.
+    epoch_handle: epoch::LocalHandle<'t>,
+    // K is only a type-level marker here.
+    phantom_key: PhantomData<K>,
+
+    /// Obsolete nodes that cannot be recycled until their epoch expires.
+    garbage: spin::Mutex<GarbageQueue<V>>,
+}
+
+/// The backends have a reference to this. It cannot be used to modify the tree
+pub struct TreeReadAccess<'t, K: Key, V: Value>
+where
+    K: Key,
+    V: Value,
+{
+    tree: &'t Tree<V>,
+    // Used by start_read() to pin the epoch for the lifetime of a read guard.
+    epoch_handle: epoch::LocalHandle<'t>,
+    // K is only a type-level marker here.
+    phantom_key: PhantomData<K>,
+}
+
+impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeInitStruct<'t, K, V, A> {
+    /// Allocate and initialize an empty tree in `allocator`. Panics with "out of memory" if the
+    /// tree header or the root node cannot be allocated.
+    pub fn new(allocator: &'t A) -> TreeInitStruct<'t, K, V, A> {
+        let tree_ptr = allocator.alloc_tree();
+        let tree_ptr = NonNull::new(tree_ptr).expect("out of memory");
+        let init = Tree {
+            root: algorithm::new_root(allocator).expect("out of memory"),
+            writer_attached: AtomicBool::new(false),
+            epoch: epoch::EpochShared::new(),
+        };
+        unsafe { tree_ptr.write(init) }; // initialize the header before a reference is made
+        TreeInitStruct {
+            tree: unsafe { tree_ptr.as_ref() },
+            allocator,
+            phantom_key: PhantomData,
+        }
+    }
+    /// Consume the init struct and get write access. Panics if a writer is attached already.
+    pub fn attach_writer(self) -> TreeWriteAccess<'t, K, V, A> {
+        let previously_attached = self.tree.writer_attached.swap(true, Ordering::Relaxed);
+        if previously_attached {
+            panic!("writer already attached");
+        }
+        TreeWriteAccess {
+            tree: self.tree,
+            allocator: self.allocator,
+            phantom_key: PhantomData,
+            epoch_handle: self.tree.epoch.register(),
+            garbage: spin::Mutex::new(GarbageQueue::new()),
+        }
+    }
+    /// Consume the init struct and get read-only access to the tree.
+    pub fn attach_reader(self) -> TreeReadAccess<'t, K, V> {
+        TreeReadAccess {
+            tree: self.tree,
+            phantom_key: PhantomData,
+            epoch_handle: self.tree.epoch.register(),
+        }
+    }
+}
+
+impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteAccess<'t, K, V, A> {
+    /// Pin the current epoch and return a guard for one modifying operation.
+    pub fn start_write<'g>(&'t self) -> TreeWriteGuard<'g, K, V, A>
+    where
+        't: 'g,
+    {
+        let epoch_pin = self.epoch_handle.pin();
+        TreeWriteGuard {
+            tree_writer: self,
+            epoch_pin,
+            phantom_key: PhantomData,
+            created_garbage: false,
+        }
+    }
+
+    /// Pin the current epoch and return a guard for reading the tree.
+    pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
+        let epoch_pin = self.epoch_handle.pin();
+        TreeReadGuard {
+            tree: self.tree,
+            epoch_pin,
+            phantom_key: PhantomData,
+        }
+    }
+}
+
+impl<'t, K: Key, V: Value> TreeReadAccess<'t, K, V> {
+    /// Pin the current epoch and return a guard for reading the tree.
+    pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
+        let epoch_pin = self.epoch_handle.pin();
+        TreeReadGuard {
+            tree: self.tree,
+            epoch_pin,
+            phantom_key: PhantomData,
+        }
+    }
+}
+
+/// Guard for reading the tree, returned by `start_read`. It holds an epoch pin
+/// for as long as it exists.
+pub struct TreeReadGuard<'e, K, V>
+where
+    K: Key,
+    V: Value,
+{
+    /// The tree being read.
+    tree: &'e Tree<V>,
+
+    /// Epoch pin taken when the guard was created; passed to the tree algorithms.
+    epoch_pin: EpochPin<'e>,
+    /// Ties the guard to the key type `K` without storing a key.
+    phantom_key: PhantomData<K>,
+}
+
+impl<'e, K: Key, V: Value> TreeReadGuard<'e, K, V> {
+    /// Look up `key`, returning a reference to its value if it is in the tree.
+    pub fn get(&'e self, key: &K) -> Option<&'e V> {
+        let root = self.tree.root;
+        algorithm::search(key, root, &self.epoch_pin)
+    }
+}
+
+/// Guard for one modifying operation on the tree, returned by
+/// `TreeWriteAccess::start_write`. The modifying methods consume the guard.
+pub struct TreeWriteGuard<'e, K, V, A>
+where
+    K: Key,
+    V: Value,
+    A: ArtAllocator<V>,
+{
+    /// The write handle this guard was created from.
+    tree_writer: &'e TreeWriteAccess<'e, K, V, A>,
+
+    /// Epoch pin taken when the guard was created.
+    epoch_pin: EpochPin<'e>,
+    /// Ties the guard to the key type `K` without storing a key.
+    phantom_key: PhantomData<K>,
+
+    /// Set when the operation queued an obsolete node; triggers a garbage
+    /// collection pass at the end of `update_with_fn`.
+    created_garbage: bool,
+}
+
+/// The change requested by the closure passed to `TreeWriteGuard::update_with_fn`.
+pub enum UpdateAction<V> {
+    /// Request no change for the key.
+    Nothing,
+    /// Store the given value for the key.
+    Insert(V),
+    /// Remove the key's value.
+    Remove,
+}
+
+impl<'e, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
+    /// Get a value
+    pub fn get(&'e mut self, key: &K) -> Option<&'e V> {
+        algorithm::search(key, self.tree_writer.tree.root, &self.epoch_pin)
+    }
+
+    /// Insert a value
+    ///
+    /// Returns `Ok(true)` if the insertion was requested, or `Ok(false)` if the key
+    /// already had a value, in which case no change is requested.
+    pub fn insert(self, key: &K, value: V) -> Result<bool, OutOfMemoryError> {
+        let mut success = None;
+
+        self.update_with_fn(key, |existing| {
+            if existing.is_some() {
+                success = Some(false);
+                UpdateAction::Nothing
+            } else {
+                success = Some(true);
+                UpdateAction::Insert(value)
+            }
+        })?;
+        Ok(success.expect("value_fn not called"))
+    }
+
+    /// Remove value. Returns true if it existed
+    pub fn remove(self, key: &K) -> bool {
+        let mut result = false;
+        // FIXME: It's not clear if OOM is expected while removing. It seems
+        // not nice, but shrinking a node can OOM. Then again, we could opt
+        // to not shrink a node if we cannot allocate, to live a little longer.
+        self.update_with_fn(key, |existing| match existing {
+            Some(_) => {
+                result = true;
+                UpdateAction::Remove
+            }
+            None => UpdateAction::Nothing,
+        })
+        .expect("out of memory while removing");
+        result
+    }
+
+    /// Try to remove value and return the old value.
+    ///
+    /// Returns `None`, and requests no change to the tree, if the key had no value.
+    pub fn remove_and_return(self, key: &K) -> Option<V>
+    where
+        V: Clone,
+    {
+        let mut old = None;
+        self.update_with_fn(key, |existing| match existing {
+            Some(value) => {
+                old = Some(value.clone());
+                UpdateAction::Remove
+            }
+            // Like remove(), never request removal of a key that does not exist.
+            None => UpdateAction::Nothing,
+        })
+        .expect("out of memory while removing");
+        old
+    }
+
+    /// Update key using the given function. All the other modifying operations are based on this.
+    ///
+    /// The function is passed a reference to the existing value, if any, and returns an
+    /// [`UpdateAction`] that describes the requested change: `Nothing`, `Insert` with the
+    /// value to store, or `Remove`.
+    ///
+    /// If the update queued any obsolete nodes, a garbage collection pass is run before
+    /// returning.
+    pub fn update_with_fn<F>(mut self, key: &K, value_fn: F) -> Result<(), OutOfMemoryError>
+    where
+        F: FnOnce(Option<&V>) -> UpdateAction<V>,
+    {
+        algorithm::update_fn(key, value_fn, self.tree_writer.tree.root, &mut self)?;
+
+        if self.created_garbage {
+            // The number of recycled nodes is not interesting here.
+            let _ = self.collect_garbage();
+        }
+        Ok(())
+    }
+
+    /// Queue a node that is no longer in use, tagged with this guard's epoch, and
+    /// remember that this guard created garbage.
+    fn remember_obsolete_node(&mut self, ptr: NodePtr<V>) {
+        self.tree_writer
+            .garbage
+            .lock()
+            .remember_obsolete_node(ptr, self.epoch_pin.epoch);
+        self.created_garbage = true;
+    }
+
+    // returns number of nodes recycled
+    fn collect_garbage(&self) -> usize {
+        self.tree_writer.tree.epoch.advance();
+        self.tree_writer.tree.epoch.broadcast();
+
+        let cutoff_epoch = self.tree_writer.tree.epoch.get_oldest();
+
+        // Deallocate every node that the queue hands out for this cutoff.
+        let mut result = 0;
+        let mut garbage_queue = self.tree_writer.garbage.lock();
+        while let Some(ptr) = garbage_queue.next_obsolete(cutoff_epoch) {
+            ptr.deallocate(self.tree_writer.allocator);
+            result += 1;
+        }
+        result
+    }
+}
+
+/// Cursor for iterating over the keys of a tree in key order. It stores only key
+/// bytes; a read guard is passed to every `next` call.
+pub struct TreeIterator<K>
+where
+    K: Key + for<'a> From<&'a [u8]>,
+{
+    /// Set once a bounded iteration has reached its end; `next` then returns None.
+    done: bool,
+    /// Key that the next call to `next` starts searching from.
+    pub next_key: Vec<u8>,
+    /// Exclusive upper bound, or None for an iterator created with `new_wrapping`.
+    max_key: Option<Vec<u8>>,
+
+    /// Ties the iterator to the key type `K` without storing a key.
+    phantom_key: PhantomData<K>,
+}
+
+impl<K> TreeIterator<K>
+where
+    K: Key + for<'a> From<&'a [u8]>,
+{
+    /// Create an unbounded iterator that starts at the all-zeros key and wraps
+    /// around to the beginning when it runs out of keys.
+    pub fn new_wrapping() -> TreeIterator<K> {
+        TreeIterator {
+            done: false,
+            next_key: vec![0; K::KEY_LEN],
+            max_key: None,
+            phantom_key: PhantomData,
+        }
+    }
+
+    /// Create an iterator over `range`. The end of the range is exclusive.
+    pub fn new(range: &std::ops::Range<K>) -> TreeIterator<K> {
+        let start = range.start.as_bytes().to_vec();
+        let end = range.end.as_bytes().to_vec();
+        assert_eq!(start.len(), K::KEY_LEN);
+        assert_eq!(end.len(), K::KEY_LEN);
+
+        TreeIterator {
+            done: false,
+            next_key: start,
+            max_key: Some(end),
+            phantom_key: PhantomData,
+        }
+    }
+
+    /// Return the next key-value pair at or after the current position, and
+    /// advance the position past it.
+    ///
+    /// A bounded iterator returns None once it reaches its end and stays
+    /// finished. A wrapping iterator restarts from the all-zeros key when it
+    /// runs out of keys, and returns None only if that finds nothing either.
+    pub fn next<'g, V>(&mut self, read_guard: &'g TreeReadGuard<'g, K, V>) -> Option<(K, &'g V)>
+    where
+        V: Value,
+    {
+        if self.done {
+            return None;
+        }
+
+        let mut wrapped_around = false;
+        loop {
+            assert_eq!(self.next_key.len(), K::KEY_LEN);
+            let found =
+                algorithm::iter_next(&self.next_key, read_guard.tree.root, &read_guard.epoch_pin);
+
+            let (k, v) = match found {
+                Some(entry) => entry,
+                None if self.max_key.is_some() => {
+                    // Bounded iteration ran out of keys
+                    self.done = true;
+                    return None;
+                }
+                None if wrapped_around => {
+                    // The tree is completely empty
+                    // FIXME: perhaps we should remember the starting point instead.
+                    // Currently this will scan some ranges twice.
+                    return None;
+                }
+                None => {
+                    // Start from beginning
+                    self.next_key.fill(0);
+                    wrapped_around = true;
+                    continue;
+                }
+            };
+            assert_eq!(k.len(), K::KEY_LEN);
+            assert_eq!(self.next_key.len(), K::KEY_LEN);
+
+            // Check if we reached the end of the range
+            let past_end = self
+                .max_key
+                .as_ref()
+                .is_some_and(|max_key| k.as_slice() >= max_key.as_slice());
+            if past_end {
+                self.done = true;
+                return None;
+            }
+
+            // Resume from the key right after the one being returned
+            let key: K = k.as_slice().into();
+            self.next_key = k.clone();
+            increment_key(self.next_key.as_mut_slice());
+
+            return Some((key, v));
+        }
+    }
+}
+
+/// Add one to `key`, treating it as a big-endian integer, in place.
+///
+/// Returns true if the addition overflowed, in which case every byte of `key`
+/// has wrapped around to zero.
+fn increment_key(key: &mut [u8]) -> bool {
+    for byte in key.iter_mut().rev() {
+        *byte = byte.wrapping_add(1);
+        if *byte != 0 {
+            // No carry into the next more significant byte
+            return false;
+        }
+    }
+    true
+}
+
+// Debugging functions
+impl<'e, K: Key, V: Value + Debug, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
+    /// Write a dump of the tree to `dst`, for debugging.
+    pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
+        algorithm::dump_tree(self.tree_writer.tree.root, &self.epoch_pin, dst)
+    }
+}
+impl<'e, K: Key, V: Value + Debug> TreeReadGuard<'e, K, V> {
+    /// Write a dump of the tree to `dst`, for debugging.
+    pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
+        algorithm::dump_tree(self.tree.root, &self.epoch_pin, dst)
+    }
+}
+impl<'e, K: Key, V: Value> TreeWriteAccess<'e, K, V, ArtMultiSlabAllocator<'e, V>> {
+    /// Gather statistics about the allocator, the epochs and the garbage queue.
+    ///
+    /// The values are read one after another, so they are not a consistent
+    /// snapshot if the tree is being modified at the same time.
+    pub fn get_statistics(&self) -> ArtTreeStatistics {
+        ArtTreeStatistics {
+            blocks: self.allocator.inner.block_allocator.get_statistics(),
+            slabs: self.allocator.get_statistics(),
+            epoch: self.tree.epoch.get_current(),
+            oldest_epoch: self.tree.epoch.get_oldest(),
+            num_garbage: self.garbage.lock().0.len() as u64,
+        }
+    }
+}
+
+/// Statistics returned by `TreeWriteAccess::get_statistics`.
+#[derive(Clone, Debug)]
+pub struct ArtTreeStatistics {
+    /// Statistics reported by the block allocator.
+    pub blocks: allocator::block::BlockAllocatorStats,
+    /// Statistics reported by the multi-slab allocator.
+    pub slabs: allocator::ArtMultiSlabStats,
+
+    /// Current epoch of the tree.
+    pub epoch: u64,
+    /// Oldest epoch reported by the tree's epoch tracker.
+    pub oldest_epoch: u64,
+    /// Number of entries in the writer's garbage queue.
+    pub num_garbage: u64,
+}
--- a/libs/neonart/src/tests.rs
+++ b/libs/neonart/src/tests.rs
@@ -0,0 +1,236 @@
+use std::collections::BTreeMap;
+use std::collections::HashSet;
+use std::fmt::{Debug, Formatter};
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use crate::ArtAllocator;
+use crate::ArtMultiSlabAllocator;
+use crate::TreeInitStruct;
+use crate::TreeIterator;
+use crate::TreeWriteAccess;
+use crate::UpdateAction;
+
+use crate::{Key, Value};
+
+use rand::Rng;
+use rand::seq::SliceRandom;
+use rand_distr::Zipf;
+
+// Length of the test keys in bytes; matches the size of a u128.
+const TEST_KEY_LEN: usize = 16;
+
+/// Fixed-length key used by the tests. The conversions below store a u128 in
+/// big-endian byte order.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+struct TestKey([u8; TEST_KEY_LEN]);
+
+impl TestKey {
+    // All-zeros key
+    const MIN: TestKey = TestKey([0; TEST_KEY_LEN]);
+    // All-0xff key
+    const MAX: TestKey = TestKey([u8::MAX; TEST_KEY_LEN]);
+}
+
+impl Key for TestKey {
+    const KEY_LEN: usize = TEST_KEY_LEN;
+    fn as_bytes(&self) -> &[u8] {
+        &self.0
+    }
+}
+
+impl From<&TestKey> for u128 {
+    fn from(val: &TestKey) -> u128 {
+        u128::from_be_bytes(val.0)
+    }
+}
+
+impl From<u128> for TestKey {
+    fn from(val: u128) -> TestKey {
+        TestKey(val.to_be_bytes())
+    }
+}
+
+impl<'a> From<&'a [u8]> for TestKey {
+    // Panics if `bytes` is not exactly TEST_KEY_LEN bytes long.
+    fn from(bytes: &'a [u8]) -> TestKey {
+        TestKey(bytes.try_into().unwrap())
+    }
+}
+
+// Lets the insert tests store plain integers as values.
+impl Value for usize {}
+
+/// Insert `keys` into a fresh tree, using each key's position in the slice as
+/// its value, then read every key back and check the value.
+fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
+    const MEM_SIZE: usize = 10000000;
+    let mut area = Box::new_uninit_slice(MEM_SIZE);
+
+    let allocator = ArtMultiSlabAllocator::new(&mut area);
+
+    let tree_writer = TreeInitStruct::<TestKey, usize, _>::new(allocator).attach_writer();
+
+    // Populate the tree
+    for (idx, k) in keys.iter().copied().enumerate() {
+        let key: TestKey = k.into();
+        let w = tree_writer.start_write();
+        let res = w.insert(&key, idx);
+        assert!(res.is_ok());
+    }
+
+    // Every key must now map to its position
+    for (idx, k) in keys.iter().copied().enumerate() {
+        let key: TestKey = k.into();
+        let r = tree_writer.start_read();
+        let value = r.get(&key);
+        assert_eq!(value, Some(&idx));
+    }
+
+    eprintln!("stats: {:?}", tree_writer.get_statistics());
+}
+
+#[test]
+fn dense() {
+    // This exercises splitting a node with prefix
+    let prefix_split_keys: [u128; 5] = [0, 1, 2, 3, 256];
+    test_inserts(&prefix_split_keys[..]);
+
+    // Dense keys
+    let mut keys = (0u128..10000).collect::<Vec<_>>();
+    test_inserts(&keys);
+
+    // Do the same in random orders, nine times
+    let mut rng = rand::rng();
+    for _round in 0..9 {
+        keys.shuffle(&mut rng);
+        test_inserts(&keys);
+    }
+}
+
+#[test]
+fn sparse() {
+    // sparse keys: 10000 distinct random u128 values
+    const NUM_KEYS: usize = 10000;
+
+    let mut used_keys = HashSet::new();
+    let mut keys: Vec<TestKey> = Vec::new();
+    while keys.len() < NUM_KEYS {
+        let key = rand::random::<u128>();
+        // insert() returns false for a value already in the set; draw again then
+        if used_keys.insert(key) {
+            keys.push(key.into());
+        }
+    }
+    test_inserts(&keys);
+}
+
+/// Test value wrapping an atomic integer, so that it can be updated in place
+/// through a shared reference.
+struct TestValue(AtomicUsize);
+
+impl TestValue {
+    fn new(val: usize) -> TestValue {
+        TestValue(AtomicUsize::new(val))
+    }
+
+    // Read the current value
+    fn load(&self) -> usize {
+        self.0.load(Ordering::Relaxed)
+    }
+}
+
+impl Value for TestValue {}
+
+// The clone is a new, independent atomic holding the value read at clone time.
+impl Clone for TestValue {
+    fn clone(&self) -> TestValue {
+        TestValue::new(self.load())
+    }
+}
+
+// Formats as the plain integer currently stored.
+impl Debug for TestValue {
+    fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
+        write!(fmt, "{:?}", self.load())
+    }
+}
+
+/// One operation of the randomized test: set the key to the given value
+/// (`Some`), or remove the key (`None`).
+#[derive(Clone, Debug)]
+struct TestOp(TestKey, Option<usize>);
+
+/// Apply `op` to both the shadow map and the ART tree, checking that the value
+/// the tree had for the key matches what the shadow map had.
+fn apply_op<A: ArtAllocator<TestValue>>(
+    op: &TestOp,
+    tree: &TreeWriteAccess<TestKey, TestValue, A>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    eprintln!("applying op: {op:?}");
+
+    let key = op.0;
+    let requested = op.1;
+
+    // apply the change to the shadow tree first
+    let shadow_existing = match requested {
+        Some(v) => shadow.insert(key, v),
+        None => shadow.remove(&key),
+    };
+
+    // apply to Art tree
+    let w = tree.start_write();
+    w.update_with_fn(&key, |existing| {
+        assert_eq!(existing.map(TestValue::load), shadow_existing);
+
+        match (existing, requested) {
+            (Some(old_val), Some(new_val)) => {
+                // Existing values are updated in place
+                old_val.0.store(new_val, Ordering::Relaxed);
+                UpdateAction::Nothing
+            }
+            (Some(_old_val), None) => UpdateAction::Remove,
+            (None, Some(new_val)) => UpdateAction::Insert(TestValue::new(new_val)),
+            (None, None) => UpdateAction::Nothing,
+        }
+    })
+    .expect("out of memory");
+}
+
+/// Iterate over the tree and the shadow map in lockstep and panic, after
+/// dumping both to stderr, on the first item that differs.
+fn test_iter<A: ArtAllocator<TestValue>>(
+    tree: &TreeWriteAccess<TestKey, TestValue, A>,
+    shadow: &BTreeMap<TestKey, usize>,
+) {
+    let mut shadow_iter = shadow.iter();
+    // NOTE: the end of the range is exclusive, so a key equal to TestKey::MAX
+    // would be returned by the shadow iterator but not by the tree iterator.
+    let mut iter = TreeIterator::new(&(TestKey::MIN..TestKey::MAX));
+
+    loop {
+        let shadow_item = shadow_iter.next().map(|(k, v)| (*k, *v));
+        // A new read guard is started for every item
+        let r = tree.start_read();
+        let item = iter.next(&r);
+
+        if shadow_item != item.map(|(k, v)| (k, v.load())) {
+            eprintln!("FAIL: iterator returned {item:?}, expected {shadow_item:?}");
+            tree.start_read().dump(&mut std::io::stderr());
+
+            eprintln!("SHADOW:");
+            for si in shadow {
+                eprintln!("key: {:?}, val: {}", si.0, si.1);
+            }
+            panic!("FAIL: iterator returned {item:?}, expected {shadow_item:?}");
+        }
+        // Both iterators were exhausted at the same time
+        if item.is_none() {
+            break;
+        }
+    }
+}
+
+#[test]
+fn random_ops() {
+    const MEM_SIZE: usize = 10000000;
+    let mut area = Box::new_uninit_slice(MEM_SIZE);
+
+    let allocator = ArtMultiSlabAllocator::new(&mut area);
+
+    let tree_writer = TreeInitStruct::<TestKey, TestValue, _>::new(allocator).attach_writer();
+
+    // Reference implementation that every operation is mirrored to
+    let mut shadow = BTreeMap::<TestKey, usize>::new();
+
+    let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
+    let mut rng = rand::rng();
+    for i in 0..100000 {
+        let sampled = rng.sample(distribution) as u128;
+
+        // For 10% of the operations, set the low 32 bits of the key
+        let raw_key = if rng.random_bool(0.10) {
+            sampled | 0xffffffff
+        } else {
+            sampled
+        };
+        let key = TestKey::from(raw_key);
+
+        // 75% of the operations set a value, the rest remove the key
+        let new_value = if rng.random_bool(0.75) { Some(i) } else { None };
+
+        apply_op(&TestOp(key, new_value), &tree_writer, &mut shadow);
+
+        if i % 1000 == 0 {
+            eprintln!("{i} ops processed");
+            eprintln!("stats: {:?}", tree_writer.get_statistics());
+            test_iter(&tree_writer, &shadow);
+        }
+    }
+}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -54,6 +54,7 @@ pageserver_api.workspace = true
 pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
 pageserver_compaction.workspace = true
 pageserver_page_api.workspace = true
+peekable.workspace = true
 pem.workspace = true
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
@@ -66,6 +67,7 @@ postgres-types.workspace = true
 posthog_client_lite.workspace = true
 pprof.workspace = true
 pq_proto.workspace = true
+prost.workspace = true
 rand.workspace = true
 range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
--- a/pageserver/client_grpc/src/lib.rs
+++ b/pageserver/client_grpc/src/lib.rs
@@ -4,3 +4,4 @@ mod retry;
 mod split;

 pub use client::{PageserverClient, ShardSpec};
+pub use pageserver_api::shard::ShardStripeSize; // used in ShardSpec
--- a/pageserver/page_api/src/model.rs
+++ b/pageserver/page_api/src/model.rs
@@ -33,6 +33,8 @@ pub enum ProtocolError {
    Invalid(&'static str, String),
    #[error("required field '{0}' is missing")]
    Missing(&'static str),
+    /// `not_modified_since_lsn` (first field) is greater than `request_lsn` (second field).
+    #[error("invalid combination of not_modified_since_lsn '{0}' and request_lsn '{1}'")]
+    InvalidLsns(Lsn, Lsn),
 }

 impl ProtocolError {
@@ -85,9 +87,9 @@ impl TryFrom<proto::ReadLsn> for ReadLsn {
            return Err(ProtocolError::invalid("request_lsn", pb.request_lsn));
        }
        if pb.not_modified_since_lsn > pb.request_lsn {
-            return Err(ProtocolError::invalid(
-                "not_modified_since_lsn",
-                pb.not_modified_since_lsn,
+            return Err(ProtocolError::InvalidLsns(
+                Lsn(pb.not_modified_since_lsn),
+                Lsn(pb.request_lsn),
            ));
        }
        Ok(Self {
--- a/pageserver/pagebench/Cargo.toml
+++ b/pageserver/pagebench/Cargo.toml
@@ -24,6 +24,9 @@ tracing.workspace = true
 tokio.workspace = true
 tokio-stream.workspace = true
 tokio-util.workspace = true
+axum.workspace = true
+http.workspace = true
+metrics.workspace = true
 tonic.workspace = true
 url.workspace = true

--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -34,6 +34,10 @@ use crate::util::{request_stats, tokio_thread_local_stats};
 /// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
 #[derive(clap::Parser)]
 pub(crate) struct Args {
+    #[clap(long, default_value = "false")]
+    grpc: bool,
+    #[clap(long, default_value = "false")]
+    grpc_stream: bool,
    #[clap(long, default_value = "http://localhost:9898")]
    mgmt_api_endpoint: String,
    /// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
@@ -78,6 +82,9 @@ pub(crate) struct Args {
    #[clap(long)]
    set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,

+    #[clap(long)]
+    only_relnode: Option<u32>,
+
    /// Queue depth generated in each client.
    #[clap(long, default_value = "1")]
    queue_depth: NonZeroUsize,
@@ -92,10 +99,31 @@ pub(crate) struct Args {
    #[clap(long, default_value = "1")]
    batch_size: NonZeroUsize,

-    #[clap(long)]
-    only_relnode: Option<u32>,
-
    targets: Option<Vec<TenantTimelineId>>,
+
+    #[clap(long, default_value = "100")]
+    pool_max_consumers: NonZeroUsize,
+
+    #[clap(long, default_value = "5")]
+    pool_error_threshold: NonZeroUsize,
+
+    #[clap(long, default_value = "5000")]
+    pool_connect_timeout: NonZeroUsize,
+
+    #[clap(long, default_value = "1000")]
+    pool_connect_backoff: NonZeroUsize,
+
+    #[clap(long, default_value = "60000")]
+    pool_max_idle_duration: NonZeroUsize,
+
+    #[clap(long, default_value = "0")]
+    max_delay_ms: usize,
+
+    #[clap(long, default_value = "0")]
+    percent_drops: usize,
+
+    #[clap(long, default_value = "0")]
+    percent_hangs: usize,
 }

 /// State shared by all clients
@@ -152,7 +180,6 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
        main_impl(args, thread_local_stats)
    })
 }
-
 async fn main_impl(
    args: Args,
    all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
@@ -317,6 +344,7 @@ async fn main_impl(
    let rps_period = args
        .per_client_rate
        .map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
+
    let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
        let ss = shared_state.clone();
        let cancel = cancel.clone();
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -3218,6 +3218,7 @@ where
 pub struct GrpcPageServiceHandler {
    tenant_manager: Arc<TenantManager>,
    ctx: RequestContext,
+    cancel: CancellationToken,
    gate_guard: GateGuard,
    get_vectored_concurrent_io: GetVectoredConcurrentIo,
 }
@@ -3270,6 +3271,7 @@ impl GrpcPageServiceHandler {
        let page_service_handler = GrpcPageServiceHandler {
            tenant_manager,
            ctx,
+            cancel: cancel.clone(),
            gate_guard: gate.enter().expect("gate was just created"),
            get_vectored_concurrent_io,
        };
@@ -3406,6 +3408,8 @@ impl GrpcPageServiceHandler {
    /// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
    /// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
    /// split them up in the client or server.
+    ///
+    /// TODO: verify that the given keys belong to this shard.
    #[instrument(skip_all, fields(req_id, rel, blkno, blks, req_lsn, mod_lsn))]
    async fn get_page(
        ctx: &RequestContext,
@@ -3727,6 +3731,7 @@ impl proto::PageService for GrpcPageServiceHandler {
        // Spawn a task to handle the GetPageRequest stream.
        let span = Span::current();
        let ctx = self.ctx.attached_child();
+        let cancel = self.cancel.clone();
        let mut reqs = req.into_inner();

        let resps = async_stream::try_stream! {
@@ -3734,8 +3739,18 @@ impl proto::PageService for GrpcPageServiceHandler {
                .get(ttid.tenant_id, ttid.timeline_id, shard_selector)
                .await?
                .downgrade();
-            while let Some(req) = reqs.message().await? {
+
+            loop {
+                let req = tokio::select! {
+                    req = reqs.message() => req,
+                    _ = cancel.cancelled() => {
+                        tracing::info!("closing getpages stream due to shutdown");
+                        break;
+                    },
+                };
+                let Some(req) = req? else { break };
                let req_id = req.request_id.map(page_api::RequestID::from).unwrap_or_default();
+
                let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
                    .instrument(span.clone()) // propagate request span
                    .await;
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -5,6 +5,7 @@ MODULE_big = neon
 OBJS = \
 	$(WIN32RES) \
 	communicator.o \
+	communicator_new.o \
 	extension_server.o \
 	file_cache.o \
 	hll.o \
@@ -29,6 +30,11 @@ PG_CPPFLAGS = -I$(libpq_srcdir)
 SHLIB_LINK_INTERNAL = $(libpq)
 SHLIB_LINK = -lcurl

+UNAME_S := $(shell uname -s)
+ifeq ($(UNAME_S), Darwin)
+    SHLIB_LINK += -framework Security -framework CoreFoundation -framework SystemConfiguration
+endif
+
 EXTENSION = neon
 DATA = \
 	neon--1.0.sql \
@@ -57,7 +63,7 @@ WALPROP_OBJS = \

 # libcommunicator.a is built by cargo from the Rust sources under communicator/
 # subdirectory. `cargo build` also generates communicator_bindings.h.
-neon.o: communicator/communicator_bindings.h
+communicator_new.o: communicator/communicator_bindings.h

 $(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h &:
 	(cd $(srcdir)/communicator && cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE))
--- a/pgxn/neon/communicator/Cargo.lock
+++ b/pgxn/neon/communicator/Cargo.lock
@@ -0,0 +1,372 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "addr2line"
+version = "0.24.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1"
+dependencies = [
+ "gimli",
+]
+
+[[package]]
+name = "adler2"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
+
+[[package]]
+name = "backtrace"
+version = "0.3.74"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a"
+dependencies = [
+ "addr2line",
+ "cfg-if",
+ "libc",
+ "miniz_oxide",
+ "object",
+ "rustc-demangle",
+ "windows-targets",
+]
+
+[[package]]
+name = "base64"
+version = "0.22.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
+
+[[package]]
+name = "bytes"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "communicator"
+version = "0.1.0"
+dependencies = [
+ "tonic",
+]
+
+[[package]]
+name = "fnv"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+
+[[package]]
+name = "futures-core"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
+
+[[package]]
+name = "gimli"
+version = "0.31.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
+
+[[package]]
+name = "http"
+version = "1.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565"
+dependencies = [
+ "bytes",
+ "fnv",
+ "itoa",
+]
+
+[[package]]
+name = "http-body"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
+dependencies = [
+ "bytes",
+ "http",
+]
+
+[[package]]
+name = "http-body-util"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "http",
+ "http-body",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
+
+[[package]]
+name = "libc"
+version = "0.2.171"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6"
+
+[[package]]
+name = "memchr"
+version = "2.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
+
+[[package]]
+name = "miniz_oxide"
+version = "0.8.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff70ce3e48ae43fa075863cef62e8b43b71a4f2382229920e0df362592919430"
+dependencies = [
+ "adler2",
+]
+
+[[package]]
+name = "object"
+version = "0.36.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.21.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
+
+[[package]]
+name = "percent-encoding"
+version = "2.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
+
+[[package]]
+name = "pin-project"
+version = "1.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a"
+dependencies = [
+ "pin-project-internal",
+]
+
+[[package]]
+name = "pin-project-internal"
+version = "1.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "pin-project-lite"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.94"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "rustc-demangle"
+version = "0.1.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
+
+[[package]]
+name = "syn"
+version = "2.0.100"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "tokio"
+version = "1.44.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6b88822cbe49de4185e3a4cbf8321dd487cf5fe0c5c65695fef6346371e9c48"
+dependencies = [
+ "backtrace",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "tokio-stream"
+version = "0.1.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047"
+dependencies = [
+ "futures-core",
+ "pin-project-lite",
+ "tokio",
+]
+
+[[package]]
+name = "tonic"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85839f0b32fd242bb3209262371d07feda6d780d16ee9d2bc88581b89da1549b"
+dependencies = [
+ "base64",
+ "bytes",
+ "http",
+ "http-body",
+ "http-body-util",
+ "percent-encoding",
+ "pin-project",
+ "tokio-stream",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "tower-layer"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
+
+[[package]]
+name = "tower-service"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
+
+[[package]]
+name = "tracing"
+version = "0.1.41"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
+dependencies = [
+ "pin-project-lite",
+ "tracing-attributes",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-attributes"
+version = "0.1.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tracing-core"
+version = "0.1.33"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
+dependencies = [
+ "once_cell",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
--- a/pgxn/neon/communicator/Cargo.toml
+++ b/pgxn/neon/communicator/Cargo.toml
@@ -1,19 +1,42 @@
 [package]
 name = "communicator"
-version = "0.1.0"
 license.workspace = true
 edition.workspace = true

-[lib]
-crate-type = ["staticlib"]
-
 [features]
 # 'testing' feature is currently unused in the communicator, but we accept it for convenience of
 # calling build scripts, so that you can pass the same feature to all packages.
 testing = []

+[lib]
+crate-type = ["staticlib"]
+
 [dependencies]
+axum.workspace = true
+bytes.workspace = true
+clashmap.workspace = true
+http.workspace = true
+libc.workspace = true
+nix.workspace = true
+atomic_enum = "0.3.0"
+prometheus.workspace = true
+prost.workspace = true
+tonic = { version = "0.12.0", default-features = false, features=["codegen", "prost", "transport"] }
+tokio = { version = "1.43.1", features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] }
+tokio-pipe = { version = "0.2.12" }
+thiserror.workspace = true
+tracing.workspace = true
+tracing-subscriber.workspace = true
+
+metrics.workspace = true
+uring-common = { workspace = true, features = ["bytes"] }
+
+pageserver_client_grpc.workspace = true
+pageserver_api.workspace = true
+pageserver_page_api.workspace = true
+
 neon-shmem.workspace = true
+utils.workspace = true
 workspace_hack = { version = "0.1", path = "../../../workspace_hack" }

 [build-dependencies]
--- a/pgxn/neon/communicator/README.md
+++ b/pgxn/neon/communicator/README.md
@@ -1,8 +1,138 @@
-This package will evolve into a "compute-pageserver communicator"
-process and machinery. For now, it's just a dummy that doesn't do
-anything interesting, but it allows us to test the compilation and
-linking of Rust code into the Postgres extensions.
+# Communicator
+
+This package provides the so-called "compute-pageserver communicator",
+or just "communicator" in short. It runs in a PostgreSQL server, as
+part of the neon extension, and handles the communication with the
+pageservers. On the PostgreSQL side, the glue code in pgxn/neon/ uses
+the communicator to implement the PostgreSQL Storage Manager (SMGR)
+interface.
+
+## Design criteria
+
+- Low latency
+- Saturate a 10 Gbit / s network interface without becoming a bottleneck
+
+## Source code view
+
+pgxn/neon/communicator_new.c
+	Contains the glue that interacts with PostgreSQL code and the Rust
+	communicator code.
+
+pgxn/neon/communicator/src/backend_interface.rs
+	The entry point for calls from each backend.
+
+pgxn/neon/communicator/src/init.rs
+	Initialization at server startup
+
+pgxn/neon/communicator/src/worker_process/
+    Worker process main loop and glue code

 At compilation time, pgxn/neon/communicator/ produces a static
 library, libcommunicator.a. It is linked to the neon.so extension
 library.
+
+The real networking code, which is independent of PostgreSQL, is in
+the pageserver/client_grpc crate.
+
+## Process view
+
+The communicator runs in a dedicated background worker process, the
+"communicator process". The communicator uses a multi-threaded Tokio
+runtime to execute the IO requests. So the communicator process has
+multiple threads running. That's unusual for Postgres processes and
+care must be taken to make that work.
+
+### Backend <-> worker communication
+
+Each backend has a number of I/O request slots in shared memory. The
+slots are statically allocated for each backend, and must not be
+accessed by other backends. The worker process reads requests from the
+shared memory slots, and writes responses back to the slots.
+
+Here's an example snapshot of the system, when two requests from two
+different backends are in progress:
+
+```
+Backends           Request slots          Communicator process
+---------          -------------          --------------------
+
+Backend 1          1: Idle
+                   2: Idle
+                   3: Processing          tokio task handling request 3
+
+Backend 2          4: Completed
+                   5: Processing          tokio task handling request 5
+                   6: Idle
+
+...                ...
+```
+
+To submit an IO request, the backend first picks one of its Idle
+slots, writes the IO request in the slot, and updates it to
+'Submitted' state. That transfers the ownership of the slot to the
+worker process, until the worker process marks the request as
+Completed. The worker process spawns a separate Tokio task for each
+request.
+
+To inform the worker process that a request slot has a pending IO
+request, there's a pipe shared by the worker process and all backend
+processes. The backend writes the index of the request slot to the
+pipe after changing the slot's state to Submitted. This wakes up the
+worker process.
+
+(Note that the pipe is just used for wakeups, but the worker process
+is free to pick up Submitted IO requests even without receiving the
+wakeup. As of this writing, it doesn't do that, but it might be useful
+in the future to reduce latency even further, for example.)
+
+When the worker process has completed processing the request, it
+writes the result back in the request slot. A GetPage request can also
+contain a pointer to a buffer in the shared buffer cache. In that case,
+the worker process writes the resulting page contents directly to the
+buffer, and just a result code in the request slot. It then updates
+the 'state' field to Completed, which passes the ownership back to
+the originating backend. Finally, it signals the process Latch of the
+originating backend, waking it up.
+
+### Differences between PostgreSQL v16, v17 and v18
+
+PostgreSQL v18 introduced the new AIO mechanism. The PostgreSQL AIO
+mechanism uses a scheme very similar to the one described in the previous
+section, for the communication between AIO worker processes and
+backends. With our communicator, the AIO worker processes are not
+used, but we use the same PgAioHandle request slots as in upstream.
+For Neon-specific IO requests like GetDbSize, a neon request slot is
+used. But for the actual IO requests, the request slot merely contains
+a pointer to the PgAioHandle slot. The worker process updates the
+status of that, calls the IO callbacks upon completion, etc., just like
+the upstream AIO worker processes do.
+
+## Sequence diagram
+
+                      neon
+    PostgreSQL     extension       backend_interface.rs  worker_process.rs    processor    tonic
+       |               .                    .                   .                 .
+	   | smgr_read()   .                    .                   .                 .
+	   +-------------> +                    .                   .                 .
+	   .               |                    .                   .                 .
+	   .               |  rcommunicator_    .                   .                 .
+	   .               | get_page_at_lsn    .                   .                 .
+	   .               +------------------> +                   .                 .
+                                            |                   .                 .
+                                            | write request to  .                 .                 .
+                                            | slot              .                 .
+                                            |                   .                 .
+                                            |                   .                 .
+											| submit_request()  .                 .
+											+-----------------> +                 .
+											|                   |                 .
+											|					| db_size_request .               .
+																+---------------->.
+																                  . TODO
+
+
+
+### Compute <-> pageserver protocol
+
+The protocol between Compute and the pageserver is based on gRPC. See `protos/`.
+
--- a/pgxn/neon/communicator/src/backend_comms.rs
+++ b/pgxn/neon/communicator/src/backend_comms.rs
@@ -0,0 +1,224 @@
+//! This module implements a request/response "slot" for submitting
+//! requests from backends to the communicator process.
+//!
+//! NB: The "backend" side of this code runs in Postgres backend processes,
+//! which means that it is not safe to use the 'tracing' crate for logging, nor
+//! to launch threads or use tokio tasks!
+
+use std::cell::UnsafeCell;
+use std::sync::atomic::{AtomicI32, Ordering};
+
+use crate::neon_request::{NeonIORequest, NeonIOResult};
+
+use atomic_enum::atomic_enum;
+
+/// One request/response slot. Each backend has its own set of slots that it
+/// uses.
+///
+/// This is the moral equivalent of PgAioHandle for Postgres AIO requests.
+/// Like PgAioHandle, try to keep this small.
+///
+/// There is an array of these in shared memory. Therefore, this must be Sized.
+///
+/// ## Lifecycle of a request
+///
+/// A slot is always owned by either the backend process or the communicator
+/// process, depending on the 'state'. Only the owning process is allowed to
+/// read or modify the slot, except for reading the 'state' itself to check who
+/// owns it.
+///
+/// A slot begins in the Idle state, where it is owned by the backend process.
+/// To submit a request, the backend process fills the slot with the request
+/// data, and changes it to the Submitted state. After changing the state, the
+/// slot is owned by the communicator process, and the backend is not allowed
+/// to access it until the communicator process marks it as Completed.
+///
+/// When the communicator process sees that the slot is in Submitted state, it
+/// starts to process the request. After processing the request, it stores the
+/// result in the slot, and changes the state to Completed. It is now owned by
+/// the backend process again, which may now read the result, and reuse the
+/// slot for a new request.
+///
+/// For correctness of the above protocol, we really only need two states:
+/// "owned by backend" and "owned by communicator process". But to help with
+/// debugging and better assertions, there are a few more states. When the
+/// backend starts to fill in the request details in the slot, it first sets the
+/// state from Idle to Filling, and when it's done with that, from Filling to
+/// Submitted. In the Filling state, the slot is still owned by the
+/// backend. Similarly, when the communicator process starts to process a
+/// request, it sets it to Processing state first, but the slot is still owned
+/// by the communicator process.
+///
+/// This struct doesn't handle waking up the communicator process when a request
+/// has been submitted or when a response is ready. The 'owner_procno' is used
+/// for waking up the backend on completion, but that happens elsewhere.
+pub struct NeonIORequestSlot {
+    /// similar to PgAioHandleState
+    state: AtomicNeonIORequestSlotState,
+
+    /// The owning process's ProcNumber. The worker process uses this to set the
+    /// process's latch on completion.
+    ///
+    /// (This could be calculated from num_neon_request_slots_per_backend and
+    /// the index of this slot in the overall 'neon_request_slots' array. But we
+    /// prefer the communicator process to not know how the request slots are
+    /// divided between the backends.)
+    owner_procno: AtomicI32,
+
+    /// SAFETY: This is modified by submit_request(), after it has established
+    /// ownership of the slot by setting state from Idle to Filling
+    request: UnsafeCell<NeonIORequest>,
+
+    /// Valid when state is Completed
+    ///
+    /// SAFETY: This is modified by RequestProcessingGuard::completed(). There
+    /// can be only one RequestProcessingGuard outstanding for a slot at a time,
+    /// because it is returned by start_processing_request() which checks the
+    /// state, so RequestProcessingGuard has exclusive access to the slot.
+    result: UnsafeCell<NeonIOResult>,
+}
+
+// The protocol described in the "Lifecycle of a request" section above ensures
+// the safe access to the fields
+unsafe impl Send for NeonIORequestSlot {}
+unsafe impl Sync for NeonIORequestSlot {}
+
+impl Default for NeonIORequestSlot {
+    fn default() -> NeonIORequestSlot {
+        NeonIORequestSlot {
+            owner_procno: AtomicI32::new(-1),
+            request: UnsafeCell::new(NeonIORequest::Empty),
+            result: UnsafeCell::new(NeonIOResult::Empty),
+            state: AtomicNeonIORequestSlotState::new(NeonIORequestSlotState::Idle),
+        }
+    }
+}
+
+#[atomic_enum]
+#[derive(Eq, PartialEq)]
+pub enum NeonIORequestSlotState {
+    Idle,
+
+    /// Backend is filling in the request
+    Filling,
+
+    /// Backend has submitted the request to the communicator, but the
+    /// communicator process has not yet started processing it.
+    Submitted,
+
+    /// Communicator is processing the request
+    Processing,
+
+    /// Communicator has completed the request, and the 'result' field is now
+    /// valid, but the backend has not read the result yet.
+    Completed,
+}
+
+impl NeonIORequestSlot {
+    /// Write a request to the slot, and mark it as Submitted.
+    ///
+    /// Note: This does not wake up the worker process to actually process
+    /// the request. It's the caller's responsibility to do that.
+    pub fn submit_request(&self, request: &NeonIORequest, proc_number: i32) {
+        // Verify that the slot is in Idle state previously, and put it in
+        // Filling state.
+        //
+        // XXX: This step isn't strictly necessary. Assuming the caller didn't
+        // screw up and try to use a slot that's already in use, we could fill
+        // the slot and switch it directly from Idle to Submitted state.
+        if let Err(s) = self.state.compare_exchange(
+            NeonIORequestSlotState::Idle,
+            NeonIORequestSlotState::Filling,
+            Ordering::Relaxed,
+            Ordering::Relaxed,
+        ) {
+            panic!("unexpected state in request slot: {s:?}");
+        }
+
+        // Fill in the request details
+        self.owner_procno.store(proc_number, Ordering::Relaxed);
+        unsafe { *self.request.get() = *request }
+
+        // This synchronizes-with the compare_exchange in
+        // [`start_processing_request`]. Note that this ensures that the
+        // previous non-atomic writes are visible to other threads too.
+        self.state
+            .store(NeonIORequestSlotState::Submitted, Ordering::Release);
+    }
+
+    pub fn get_state(&self) -> NeonIORequestSlotState {
+        self.state.load(Ordering::Relaxed)
+    }
+
+    pub fn try_get_result(&self) -> Option<NeonIOResult> {
+        // This synchronizes-with the store/swap in [`RequestProcessingGuard::completed`]
+        let state = self.state.load(Ordering::Acquire);
+        if state == NeonIORequestSlotState::Completed {
+            let result = unsafe { *self.result.get() };
+            self.state
+                .store(NeonIORequestSlotState::Idle, Ordering::Relaxed);
+            Some(result)
+        } else {
+            None
+        }
+    }
+
+    /// Read the IO request from the slot indicated in the wakeup
+    pub fn start_processing_request<'a>(&'a self) -> Option<RequestProcessingGuard<'a>> {
+        // XXX: using atomic load rather than compare_exchange would be
+        // sufficient here, as long as the communicator process has _some_ means
+        // of tracking which requests it's already processing. That could be a
+        // flag somewhere in communicator's private memory, for example.
+        //
+        // This synchronizes-with the store in [`submit_request`].
+        if let Err(s) = self.state.compare_exchange(
+            NeonIORequestSlotState::Submitted,
+            NeonIORequestSlotState::Processing,
+            Ordering::Acquire,
+            Ordering::Relaxed,
+        ) {
+            // FIXME surprising state. This is unexpected at the moment, but if we
+            // started to process requests more aggressively, without waiting for the
+            // read from the pipe, then this could happen
+            panic!("unexpected state in request slot: {s:?}");
+        }
+
+        Some(RequestProcessingGuard(self))
+    }
+}
+
+/// [`NeonIORequestSlot::start_processing_request`] returns this guard object to
+/// indicate that the caller now "owns" the slot, until it calls
+/// [`RequestProcessingGuard::completed`].
+///
+/// TODO: implement Drop on this, to mark the request as Aborted or Errored
+/// if [`RequestProcessingGuard::completed`] is not called.
+pub struct RequestProcessingGuard<'a>(&'a NeonIORequestSlot);
+
+unsafe impl<'a> Send for RequestProcessingGuard<'a> {}
+unsafe impl<'a> Sync for RequestProcessingGuard<'a> {}
+
+impl<'a> RequestProcessingGuard<'a> {
+    pub fn get_request(&self) -> &NeonIORequest {
+        unsafe { &*self.0.request.get() }
+    }
+
+    pub fn get_owner_procno(&self) -> i32 {
+        self.0.owner_procno.load(Ordering::Relaxed)
+    }
+
+    pub fn completed(self, result: NeonIOResult) {
+        // Store the result to the slot.
+        unsafe {
+            *self.0.result.get() = result;
+        };
+
+        // Mark the request as completed. After that, we no longer have
+        // ownership of the slot, and must not modify it.
+        let old_state = self
+            .0
+            .state
+            .swap(NeonIORequestSlotState::Completed, Ordering::Release);
+        assert!(old_state == NeonIORequestSlotState::Processing);
+    }
+}
--- a/pgxn/neon/communicator/src/backend_interface.rs
+++ b/pgxn/neon/communicator/src/backend_interface.rs
@@ -0,0 +1,232 @@
+//! This code runs in each backend process. That means that launching Rust threads, panicking
+//! etc. is forbidden!
+
+use std::os::fd::OwnedFd;
+
+use crate::backend_comms::NeonIORequestSlot;
+use crate::init::CommunicatorInitStruct;
+use crate::integrated_cache::{BackendCacheReadOp, IntegratedCacheReadAccess};
+use crate::neon_request::{CCachedGetPageVResult, COid};
+use crate::neon_request::{NeonIORequest, NeonIOResult};
+
+pub struct CommunicatorBackendStruct<'t> {
+    my_proc_number: i32,
+
+    neon_request_slots: &'t [NeonIORequestSlot],
+
+    submission_pipe_write_fd: OwnedFd,
+
+    pending_cache_read_op: Option<BackendCacheReadOp<'t>>,
+
+    integrated_cache: &'t IntegratedCacheReadAccess<'t>,
+}
+
+#[unsafe(no_mangle)]
+pub extern "C" fn rcommunicator_backend_init(
+    cis: Box<CommunicatorInitStruct>,
+    my_proc_number: i32,
+) -> &'static mut CommunicatorBackendStruct<'static> {
+    if my_proc_number < 0 {
+        panic!(
+            "cannot attach to communicator shared memory with procnumber {}",
+            my_proc_number,
+        );
+    }
+
+    let integrated_cache = Box::leak(Box::new(cis.integrated_cache_init_struct.backend_init()));
+
+    let bs: &'static mut CommunicatorBackendStruct =
+        Box::leak(Box::new(CommunicatorBackendStruct {
+            my_proc_number,
+            neon_request_slots: cis.neon_request_slots,
+
+            submission_pipe_write_fd: cis.submission_pipe_write_fd,
+            pending_cache_read_op: None,
+
+            integrated_cache,
+        }));
+    bs
+}
+
+/// Start a request. You can poll for its completion and get the result by
+/// calling bcomm_poll_request_completion(). The communicator will wake
+/// us up by setting our process latch, so to wait for the completion, wait on
+/// the latch and call bcomm_poll_request_completion() every time the
+/// latch is set.
+///
+/// Safety: The C caller must ensure that the references are valid.
+/// The requested slot must be free, or this panics.
+#[unsafe(no_mangle)]
+pub extern "C" fn bcomm_start_io_request(
+    bs: &'_ mut CommunicatorBackendStruct,
+    slot_idx: i32,
+    request: &NeonIORequest,
+    immediate_result_ptr: &mut NeonIOResult,
+) -> i32 {
+    assert!(bs.pending_cache_read_op.is_none());
+
+    // Check if the request can be satisfied from the cache first
+    if let NeonIORequest::RelSize(req) = request {
+        if let Some(nblocks) = bs.integrated_cache.get_rel_size(&req.reltag()) {
+            *immediate_result_ptr = NeonIOResult::RelSize(nblocks);
+            return -1;
+        }
+    }
+
+    // Create neon request and submit it
+    bs.start_neon_io_request(slot_idx, request);
+
+    slot_idx
+}
+
+#[unsafe(no_mangle)]
+pub extern "C" fn bcomm_start_get_page_v_request(
+    bs: &mut CommunicatorBackendStruct,
+    slot_idx: i32,
+    request: &NeonIORequest,
+    immediate_result_ptr: &mut CCachedGetPageVResult,
+) -> i32 {
+    let NeonIORequest::GetPageV(get_pagev_request) = request else {
+        panic!("invalid request passed to bcomm_start_get_page_v_request()");
+    };
+    assert!(matches!(request, NeonIORequest::GetPageV(_)));
+    assert!(bs.pending_cache_read_op.is_none());
+
+    // Check if the request can be satisfied from the cache first
+    let mut all_cached = true;
+    let mut read_op = bs.integrated_cache.start_read_op();
+    for i in 0..get_pagev_request.nblocks {
+        if let Some(cache_block) = read_op.get_page(
+            &get_pagev_request.reltag(),
+            get_pagev_request.block_number + i as u32,
+        ) {
+            immediate_result_ptr.cache_block_numbers[i as usize] = cache_block;
+        } else {
+            // not found in cache
+            all_cached = false;
+            break;
+        }
+    }
+    if all_cached {
+        bs.pending_cache_read_op = Some(read_op);
+        return -1;
+    }
+
+    // Create neon request and submit it
+    bs.start_neon_io_request(slot_idx, request);
+
+    slot_idx
+}
+
+/// Check if a request has completed. Returns:
+///
+/// -1 if the request is still being processed
+/// 0 on success
+#[unsafe(no_mangle)]
+pub extern "C" fn bcomm_poll_request_completion(
+    bs: &mut CommunicatorBackendStruct,
+    request_slot_idx: u32,
+    result_p: &mut NeonIOResult,
+) -> i32 {
+    match bs.neon_request_slots[request_slot_idx as usize].try_get_result() {
+        None => -1, // still processing
+        Some(result) => {
+            *result_p = result;
+            0
+        }
+    }
+}
+
+/// Check whether a request slot is currently in use. Returns:
+///
+/// 'false' if the slot is Idle. The backend process has ownership.
+/// 'true' if the slot is busy, and should be polled for result.
+#[unsafe(no_mangle)]
+pub extern "C" fn bcomm_get_request_slot_status(
+    bs: &mut CommunicatorBackendStruct,
+    request_slot_idx: u32,
+) -> bool {
+    use crate::backend_comms::NeonIORequestSlotState;
+    match bs.neon_request_slots[request_slot_idx as usize].get_state() {
+        NeonIORequestSlotState::Idle => false,
+        NeonIORequestSlotState::Filling => {
+            // 'false' would be the right result here. However, this
+            // is a very transient state. The C code should never
+            // leave a slot in this state, so if it sees that,
+            // something's gone wrong and it's not clear what to do
+            // with it.
+            panic!(
+                "unexpected Filling state in request slot {}",
+                request_slot_idx
+            );
+        }
+        NeonIORequestSlotState::Submitted => true,
+        NeonIORequestSlotState::Processing => true,
+        NeonIORequestSlotState::Completed => true,
+    }
+}
+
+// LFC functions
+
+/// Finish a local file cache read
+///
+//
+#[unsafe(no_mangle)]
+pub extern "C" fn bcomm_finish_cache_read(bs: &mut CommunicatorBackendStruct) -> bool {
+    if let Some(op) = bs.pending_cache_read_op.take() {
+        op.finish()
+    } else {
+        panic!("bcomm_finish_cache_read() called with no cached read pending");
+    }
+}
+
+/// Check if the local file cache contains the given block
+#[unsafe(no_mangle)]
+pub extern "C" fn bcomm_cache_contains(
+    bs: &mut CommunicatorBackendStruct,
+    spc_oid: COid,
+    db_oid: COid,
+    rel_number: u32,
+    fork_number: u8,
+    block_number: u32,
+) -> bool {
+    bs.integrated_cache.cache_contains_page(
+        &pageserver_page_api::RelTag {
+            spcnode: spc_oid,
+            dbnode: db_oid,
+            relnode: rel_number,
+            forknum: fork_number,
+        },
+        block_number,
+    )
+}
+
+impl<'t> CommunicatorBackendStruct<'t> {
+    /// The slot must be free, or this panics.
+    pub(crate) fn start_neon_io_request(&mut self, request_slot_idx: i32, request: &NeonIORequest) {
+        let my_proc_number = self.my_proc_number;
+
+        self.neon_request_slots[request_slot_idx as usize].submit_request(request, my_proc_number);
+
+        // Tell the communicator about it
+        self.notify_about_request(request_slot_idx);
+    }
+
+    /// Send a wakeup to the communicator process
+    fn notify_about_request(self: &CommunicatorBackendStruct<'t>, request_slot_idx: i32) {
+        // wake up communicator by writing the idx to the submission pipe
+        //
+
+        // This can block, if the pipe is full. That should be very rare,
+        // because the communicator tries hard to drain the pipe to prevent
+        // that. Also, there's a natural upper bound on how many wakeups can be
+        // queued up: there is only a limited number of request slots for each
+        // backend.
+        //
+        // If it does block very briefly, that's not too serious.
+        let idxbuf = request_slot_idx.to_ne_bytes();
+
+        let _res = nix::unistd::write(&self.submission_pipe_write_fd, &idxbuf);
+        // FIXME: check result, return any errors
+    }
+}
--- a/pgxn/neon/communicator/src/file_cache.rs
+++ b/pgxn/neon/communicator/src/file_cache.rs
@@ -0,0 +1,162 @@
+//! Implement the "low-level" parts of the file cache.
+//!
+//! This module just deals with reading and writing the file, and keeping track
+//! which blocks in the cache file are in use and which are free. The "high
+//! level" parts of tracking which block in the cache file corresponds to which
+//! relation block is handled in 'integrated_cache' instead.
+//!
+//! This module is only used to access the file from the communicator
+//! process. The backend processes *also* read the file (and sometimes also
+//! write it? ), but the backends use direct C library calls for that.
+use std::fs::File;
+use std::os::unix::fs::FileExt;
+use std::path::Path;
+use std::sync::Arc;
+use std::sync::Mutex;
+
+use crate::BLCKSZ;
+
+use tokio::task::spawn_blocking;
+
+pub type CacheBlock = u64;
+
+pub const INVALID_CACHE_BLOCK: CacheBlock = u64::MAX;
+
+#[derive(Debug)]
+pub struct FileCache {
+    file: Arc<File>,
+
+    free_list: Mutex<FreeList>,
+
+    // metrics
+    max_blocks_gauge: metrics::IntGauge,
+    num_free_blocks_gauge: metrics::IntGauge,
+}
+
+// TODO: We keep track of all free blocks in this vec. That doesn't really scale.
+// Idea: when free_blocks fills up with more than 1024 entries, write them all to
+// one block on disk.
+#[derive(Debug)]
+struct FreeList {
+    next_free_block: CacheBlock,
+    max_blocks: u64,
+
+    free_blocks: Vec<CacheBlock>,
+}
+
+impl FileCache {
+    pub fn new(file_cache_path: &Path, mut initial_size: u64) -> Result<FileCache, std::io::Error> {
+        if initial_size < 100 {
+            tracing::warn!(
+                "min size for file cache is 100 blocks, {} requested",
+                initial_size
+            );
+            initial_size = 100;
+        }
+
+        let file = std::fs::OpenOptions::new()
+            .read(true)
+            .write(true)
+            .truncate(true)
+            .create(true)
+            .open(file_cache_path)?;
+
+        let max_blocks_gauge = metrics::IntGauge::new(
+            "file_cache_max_blocks",
+            "Local File Cache size in 8KiB blocks",
+        )
+        .unwrap();
+        let num_free_blocks_gauge = metrics::IntGauge::new(
+            "file_cache_num_free_blocks",
+            "Number of free 8KiB blocks in Local File Cache",
+        )
+        .unwrap();
+
+        tracing::info!("initialized file cache with {} blocks", initial_size);
+
+        Ok(FileCache {
+            file: Arc::new(file),
+            free_list: Mutex::new(FreeList {
+                next_free_block: 0,
+                max_blocks: initial_size,
+                free_blocks: Vec::new(),
+            }),
+            max_blocks_gauge,
+            num_free_blocks_gauge,
+        })
+    }
+
+    // File cache management
+
+    pub async fn read_block(
+        &self,
+        cache_block: CacheBlock,
+        mut dst: impl uring_common::buf::IoBufMut + Send + Sync,
+    ) -> Result<(), std::io::Error> {
+        assert!(dst.bytes_total() == BLCKSZ);
+        let file = self.file.clone();
+
+        let dst_ref = unsafe { std::slice::from_raw_parts_mut(dst.stable_mut_ptr(), BLCKSZ) };
+
+        spawn_blocking(move || file.read_exact_at(dst_ref, cache_block * BLCKSZ as u64)).await??;
+        Ok(())
+    }
+
+    pub async fn write_block(
+        &self,
+        cache_block: CacheBlock,
+        src: impl uring_common::buf::IoBuf + Send + Sync,
+    ) -> Result<(), std::io::Error> {
+        assert!(src.bytes_init() == BLCKSZ);
+        let file = self.file.clone();
+
+        let src_ref = unsafe { std::slice::from_raw_parts(src.stable_ptr(), BLCKSZ) };
+
+        spawn_blocking(move || file.write_all_at(src_ref, cache_block * BLCKSZ as u64)).await??;
+
+        Ok(())
+    }
+
+    pub fn alloc_block(&self) -> Option<CacheBlock> {
+        let mut free_list = self.free_list.lock().unwrap();
+        if let Some(x) = free_list.free_blocks.pop() {
+            return Some(x);
+        }
+        if free_list.next_free_block < free_list.max_blocks {
+            let result = free_list.next_free_block;
+            free_list.next_free_block += 1;
+            return Some(result);
+        }
+        None
+    }
+
+    pub fn dealloc_block(&self, cache_block: CacheBlock) {
+        let mut free_list = self.free_list.lock().unwrap();
+        free_list.free_blocks.push(cache_block);
+    }
+}
+
+impl metrics::core::Collector for FileCache {
+    fn desc(&self) -> Vec<&metrics::core::Desc> {
+        let mut descs = Vec::new();
+        descs.append(&mut self.max_blocks_gauge.desc());
+        descs.append(&mut self.num_free_blocks_gauge.desc());
+        descs
+    }
+    fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
+        // Update the gauges with fresh values first
+        {
+            let free_list = self.free_list.lock().unwrap();
+            self.max_blocks_gauge.set(free_list.max_blocks as i64);
+
+            let total_free_blocks: i64 = free_list.free_blocks.len() as i64
+                + (free_list.max_blocks as i64 - free_list.next_free_block as i64);
+            self.num_free_blocks_gauge.set(total_free_blocks);
+        }
+
+        let mut values = Vec::new();
+        values.append(&mut self.max_blocks_gauge.collect());
+        values.append(&mut self.num_free_blocks_gauge.collect());
+        values
+    }
+}
--- a/pgxn/neon/communicator/src/global_allocator.rs
+++ b/pgxn/neon/communicator/src/global_allocator.rs
@@ -0,0 +1,109 @@
+//! Global allocator, for tracking memory usage of the Rust parts
+//!
+//! Postgres is designed to handle allocation failure (ie. malloc() returning NULL) gracefully.  It
+//! rolls back the transaction and gives the user an "ERROR: out of memory" error. Rust code
+//! however panics if an allocation fails. We don't want that to ever happen, because an unhandled
+//! panic leads to Postgres crash and restart. Our strategy is to pre-allocate a large enough chunk
+//! of memory for use by the Rust code, so that the allocations never fail.
+//!
+//! To pick the size for the pre-allocated chunk, we have a metric to track the high watermark
+//! memory usage of all the Rust allocations in total.
+//!
+//! TODO:
+//!
+//! - Currently we just export the metrics. Actual allocations are still just passed through to
+//!   the system allocator.
+//! - Take padding etc. overhead into account
+
+use std::alloc::{GlobalAlloc, Layout, System};
+use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
+
+use metrics::IntGauge;
+
+/// Allocator that keeps running statistics about the allocations made through it.
+struct MyAllocator {
+    /// Number of `alloc` calls
+    allocations: AtomicU64,
+    /// Number of `dealloc` calls
+    deallocations: AtomicU64,
+
+    /// Sum of the layout sizes of the allocations that have not been deallocated, in bytes
+    allocated: AtomicUsize,
+    /// Highest value that `allocated` has reached so far
+    high: AtomicUsize,
+}
+
+unsafe impl GlobalAlloc for MyAllocator {
+    /// Allocate memory from the system allocator, and account for it in the statistics.
+    ///
+    /// The counters are only updated if the allocation succeeded. A failed allocation returns a
+    /// null pointer and is never passed to `dealloc`, so counting it would permanently inflate
+    /// `allocated` and `high`.
+    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
+        let ptr = unsafe { System.alloc(layout) };
+        if !ptr.is_null() {
+            self.allocations.fetch_add(1, Ordering::Relaxed);
+            // fetch_add returns the previous value; add our own size to get the new total
+            let before = self.allocated.fetch_add(layout.size(), Ordering::Relaxed);
+            self.high
+                .fetch_max(before + layout.size(), Ordering::Relaxed);
+        }
+        ptr
+    }
+
+    /// Return memory to the system allocator, and subtract it from the statistics.
+    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
+        self.deallocations.fetch_add(1, Ordering::Relaxed);
+        self.allocated.fetch_sub(layout.size(), Ordering::Relaxed);
+        unsafe { System.dealloc(ptr, layout) }
+    }
+}
+
+/// The allocator instance that is installed as the global allocator. All the statistics start
+/// from zero.
+#[global_allocator]
+static GLOBAL: MyAllocator = MyAllocator {
+    allocations: AtomicU64::new(0),
+    deallocations: AtomicU64::new(0),
+    allocated: AtomicUsize::new(0),
+    high: AtomicUsize::new(0),
+};
+
+/// Metrics collector that publishes the statistics of the global allocator as gauges. There is
+/// one gauge for each counter in `GLOBAL`, with the same field name.
+pub struct MyAllocatorCollector {
+    allocations: IntGauge,
+    deallocations: IntGauge,
+    allocated: IntGauge,
+    high: IntGauge,
+}
+
+impl MyAllocatorCollector {
+    /// Create the collector, with one gauge for each allocator statistic.
+    ///
+    /// Panics if any of the gauges cannot be constructed.
+    pub fn new() -> Self {
+        let gauge = |name: &str, help: &str| IntGauge::new(name, help).unwrap();
+
+        Self {
+            allocations: gauge("allocations_total", "Number of allocations in Rust code"),
+            deallocations: gauge("deallocations_total", "Number of deallocations in Rust code"),
+            allocated: gauge("allocated_total", "Bytes currently allocated"),
+            high: gauge("allocated_high", "High watermark of allocated bytes"),
+        }
+    }
+}
+
+impl metrics::core::Collector for MyAllocatorCollector {
+    /// Descriptors of the four gauges, in the same order as `collect` reports them.
+    fn desc(&self) -> Vec<&metrics::core::Desc> {
+        let mut descs = Vec::new();
+
+        descs.append(&mut self.allocations.desc());
+        descs.append(&mut self.deallocations.desc());
+        descs.append(&mut self.allocated.desc());
+        descs.append(&mut self.high.desc());
+
+        descs
+    }
+
+    /// Refresh the gauges from the counters of the global allocator, and return their current
+    /// values.
+    fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
+        let mut values = Vec::new();
+
+        // update the gauges
+        self.allocations
+            .set(GLOBAL.allocations.load(Ordering::Relaxed) as i64);
+        // Each gauge must read its own counter: this one reports deallocations, not allocations
+        self.deallocations
+            .set(GLOBAL.deallocations.load(Ordering::Relaxed) as i64);
+        self.allocated
+            .set(GLOBAL.allocated.load(Ordering::Relaxed) as i64);
+        self.high.set(GLOBAL.high.load(Ordering::Relaxed) as i64);
+
+        values.append(&mut self.allocations.collect());
+        values.append(&mut self.deallocations.collect());
+        values.append(&mut self.allocated.collect());
+        values.append(&mut self.high.collect());
+
+        values
+    }
+}
--- a/pgxn/neon/communicator/src/init.rs
+++ b/pgxn/neon/communicator/src/init.rs
@@ -0,0 +1,168 @@
+//! Initialization functions. These are executed in the postmaster process,
+//! at different stages of server startup.
+//!
+//!
+//! Communicator initialization steps:
+//!
+//! 1. At postmaster startup, before shared memory is allocated,
+//!    rcommunicator_shmem_size() is called to get the amount of
+//!    shared memory that this module needs.
+//!
+//! 2. Later, after the shared memory has been allocated,
+//!    rcommunicator_shmem_init() is called to initialize the shmem
+//!    area.
+//!
+//! Per process initialization:
+//!
+//! When a backend process starts up, it calls rcommunicator_backend_init().
+//! In the communicator worker process, other functions are called, see
+//! `worker_process` module.
+
+use std::ffi::c_int;
+use std::mem;
+use std::mem::MaybeUninit;
+use std::os::fd::OwnedFd;
+
+use crate::backend_comms::NeonIORequestSlot;
+use crate::integrated_cache::IntegratedCacheInitStruct;
+
+/// This struct is created in the postmaster process, and inherited to
+/// the communicator process and all backend processes through fork()
+#[repr(C)]
+pub struct CommunicatorInitStruct {
+    // The two ends of the submission pipe. rcommunicator_shmem_init() takes ownership of the
+    // raw file descriptors it is given and stores them here.
+    pub submission_pipe_read_fd: OwnedFd,
+    pub submission_pipe_write_fd: OwnedFd,
+
+    // Shared memory data structures
+    /// Number of request slots, as passed to rcommunicator_shmem_init()
+    pub num_neon_request_slots: u32,
+
+    /// The request slot array, carved out of the beginning of the shared memory area
+    pub neon_request_slots: &'static [NeonIORequestSlot],
+
+    /// Handles to the integrated cache, which gets the rest of the shared memory area
+    pub integrated_cache_init_struct: IntegratedCacheInitStruct<'static>,
+}
+
+/// Debug output lists the pipe file descriptors, the number of request slots and the length of
+/// the slot array. `integrated_cache_init_struct` is left out.
+impl std::fmt::Debug for CommunicatorInitStruct {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let slots_len = self.neon_request_slots.len();
+
+        let mut out = f.debug_struct("CommunicatorInitStruct");
+        out.field("submission_pipe_read_fd", &self.submission_pipe_read_fd);
+        out.field("submission_pipe_write_fd", &self.submission_pipe_write_fd);
+        out.field("num_neon_request_slots", &self.num_neon_request_slots);
+        out.field("neon_request_slots length", &slots_len);
+        out.finish()
+    }
+}
+
+/// Compute the amount of shared memory that this module needs, in bytes: the array of
+/// `num_neon_request_slots` request slots, plus the fixed-size area of the integrated cache.
+#[unsafe(no_mangle)]
+pub extern "C" fn rcommunicator_shmem_size(num_neon_request_slots: u32) -> u64 {
+    let num_slots = num_neon_request_slots as usize;
+    let slots_size = mem::size_of::<NeonIORequestSlot>() * num_slots;
+
+    // For integrated_cache's Allocator. TODO: make this adjustable
+    let cache_size = IntegratedCacheInitStruct::shmem_size();
+
+    (slots_size + cache_size) as u64
+}
+
+/// Initialize the shared memory segment. Returns a backend-private
+/// struct, which will be inherited by backend processes through fork
+///
+/// The beginning of the area is used for the array of `num_neon_request_slots` request slots,
+/// and the rest is given to the integrated cache.
+///
+/// This takes ownership of the two pipe file descriptors. `shmem_area_ptr` must point to
+/// `shmem_area_len` bytes of memory that stay valid for the rest of the process lifetime,
+/// because a `'static` slice is created over them.
+///
+/// Panics with "out of memory" if the area is too small for the request slots.
+#[unsafe(no_mangle)]
+pub extern "C" fn rcommunicator_shmem_init(
+    submission_pipe_read_fd: c_int,
+    submission_pipe_write_fd: c_int,
+    num_neon_request_slots: u32,
+    shmem_area_ptr: *mut MaybeUninit<u8>,
+    shmem_area_len: u64,
+    initial_file_cache_size: u64,
+    max_file_cache_size: u64,
+) -> &'static mut CommunicatorInitStruct {
+    let shmem_area: &'static mut [MaybeUninit<u8>] =
+        unsafe { std::slice::from_raw_parts_mut(shmem_area_ptr, shmem_area_len as usize) };
+
+    // Carve the request slot array out of the beginning of the area
+    let (neon_request_slots, remaining_area) =
+        alloc_array_from_slice::<NeonIORequestSlot>(shmem_area, num_neon_request_slots as usize);
+
+    for slot in neon_request_slots.iter_mut() {
+        slot.write(NeonIORequestSlot::default());
+    }
+
+    // 'neon_request_slots' is initialized now. (MaybeUninit::slice_assume_init_mut() is nightly-only
+    // as of this writing.)
+    let neon_request_slots = unsafe {
+        std::mem::transmute::<&mut [MaybeUninit<NeonIORequestSlot>], &mut [NeonIORequestSlot]>(
+            neon_request_slots,
+        )
+    };
+
+    // Give the rest of the area to the integrated cache
+    let integrated_cache_init_struct = IntegratedCacheInitStruct::shmem_init(
+        remaining_area,
+        initial_file_cache_size,
+        max_file_cache_size,
+    );
+
+    // Wrap the raw file descriptors. From here on, the OwnedFds own them.
+    let (submission_pipe_read_fd, submission_pipe_write_fd) = unsafe {
+        use std::os::fd::FromRawFd;
+        (
+            OwnedFd::from_raw_fd(submission_pipe_read_fd),
+            OwnedFd::from_raw_fd(submission_pipe_write_fd),
+        )
+    };
+
+    // The struct is leaked on purpose, so that it can be handed out as a 'static reference
+    let cis: &'static mut CommunicatorInitStruct = Box::leak(Box::new(CommunicatorInitStruct {
+        submission_pipe_read_fd,
+        submission_pipe_write_fd,
+
+        num_neon_request_slots,
+        neon_request_slots,
+
+        integrated_cache_init_struct,
+    }));
+
+    cis
+}
+
+/// Reserve space for a single `T` at the beginning of `area`.
+///
+/// As many leading bytes as needed are skipped to align the value. Returns the value, which is
+/// still uninitialized, and the rest of the area after it. Panics with "out of memory" if the
+/// area is too small.
+// fixme: currently unused
+#[allow(dead_code)]
+pub fn alloc_from_slice<T>(
+    area: &mut [MaybeUninit<u8>],
+) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
+    let size = std::mem::size_of::<T>();
+    let align = std::mem::align_of::<T>();
+
+    // pad to satisfy alignment requirements
+    let padding = area.as_mut_ptr().align_offset(align);
+    if padding + size > area.len() {
+        panic!("out of memory");
+    }
+
+    let (result_area, remain) = area[padding..].split_at_mut(size);
+    let result_ptr = result_area.as_mut_ptr().cast::<MaybeUninit<T>>();
+    let result = unsafe { result_ptr.as_mut() }.unwrap();
+
+    (result, remain)
+}
+
+/// Reserve space for an array of `len` elements of type `T` at the beginning of `area`.
+///
+/// As many leading bytes as needed are skipped to align the array. Returns the array, whose
+/// elements are still uninitialized, and the rest of the area after it.
+///
+/// Panics with "out of memory" if the area is too small, or if the size of the array in bytes
+/// does not fit in a `usize`.
+pub fn alloc_array_from_slice<T>(
+    area: &mut [MaybeUninit<u8>],
+    len: usize,
+) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
+    let layout = std::alloc::Layout::new::<T>();
+
+    // pad to satisfy alignment requirements
+    let padding = area.as_mut_ptr().align_offset(layout.align());
+
+    // Use overflow-safe arithmetic: a size that wraps around could otherwise pass the bounds
+    // check and produce a slice that is larger than the memory behind it.
+    let nbytes = layout.size().checked_mul(len).expect("out of memory");
+    if nbytes > area.len() || padding > area.len() - nbytes {
+        panic!("out of memory");
+    }
+    let (result_area, remain) = area[padding..].split_at_mut(nbytes);
+
+    // Build the slice straight from the raw pointer, which is valid for all of `result_area`.
+    // A `&mut` reference to the first element would only cover that one element.
+    let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
+    let result = unsafe { std::slice::from_raw_parts_mut(result_ptr, len) };
+
+    (result, remain)
+}
--- a/pgxn/neon/communicator/src/integrated_cache.rs
+++ b/pgxn/neon/communicator/src/integrated_cache.rs
@@ -0,0 +1,804 @@
+//! Integrated communicator cache
+//!
+//! It tracks:
+//! - Relation sizes and existence
+//! - Last-written LSN
+//! - Block cache (also known as LFC)
+//!
+//! TODO: limit the size
+//! TODO: concurrency
+//!
+//! Note: This deals with "relations" which is really just one "relation fork" in Postgres
+//! terms. RelFileLocator + ForkNumber is the key.
+
+//
+// TODO: Thoughts on eviction:
+//
+// There are two things we need to track, and evict if we run out of space:
+// - blocks in the file cache's file. If the file grows too large, need to evict something.
+//   Also if the cache is resized
+//
+// - entries in the cache map. If we run out of memory in the shmem area, need to evict
+//   something
+//
+
+use std::mem::MaybeUninit;
+use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
+
+use utils::lsn::{AtomicLsn, Lsn};
+
+use crate::file_cache::INVALID_CACHE_BLOCK;
+use crate::file_cache::{CacheBlock, FileCache};
+use pageserver_page_api::RelTag;
+
+use metrics::{IntCounter, IntGauge};
+
+use neon_shmem::hash::{HashMapInit, entry::Entry};
+use neon_shmem::shmem::ShmemHandle;
+
+/// Size of the relsize cache hash table, in number of entries
+const RELSIZE_CACHE_SIZE: u32 = 64 * 1024;
+
+/// This struct is initialized at postmaster startup, and passed to all the processes via fork().
+///
+/// A process turns it into the access struct it needs with `worker_process_init()` or
+/// `backend_init()`.
+pub struct IntegratedCacheInitStruct<'t> {
+    /// Relation size cache, in the fixed-size shared memory area
+    relsize_cache_handle: HashMapInit<'t, RelKey, RelEntry>,
+    /// Block mapping, in its own resizable shared memory area
+    block_map_handle: HashMapInit<'t, BlockKey, BlockEntry>,
+}
+
+/// Represents write-access to the integrated cache. This is used by the communicator process.
+#[derive(Debug)]
+pub struct IntegratedCacheWriteAccess<'t> {
+    relsize_cache: neon_shmem::hash::HashMapAccess<'t, RelKey, RelEntry>,
+    block_map: neon_shmem::hash::HashMapAccess<'t, BlockKey, BlockEntry>,
+
+    /// LSN that is returned with `CacheResult::NotFound` when the cache has no entry for the
+    /// requested relation or page. Stored as the raw u64 value of an `Lsn`.
+    global_lw_lsn: AtomicU64,
+
+    /// The file that holds the contents of the cached pages. `None` if the file cache is
+    /// disabled.
+    pub(crate) file_cache: Option<FileCache>,
+
+    // Fields for eviction
+    /// Position of the clock hand. It is taken modulo the number of buckets in `block_map`.
+    clock_hand: std::sync::Mutex<usize>,
+
+    // Metrics
+    page_evictions_counter: IntCounter,
+    clock_iterations_counter: IntCounter,
+
+    // metrics from the hash map
+    block_map_num_buckets: IntGauge,
+    block_map_num_buckets_in_use: IntGauge,
+
+    relsize_cache_num_buckets: IntGauge,
+    relsize_cache_num_buckets_in_use: IntGauge,
+}
+
+/// Represents read-only access to the integrated cache. Backend processes have this.
+///
+/// Both hash tables are attached as readers, see `IntegratedCacheInitStruct::backend_init()`.
+pub struct IntegratedCacheReadAccess<'t> {
+    relsize_cache: neon_shmem::hash::HashMapAccess<'t, RelKey, RelEntry>,
+    block_map: neon_shmem::hash::HashMapAccess<'t, BlockKey, BlockEntry>,
+}
+
+impl<'t> IntegratedCacheInitStruct<'t> {
+    /// Return the desired size in bytes of the fixed-size shared memory area to reserve for the
+    /// integrated cache.
+    pub fn shmem_size() -> usize {
+        // Only the relsize cache lives in the fixed-size area. The block map is allocated in a
+        // separate resizable area.
+        HashMapInit::<RelKey, RelEntry>::estimate_size(RELSIZE_CACHE_SIZE)
+    }
+
+    /// Initialize the shared memory segment. This runs once in postmaster. Returns a struct which
+    /// will be inherited by all processes through fork.
+    ///
+    /// `shmem_area` is the fixed-size area, and is used for the relsize cache. The block map is
+    /// created in a separate shared memory area: it starts with `initial_file_cache_size` entries,
+    /// and enough address space is reserved for `max_file_cache_size` entries.
+    ///
+    /// Panics if the separate shared memory area cannot be created.
+    pub fn shmem_init(
+        shmem_area: &'t mut [MaybeUninit<u8>],
+        initial_file_cache_size: u64,
+        max_file_cache_size: u64,
+    ) -> IntegratedCacheInitStruct<'t> {
+        // NOTE(review): the two sizes are narrowed to u32 with 'as', which silently truncates
+        // values above u32::MAX. Confirm that the callers can never pass such values.
+        let initial_entries = initial_file_cache_size as u32;
+        let max_entries = max_file_cache_size as u32;
+
+        // The relsize cache goes in the fixed-size area
+        let relsize_cache_handle = HashMapInit::with_fixed(RELSIZE_CACHE_SIZE, shmem_area);
+
+        // The block map goes in a separate resizable shared memory area
+        let max_bytes = HashMapInit::<BlockKey, BlockEntry>::estimate_size(max_entries);
+        let shmem_handle = ShmemHandle::new("block mapping", 0, max_bytes).unwrap();
+        let block_map_handle = HashMapInit::with_shmem(initial_entries, shmem_handle);
+
+        IntegratedCacheInitStruct {
+            relsize_cache_handle,
+            block_map_handle,
+        }
+    }
+
+    /// Initialize access to the integrated cache for the communicator worker process
+    ///
+    /// `lsn` is the initial value of the global last-written LSN. Both hash tables are attached
+    /// for writing, and the metrics are created. Panics if a metric cannot be created.
+    pub fn worker_process_init(
+        self,
+        lsn: Lsn,
+        file_cache: Option<FileCache>,
+    ) -> IntegratedCacheWriteAccess<'t> {
+        let counter = |name: &str, help: &str| IntCounter::new(name, help).unwrap();
+        let gauge = |name: &str, help: &str| IntGauge::new(name, help).unwrap();
+
+        IntegratedCacheWriteAccess {
+            relsize_cache: self.relsize_cache_handle.attach_writer(),
+            block_map: self.block_map_handle.attach_writer(),
+            global_lw_lsn: AtomicU64::new(lsn.0),
+            file_cache,
+            clock_hand: std::sync::Mutex::new(0),
+
+            page_evictions_counter: counter(
+                "integrated_cache_evictions",
+                "Page evictions from the Local File Cache",
+            ),
+            clock_iterations_counter: counter(
+                "clock_iterations",
+                "Number of times the clock hand has moved",
+            ),
+
+            block_map_num_buckets: gauge(
+                "block_map_num_buckets",
+                "Allocated size of the block cache hash map",
+            ),
+            block_map_num_buckets_in_use: gauge(
+                "block_map_num_buckets_in_use",
+                "Number of buckets in use in the block cache hash map",
+            ),
+
+            relsize_cache_num_buckets: gauge(
+                "relsize_cache_num_buckets",
+                "Allocated size of the relsize cache hash map",
+            ),
+            relsize_cache_num_buckets_in_use: gauge(
+                "relsize_cache_num_buckets_in_use",
+                "Number of buckets in use in the relsize cache hash map",
+            ),
+        }
+    }
+
+    /// Initialize access to the integrated cache for a backend process
+    ///
+    /// Both hash tables are attached for reading.
+    pub fn backend_init(self) -> IntegratedCacheReadAccess<'t> {
+        IntegratedCacheReadAccess {
+            relsize_cache: self.relsize_cache_handle.attach_reader(),
+            block_map: self.block_map_handle.attach_reader(),
+        }
+    }
+}
+
+/// Value stored in the cache mapping hash table.
+struct BlockEntry {
+    /// LSN that is returned with `CacheResult::NotFound` when the entry exists, but has no
+    /// cache block
+    lw_lsn: AtomicLsn,
+    /// Block in the cache file that holds the page, or INVALID_CACHE_BLOCK if there is none
+    cache_block: AtomicU64,
+
+    /// Pin count. It is incremented while the cache block is being read or written.
+    pinned: AtomicU64,
+
+    // 'referenced' bit for the clock algorithm
+    referenced: AtomicBool,
+}
+
+/// Value stored in the relsize cache hash table.
+struct RelEntry {
+    /// Cached size of the relation, in blocks.
+    /// u32::MAX means 'not known' (that's InvalidBlockNumber in Postgres)
+    nblocks: AtomicU32,
+
+    /// This is the last time the "metadata" of this relation changed, not
+    /// the contents of the blocks. That is, the size of the relation.
+    lw_lsn: AtomicLsn,
+}
+
+/// Debug output shows the current values of both fields, like the Debug impl of BlockEntry
+/// does for its fields.
+impl std::fmt::Debug for RelEntry {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
+        fmt.debug_struct("Rel")
+            .field("nblocks", &self.nblocks.load(Ordering::Relaxed))
+            .field("lw_lsn", &self.lw_lsn.load())
+            .finish()
+    }
+}
+/// Debug output shows the current values of all the fields.
+impl std::fmt::Debug for BlockEntry {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Read the atomics first, then format the values
+        let lw_lsn = self.lw_lsn.load();
+        let cache_block = self.cache_block.load(Ordering::Relaxed);
+        let pinned = self.pinned.load(Ordering::Relaxed);
+        let referenced = self.referenced.load(Ordering::Relaxed);
+
+        f.debug_struct("Block")
+            .field("lw_lsn", &lw_lsn)
+            .field("cache_block", &cache_block)
+            .field("pinned", &pinned)
+            .field("referenced", &referenced)
+            .finish()
+    }
+}
+
+/// Key of the relsize cache hash table. It is a plain wrapper around RelTag.
+#[derive(Clone, Debug, PartialEq, PartialOrd, Eq, Hash, Ord)]
+struct RelKey(RelTag);
+
+/// Build a relsize cache key from a copy of the given RelTag.
+impl From<&RelTag> for RelKey {
+    fn from(tag: &RelTag) -> Self {
+        Self(*tag)
+    }
+}
+
+/// Key of the cache mapping hash table: one block of one relation.
+#[derive(Clone, Debug, PartialEq, PartialOrd, Eq, Hash, Ord)]
+struct BlockKey {
+    rel: RelTag,
+    block_number: u32,
+}
+
+/// Build a block key from a (relation, block number) pair. The RelTag is copied.
+impl From<(&RelTag, u32)> for BlockKey {
+    fn from(pair: (&RelTag, u32)) -> Self {
+        let (rel, block_number) = pair;
+        Self {
+            rel: *rel,
+            block_number,
+        }
+    }
+}
+
+/// Return type used in the cache's get_*() functions. 'Found' means that the page, or other
+/// information that was queried, exists in the cache.
+pub enum CacheResult<V> {
+    /// The queried page or other information existed in the cache.
+    Found(V),
+
+    /// The cache doesn't contain the page (or other queried information, like relation size). The
+    /// Lsn is the 'not_modified_since' LSN that should be used in the request to the pageserver to
+    /// read the page.
+    NotFound(Lsn),
+}
+
+impl<'t> IntegratedCacheWriteAccess<'t> {
+    /// Look up the size of a relation in the relsize cache.
+    ///
+    /// Returns `Found` with the cached size, or `NotFound` with the global last-written LSN if
+    /// the lookup finds nothing.
+    pub fn get_rel_size(&'t self, rel: &RelTag) -> CacheResult<u32> {
+        match get_rel_size(&self.relsize_cache, rel) {
+            Some(nblocks) => CacheResult::Found(nblocks),
+            None => CacheResult::NotFound(Lsn(self.global_lw_lsn.load(Ordering::Relaxed))),
+        }
+    }
+
+    /// Read block `block_number` of relation `rel` from the cache into `dst`.
+    ///
+    /// Returns `Found(())` if the page was in the cache, and has been read into `dst`. Otherwise
+    /// returns `NotFound` with an LSN: the `lw_lsn` of the block entry if there is an entry
+    /// without a cache block, or the global last-written LSN if there is no entry at all. An
+    /// error from reading the cache file is returned as is.
+    ///
+    /// Panics if a cache block is found but `file_cache` is `None`.
+    pub async fn get_page(
+        &'t self,
+        rel: &RelTag,
+        block_number: u32,
+        dst: impl uring_common::buf::IoBufMut + Send + Sync,
+    ) -> Result<CacheResult<()>, std::io::Error> {
+        let x = if let Some(block_entry) = self.block_map.get(&BlockKey::from((rel, block_number)))
+        {
+            // Mark the entry as recently used, for the clock algorithm
+            block_entry.referenced.store(true, Ordering::Relaxed);
+
+            let cache_block = block_entry.cache_block.load(Ordering::Relaxed);
+            if cache_block != INVALID_CACHE_BLOCK {
+                // pin it and release lock
+                block_entry.pinned.fetch_add(1, Ordering::Relaxed);
+
+                // NOTE(review): DeferredUnpin is defined elsewhere. Presumably it decrements the
+                // pin count through this pointer when it is dropped - confirm.
+                (cache_block, DeferredUnpin(block_entry.pinned.as_ptr()))
+            } else {
+                return Ok(CacheResult::NotFound(block_entry.lw_lsn.load()));
+            }
+        } else {
+            let lsn = Lsn(self.global_lw_lsn.load(Ordering::Relaxed));
+            return Ok(CacheResult::NotFound(lsn));
+        };
+
+        // The pin is held across the read. '_deferred_pin' lives until the end of the function,
+        // also when the read fails and '?' returns early.
+        let (cache_block, _deferred_pin) = x;
+        self.file_cache
+            .as_ref()
+            .unwrap()
+            .read_block(cache_block, dst)
+            .await?;
+
+        // unpin the entry (by implicitly dropping deferred_pin)
+        Ok(CacheResult::Found(()))
+    }
+
+    /// Check if block `block_number` of relation `rel` is in the cache, without reading it.
+    ///
+    /// Returns `Found(())` if the page has a cache block. Otherwise returns `NotFound` with the
+    /// `lw_lsn` of the block entry, or with the global last-written LSN if there is no entry.
+    pub async fn page_is_cached(
+        &'t self,
+        rel: &RelTag,
+        block_number: u32,
+    ) -> Result<CacheResult<()>, std::io::Error> {
+        let key = BlockKey::from((rel, block_number));
+        let Some(block_entry) = self.block_map.get(&key) else {
+            let lsn = Lsn(self.global_lw_lsn.load(Ordering::Relaxed));
+            return Ok(CacheResult::NotFound(lsn));
+        };
+
+        // This is used for prefetch requests. Treat the probe as an 'access', to keep it
+        // in cache.
+        block_entry.referenced.store(true, Ordering::Relaxed);
+
+        match block_entry.cache_block.load(Ordering::Relaxed) {
+            INVALID_CACHE_BLOCK => Ok(CacheResult::NotFound(block_entry.lw_lsn.load())),
+            _ => Ok(CacheResult::Found(())),
+        }
+    }
+
+    /// Does the relation exist? CacheResult::NotFound means that the cache doesn't contain that
+    /// information, i.e. we don't know if the relation exists or not.
+    pub fn get_rel_exists(&'t self, rel: &RelTag) -> CacheResult<bool> {
+        // we don't currently cache negative entries, so if the relation is in the cache, it exists
+        let is_cached = self.relsize_cache.get(&RelKey::from(rel)).is_some();
+        if !is_cached {
+            let lsn = Lsn(self.global_lw_lsn.load(Ordering::Relaxed));
+            return CacheResult::NotFound(lsn);
+        }
+        CacheResult::Found(true)
+    }
+
+    /// Look up the size of a database. Database sizes are not cached, so this always returns
+    /// `NotFound`, with the global last-written LSN.
+    pub fn get_db_size(&'t self, _db_oid: u32) -> CacheResult<u64> {
+        // TODO: it would be nice to cache database sizes too. Getting the database size
+        // is not a very common operation, but when you do it, it's often interactive, with
+        // e.g. psql \l+ command, so the user will feel the latency.
+
+        // fixme: is this right lsn?
+        CacheResult::NotFound(Lsn(self.global_lw_lsn.load(Ordering::Relaxed)))
+    }
+
+    /// Remember the size of a relation in the relsize cache.
+    ///
+    /// An existing entry is updated with the new size and LSN, otherwise a new entry is
+    /// inserted. Panics with "out of memory" if a new entry cannot be inserted.
+    pub fn remember_rel_size(&'t self, rel: &RelTag, nblocks: u32, lsn: Lsn) {
+        match self.relsize_cache.entry(RelKey::from(rel)) {
+            Entry::Occupied(occupied) => {
+                tracing::info!("updating rel entry for {rel:?}, {nblocks} blocks");
+                let rel_entry = occupied.get();
+                rel_entry.nblocks.store(nblocks, Ordering::Relaxed);
+                rel_entry.lw_lsn.store(lsn);
+            }
+            Entry::Vacant(vacant) => {
+                tracing::info!("inserting rel entry for {rel:?}, {nblocks} blocks");
+                let new_entry = RelEntry {
+                    nblocks: AtomicU32::new(nblocks),
+                    lw_lsn: AtomicLsn::new(lsn.0),
+                };
+                // FIXME: what to do if we run out of memory? Evict other relation entries?
+                _ = vacant.insert(new_entry).expect("out of memory");
+            }
+        }
+    }
+
+    /// Remember the given page contents in the cache.
+    ///
+    /// The contents of `src` are written to a block in the cache file, and the block entry for
+    /// (`rel`, `block_number`) is created or updated to point to it.
+    ///
+    /// With `is_write` set, the page may already have a cache block, which is then overwritten
+    /// in place. Without it, the page is expected not to have a cache block yet.
+    ///
+    /// Panics if the file cache is disabled, if writing to the cache file fails, if a new block
+    /// entry cannot be inserted ("out of memory"), or if the block entry is in an unexpected
+    /// state (pinned, or already cached in the `!is_write` case).
+    pub async fn remember_page(
+        &'t self,
+        rel: &RelTag,
+        block_number: u32,
+        src: impl uring_common::buf::IoBuf + Send + Sync,
+        lw_lsn: Lsn,
+        is_write: bool,
+    ) {
+        let key = BlockKey::from((rel, block_number));
+
+        // FIXME: make this work when file cache is disabled. Or make it mandatory
+        let file_cache = self.file_cache.as_ref().unwrap();
+
+        if is_write {
+            // there should be no concurrent IOs. If a backend tries to read the page
+            // at the same time, they may get a torn write. That's the same as with
+            // regular POSIX filesystem read() and write()
+
+            // First check if we have a block in cache already
+            let mut old_cache_block = None;
+            let mut found_existing = false;
+
+            // NOTE(quantumish): honoring original semantics here (used to be update_with_fn)
+            // but I don't see any reason why this has to take a write lock.
+            if let Entry::Occupied(e) = self.block_map.entry(key.clone()) {
+                let block_entry = e.get();
+                found_existing = true;
+
+                // Prevent this entry from being evicted
+                let pin_count = block_entry.pinned.fetch_add(1, Ordering::Relaxed);
+                if pin_count > 0 {
+                    // this is unexpected, because the caller has obtained the io-in-progress lock,
+                    // so no one else should try to modify the page at the same time.
+                    // XXX: and I think a read should not be happening either, because the postgres
+                    // buffer is held locked. TODO: check these conditions and tidy this up a little. Seems fragile to just panic.
+                    panic!("block entry was unexpectedly pinned");
+                }
+
+                let cache_block = block_entry.cache_block.load(Ordering::Relaxed);
+                old_cache_block = if cache_block != INVALID_CACHE_BLOCK {
+                    Some(cache_block)
+                } else {
+                    None
+                };
+            }
+
+            // Allocate a new block if required
+            //
+            // NOTE(review): this loops until a block is obtained. If there are no free blocks
+            // and nothing can be evicted, it never terminates.
+            let cache_block = old_cache_block.unwrap_or_else(|| {
+                loop {
+                    if let Some(x) = file_cache.alloc_block() {
+                        break x;
+                    }
+                    if let Some(x) = self.try_evict_one_cache_block() {
+                        break x;
+                    }
+                }
+            });
+
+            // Write the page to the cache file
+            file_cache
+                .write_block(cache_block, src)
+                .await
+                .expect("error writing to cache");
+            // FIXME: handle errors gracefully.
+            // FIXME: unpin the block entry on error
+
+            // Update the block entry
+            let entry = self.block_map.entry(key);
+            // The entry must not have appeared or disappeared while the page was being written
+            assert_eq!(found_existing, matches!(entry, Entry::Occupied(_)));
+            match entry {
+                Entry::Occupied(e) => {
+                    let block_entry = e.get();
+                    // Update the cache block
+                    //
+                    // Either the entry had no cache block and now gets the new one, or it
+                    // already pointed to the block that was just overwritten.
+                    let old_blk = block_entry.cache_block.compare_exchange(
+                        INVALID_CACHE_BLOCK,
+                        cache_block,
+                        Ordering::Relaxed,
+                        Ordering::Relaxed,
+                    );
+                    assert!(old_blk == Ok(INVALID_CACHE_BLOCK) || old_blk == Err(cache_block));
+
+                    block_entry.lw_lsn.store(lw_lsn);
+
+                    block_entry.referenced.store(true, Ordering::Relaxed);
+
+                    // Release the pin that was taken before the write
+                    let pin_count = block_entry.pinned.fetch_sub(1, Ordering::Relaxed);
+                    assert!(pin_count > 0);
+                }
+                Entry::Vacant(e) => {
+                    // FIXME: what to do if we run out of memory? Evict other relation entries? Remove
+                    // block entries first?
+                    _ = e
+                        .insert(BlockEntry {
+                            lw_lsn: AtomicLsn::new(lw_lsn.0),
+                            cache_block: AtomicU64::new(cache_block),
+                            pinned: AtomicU64::new(0),
+                            referenced: AtomicBool::new(true),
+                        })
+                        .expect("out of memory");
+                }
+            }
+        } else {
+            // !is_write
+            //
+            // We can assume that it doesn't already exist, because the
+            // caller is assumed to have already checked it, and holds
+            // the io-in-progress lock. (The BlockEntry might exist, but no cache block)
+
+            // Allocate a new block first
+            //
+            // NOTE(review): like in the is_write case, this loops until a block is obtained
+            let cache_block = {
+                loop {
+                    if let Some(x) = file_cache.alloc_block() {
+                        break x;
+                    }
+                    if let Some(x) = self.try_evict_one_cache_block() {
+                        break x;
+                    }
+                }
+            };
+
+            // Write the page to the cache file
+            file_cache
+                .write_block(cache_block, src)
+                .await
+                .expect("error writing to cache");
+            // FIXME: handle errors gracefully.
+
+            match self.block_map.entry(key) {
+                Entry::Occupied(e) => {
+                    let block_entry = e.get();
+                    // FIXME: could there be concurrent readers?
+                    assert!(block_entry.pinned.load(Ordering::Relaxed) == 0);
+
+                    // NOTE(review): unlike in the is_write case, 'lw_lsn' and 'referenced' of
+                    // the existing entry are left as they are. Confirm that this is intended.
+                    let old_cache_block =
+                        block_entry.cache_block.swap(cache_block, Ordering::Relaxed);
+                    if old_cache_block != INVALID_CACHE_BLOCK {
+                        panic!(
+                            "remember_page called in !is_write mode, but page is already cached at blk {old_cache_block}"
+                        );
+                    }
+                }
+                Entry::Vacant(e) => {
+                    // FIXME: what to do if we run out of memory? Evict other relation entries? Remove
+                    // block entries first?
+                    _ = e
+                        .insert(BlockEntry {
+                            lw_lsn: AtomicLsn::new(lw_lsn.0),
+                            cache_block: AtomicU64::new(cache_block),
+                            pinned: AtomicU64::new(0),
+                            referenced: AtomicBool::new(true),
+                        })
+                        .expect("out of memory");
+                }
+            }
+        }
+    }
+
+    /// Forget information about given relation in the cache. (For DROP TABLE and such)
+    ///
+    /// This removes the relation's entry from the relsize cache, and advances the global
+    /// last-written LSN to `flush_lsn` if that is higher. `_nblocks` is not used.
+    ///
+    /// NOTE: the cached blocks of the relation are not forgotten yet, see the FIXME below.
+    pub fn forget_rel(&'t self, rel: &RelTag, _nblocks: Option<u32>, flush_lsn: Lsn) {
+        tracing::info!("forgetting rel entry for {rel:?}");
+        self.relsize_cache.remove(&RelKey::from(rel));
+
+        // update with flush LSN
+        let _ = self.global_lw_lsn.fetch_max(flush_lsn.0, Ordering::Relaxed);
+
+        // also forget all cached blocks for the relation
+        // FIXME: this is not implemented. The disabled code below refers to 'cache_tree' and
+        // 'update_with_fn', which this struct does not have.
+        /*
+            let mut iter = MapIterator::new(&key_range_for_rel_blocks(rel));
+            let r = self.cache_tree.start_read();
+            while let Some((k, _v)) = iter.next(&r) {
+                let w = self.cache_tree.start_write();
+
+                let mut evicted_cache_block = None;
+
+                let res = w.update_with_fn(&k, |e| {
+                    if let Some(e) = e {
+                        let block_entry = if let MapEntry::Block(e) = e {
+                            e
+                        } else {
+                            panic!("unexpected map entry type for block key");
+                        };
+                        let cache_block = block_entry
+                            .cache_block
+                            .swap(INVALID_CACHE_BLOCK, Ordering::Relaxed);
+                        if cache_block != INVALID_CACHE_BLOCK {
+                            evicted_cache_block = Some(cache_block);
+                        }
+                        UpdateAction::Remove
+                    } else {
+                        UpdateAction::Nothing
+                    }
+                });
+
+                // FIXME: It's pretty surprising to run out of memory while removing. But
+                // maybe it can happen because of trying to shrink a node?
+                res.expect("out of memory");
+
+                if let Some(evicted_cache_block) = evicted_cache_block {
+                    self.file_cache
+                        .as_ref()
+                        .unwrap()
+                        .dealloc_block(evicted_cache_block);
+                }
+        }
+
+            */
+    }
+
+    // Maintenance routines
+
+    /// Evict one block from the file cache. This is used when the file cache fills up.
+    /// Returns the evicted block. It's not put to the free list, so it's available for the
+    /// caller to use immediately.
+    ///
+    /// Candidates are picked by advancing `clock_hand` over the buckets of `block_map`:
+    /// an entry whose `referenced` flag is set gets the flag cleared and is passed over,
+    /// an entry whose flag was already clear is evicted, unless it is pinned.
+    ///
+    /// Gives up and returns `None` after looking at 100 buckets without evicting a block.
+    pub fn try_evict_one_cache_block(&self) -> Option<CacheBlock> {
+        // The clock hand mutex is held until this function returns
+        let mut clock_hand = self.clock_hand.lock().unwrap();
+        for _ in 0..100 {
+            self.clock_iterations_counter.inc();
+
+            (*clock_hand) += 1;
+
+            let mut evict_this = false;
+            let num_buckets = self.block_map.get_num_buckets();
+            match self
+                .block_map
+                .get_at_bucket((*clock_hand) % num_buckets)
+                .as_deref()
+            {
+                None => {
+                    // This bucket was unused
+                }
+                Some((_, blk_entry)) => {
+                    // swap() clears the flag and tells us what it was before: an entry
+                    // that was already unreferenced is a candidate for eviction.
+                    if !blk_entry.referenced.swap(false, Ordering::Relaxed) {
+                        // Evict this. Maybe.
+                        evict_this = true;
+                    }
+                }
+            };
+
+            if evict_this {
+                // grab the write lock
+                let mut evicted_cache_block = None;
+                if let Some(e) = self.block_map.entry_at_bucket(*clock_hand % num_buckets) {
+                    let old = e.get();
+                    // note: all the accesses to 'pinned' currently happen
+                    // within update_with_fn(), or while holding ValueReadGuard, which protects from concurrent
+                    // updates. Otherwise, another thread could set the 'pinned'
+                    // flag just after we have checked it here.
+                    if old.pinned.load(Ordering::Relaxed) == 0 {
+                        // Fold the entry's last-written LSN into the global one before
+                        // the entry goes away
+                        let _ = self
+                            .global_lw_lsn
+                            .fetch_max(old.lw_lsn.load().0, Ordering::Relaxed);
+                        let cache_block =
+                            old.cache_block.swap(INVALID_CACHE_BLOCK, Ordering::Relaxed);
+                        if cache_block != INVALID_CACHE_BLOCK {
+                            evicted_cache_block = Some(cache_block);
+                        }
+                        // The entry is removed even if it had no valid cache block. In
+                        // that case there is nothing to return, and the scan goes on.
+                        e.remove();
+                    }
+                }
+
+                if evicted_cache_block.is_some() {
+                    self.page_evictions_counter.inc();
+                    return evicted_cache_block;
+                }
+            }
+        }
+        // Give up if we didn't find anything
+        None
+    }
+
+    /// Change the size of the local file cache to `num_blocks`.
+    ///
+    /// Only growing is supported for now: a request that does not exceed the current
+    /// number of buckets in the block map is ignored. If growing fails, a warning is
+    /// logged and the size stays as it was.
+    pub fn resize_file_cache(&self, num_blocks: u32) {
+        let current_blocks = self.block_map.get_num_buckets() as u32;
+
+        if num_blocks <= current_blocks {
+            // TODO: Shrinking not implemented yet
+            return;
+        }
+
+        if let Err(err) = self.block_map.grow(num_blocks) {
+            tracing::warn!(
+                "could not grow file cache to {} blocks (old size {}): {}",
+                num_blocks,
+                current_blocks,
+                err
+            );
+        }
+    }
+
+    /// Placeholder for dumping the contents of the cache map to `_dst`.
+    ///
+    /// Currently does nothing: the implementation is commented out (see the FIXME).
+    pub fn dump_map(&self, _dst: &mut dyn std::io::Write) {
+        //FIXME self.cache_map.start_read().dump(dst);
+    }
+}
+
+/// Exposes the cache's eviction counters and hash map size gauges as metrics.
+impl metrics::core::Collector for IntegratedCacheWriteAccess<'_> {
+    /// Descriptors of all the metrics, in the same order as `collect()` reports them.
+    fn desc(&self) -> Vec<&metrics::core::Desc> {
+        let mut all = Vec::new();
+        all.extend(self.page_evictions_counter.desc());
+        all.extend(self.clock_iterations_counter.desc());
+
+        all.extend(self.block_map_num_buckets.desc());
+        all.extend(self.block_map_num_buckets_in_use.desc());
+
+        all.extend(self.relsize_cache_num_buckets.desc());
+        all.extend(self.relsize_cache_num_buckets_in_use.desc());
+
+        all
+    }
+
+    /// Refreshes the gauges from the current state of the two hash maps, then gathers
+    /// the values of all the metrics.
+    fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
+        // The gauges are only updated here, at collection time
+        let block_buckets = self.block_map.get_num_buckets() as i64;
+        self.block_map_num_buckets.set(block_buckets);
+        let block_buckets_in_use = self.block_map.get_num_buckets_in_use() as i64;
+        self.block_map_num_buckets_in_use.set(block_buckets_in_use);
+        let relsize_buckets = self.relsize_cache.get_num_buckets() as i64;
+        self.relsize_cache_num_buckets.set(relsize_buckets);
+        let relsize_buckets_in_use = self.relsize_cache.get_num_buckets_in_use() as i64;
+        self.relsize_cache_num_buckets_in_use
+            .set(relsize_buckets_in_use);
+
+        let mut all = Vec::new();
+        all.extend(self.page_evictions_counter.collect());
+        all.extend(self.clock_iterations_counter.collect());
+
+        all.extend(self.block_map_num_buckets.collect());
+        all.extend(self.block_map_num_buckets_in_use.collect());
+
+        all.extend(self.relsize_cache_num_buckets.collect());
+        all.extend(self.relsize_cache_num_buckets_in_use.collect());
+
+        all
+    }
+}
+
+/// Look up the size of a relation in the relation size cache.
+///
+/// Returns `None` if the relation has no entry in the cache, or if its entry holds
+/// `u32::MAX` in place of a size.
+///
+/// This is a free-standing function so that it can be shared by
+/// IntegratedCacheReadAccess::get_rel_size() and IntegratedCacheWriteAccess::get_rel_size()
+fn get_rel_size(
+    r: &neon_shmem::hash::HashMapAccess<RelKey, RelEntry>,
+    rel: &RelTag,
+) -> Option<u32> {
+    let key = RelKey::from(rel);
+    let entry = r.get(&key)?;
+    match entry.nblocks.load(Ordering::Relaxed) {
+        u32::MAX => None,
+        size => Some(size),
+    }
+}
+
+/// Accessor for other backends
+///
+/// With this, a backend can look things up in the cache directly, on its own, instead of
+/// making a request to the communicator process.
+impl<'t> IntegratedCacheReadAccess<'t> {
+    /// Cached size of the relation, or `None` if the cache does not have it.
+    pub fn get_rel_size(&'t self, rel: &RelTag) -> Option<u32> {
+        get_rel_size(&self.relsize_cache, rel)
+    }
+
+    /// Begin a read operation. The returned object starts out with no pages pinned.
+    pub fn start_read_op(&'t self) -> BackendCacheReadOp<'t> {
+        BackendCacheReadOp {
+            map_access: self,
+            read_guards: vec![],
+        }
+    }
+
+    /// Tells whether the block map has an entry for the given page
+    pub fn cache_contains_page(&'t self, rel: &RelTag, block_number: u32) -> bool {
+        matches!(
+            self.block_map.get(&BlockKey::from((rel, block_number))),
+            Some(_)
+        )
+    }
+}
+
+/// An in-progress read of one or more pages by a backend, created with
+/// `IntegratedCacheReadAccess::start_read_op()`.
+pub struct BackendCacheReadOp<'t> {
+    // One element per page pinned by get_page(). Dropping an element decrements the
+    // pin count of the corresponding block entry again.
+    read_guards: Vec<DeferredUnpin>,
+    // The cache accessor that the lookups go through
+    map_access: &'t IntegratedCacheReadAccess<'t>,
+}
+
+impl<'e> BackendCacheReadOp<'e> {
+    /// Initiate a read of the page from the cache.
+    ///
+    /// This returns the "cache block number", i.e. the block number within the cache file, where
+    /// the page's contents is stored. To get the page contents, the caller needs to read that block
+    /// from the cache file. Returns `None` if the page has no entry in the block map, or if the
+    /// entry has no valid cache block.
+    ///
+    /// On success, the block entry is marked as referenced and its pin count is incremented.
+    /// The pin is held by this `BackendCacheReadOp` and released when it is dropped, so keep
+    /// this object alive while performing the read.
+    /// After you have completed the read, call `finish()` to check if the
+    /// read was in fact valid or not. If it was concurrently invalidated, you need to retry.
+    pub fn get_page(&mut self, rel: &RelTag, block_number: u32) -> Option<u64> {
+        if let Some(block_entry) = self
+            .map_access
+            .block_map
+            .get(&BlockKey::from((rel, block_number)))
+        {
+            // Set the flag that the eviction scan checks (and clears) when choosing a victim
+            block_entry.referenced.store(true, Ordering::Relaxed);
+
+            let cache_block = block_entry.cache_block.load(Ordering::Relaxed);
+            if cache_block != INVALID_CACHE_BLOCK {
+                // Pin the entry, and remember to unpin it when this operation is dropped
+                block_entry.pinned.fetch_add(1, Ordering::Relaxed);
+                self.read_guards
+                    .push(DeferredUnpin(block_entry.pinned.as_ptr()));
+                Some(cache_block)
+            } else {
+                None
+            }
+        } else {
+            None
+        }
+    }
+
+    /// Complete the read operation, releasing all the pins taken by `get_page()` (the
+    /// operation is consumed, which drops them).
+    ///
+    /// Returns true if the reads were valid. Currently that is always the case.
+    pub fn finish(self) -> bool {
+        // TODO: currently, we hold a pin on the in-memory map, so concurrent invalidations are not
+        // possible. But if we switch to optimistic locking, this would return 'false' if the
+        // optimistic locking failed and you need to retry.
+        true
+    }
+}
+
+/// A hack to decrement an AtomicU64 on drop. This is used to decrement the pin count
+/// of a BlockEntry. The safety depends on the fact that the BlockEntry is not evicted
+/// or moved while it's pinned.
+///
+/// The pointer is the one returned by `as_ptr()` on the entry's `pinned` counter.
+struct DeferredUnpin(*mut u64);
+
+// Raw pointers are neither Sync nor Send automatically, so it is asserted by hand here.
+// NOTE(review): this relies on the pointer only ever being used for the atomic
+// decrement in Drop below, and on the pointee staying valid; confirm when adding
+// new uses.
+unsafe impl Sync for DeferredUnpin {}
+unsafe impl Send for DeferredUnpin {}
+
+impl Drop for DeferredUnpin {
+    fn drop(&mut self) {
+        // unpin it
+        // SAFETY: see the comment on the struct; the pointer must still point to the
+        // live pin counter of the entry.
+        unsafe {
+            let pin_ref = AtomicU64::from_ptr(self.0);
+            pin_ref.fetch_sub(1, Ordering::Relaxed);
+        }
+    }
+}
--- a/pgxn/neon/communicator/src/lib.rs
+++ b/pgxn/neon/communicator/src/lib.rs
@@ -1,6 +1,25 @@
-/// dummy function, just to test linking Rust functions into the C
-/// extension
-#[unsafe(no_mangle)]
-pub extern "C" fn communicator_dummy(arg: u32) -> u32 {
-    arg + 1
-}
+//! Three main parts:
+//! - async tokio communicator core, which receives requests and processes them.
+//! - Main loop and request queues, which route requests from backends to the core
+//! - the per-backend glue code, which submits requests
+
+mod backend_comms;
+
+// mark this 'pub', because these functions are called from C code. Otherwise, the compiler
+// complains about a bunch of structs and enum variants being unused, because it thinks
+// the functions that use them are never called. There are some C-callable functions in
+// other modules too, but marking this as pub is currently enough to silence the warnings
+//
+// TODO: perhaps collect *all* the extern "C" functions to one module?
+pub mod backend_interface;
+
+mod file_cache;
+mod init;
+mod integrated_cache;
+mod neon_request;
+mod worker_process;
+
+mod global_allocator;
+
+// FIXME: get this from postgres headers somehow
+pub const BLCKSZ: usize = 8192;
--- a/pgxn/neon/communicator/src/neon_request.rs
+++ b/pgxn/neon/communicator/src/neon_request.rs
@@ -0,0 +1,433 @@
+pub type CLsn = u64;
+pub type COid = u32;
+
+// This conveniently matches PG_IOV_MAX
+pub const MAX_GETPAGEV_PAGES: usize = 32;
+
+use std::ffi::CStr;
+
+use pageserver_page_api::{self as page_api, SlruKind};
+
+/// An IO request, in a C-compatible (`#[repr(C)]`) representation.
+///
+/// Every variant except `Empty` carries a payload struct that has a `request_id`
+/// field; see `NeonIORequest::request_id()`.
+#[allow(clippy::large_enum_variant)]
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub enum NeonIORequest {
+    Empty,
+
+    // Read requests. These are C-friendly variants of the corresponding structs in
+    // pageserver_page_api.
+    RelExists(CRelExistsRequest),
+    RelSize(CRelSizeRequest),
+    GetPageV(CGetPageVRequest),
+    ReadSlruSegment(CReadSlruSegmentRequest),
+    PrefetchV(CPrefetchVRequest),
+    DbSize(CDbSizeRequest),
+
+    // Write requests. These are needed to keep the relation size cache and LFC up-to-date.
+    // They are not sent to the pageserver.
+    WritePage(CWritePageRequest),
+    RelExtend(CRelExtendRequest),
+    RelZeroExtend(CRelZeroExtendRequest),
+    RelCreate(CRelCreateRequest),
+    RelTruncate(CRelTruncateRequest),
+    RelUnlink(CRelUnlinkRequest),
+
+    // Other requests
+    UpdateCachedRelSize(CUpdateCachedRelSizeRequest),
+}
+
+/// The result of a `NeonIORequest`, in a C-compatible (`#[repr(C)]`) representation.
+///
+/// NOTE(review): the variants presumably answer the request variants of the same name;
+/// the pairing is not enforced by the types.
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub enum NeonIOResult {
+    Empty,
+    RelExists(bool),
+    RelSize(u32),
+
+    /// the result pages are written to the shared memory addresses given in the request
+    GetPageV,
+    /// The result is written to the file, path to which is provided
+    /// in the request. The [`u64`] value here is the number of blocks.
+    ReadSlruSegment(u64),
+
+    /// A prefetch request returns as soon as the request has been received by the communicator.
+    /// It is processed in the background.
+    PrefetchVLaunched,
+
+    DbSize(u64),
+
+    // FIXME design compact error codes. Can't easily pass a string or other dynamic data.
+    // currently, this is 'errno'
+    Error(i32),
+
+    Aborted,
+
+    /// used for all write requests
+    WriteOK,
+}
+
+impl NeonIORequest {
+    /// The `request_id` stored in the payload of the request. `Empty` has no payload,
+    /// and yields 0.
+    pub fn request_id(&self) -> u64 {
+        match self {
+            Self::Empty => 0,
+            Self::RelExists(r) => r.request_id,
+            Self::RelSize(r) => r.request_id,
+            Self::GetPageV(r) => r.request_id,
+            Self::ReadSlruSegment(r) => r.request_id,
+            Self::PrefetchV(r) => r.request_id,
+            Self::DbSize(r) => r.request_id,
+            Self::WritePage(r) => r.request_id,
+            Self::RelExtend(r) => r.request_id,
+            Self::RelZeroExtend(r) => r.request_id,
+            Self::RelCreate(r) => r.request_id,
+            Self::RelTruncate(r) => r.request_id,
+            Self::RelUnlink(r) => r.request_id,
+            Self::UpdateCachedRelSize(r) => r.request_id,
+        }
+    }
+}
+
+/// A fixed-size array of cache block numbers, one slot per page of a GetPageV request
+/// (`MAX_GETPAGEV_PAGES` slots).
+///
+/// NOTE(review): presumably slots beyond the number of requested pages are unused,
+/// and the numbers are block numbers within the cache file; confirm against the users.
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CCachedGetPageVResult {
+    pub cache_block_numbers: [u64; MAX_GETPAGEV_PAGES],
+}
+
+/// ShmemBuf represents a buffer in shared memory.
+///
+/// Only the start address is stored. The `IoBuf` implementation for this type treats
+/// every buffer as `BLCKSZ` bytes long.
+///
+/// SAFETY: The pointer must point to an area in shared memory. The functions allow you to liberally
+/// get a mutable pointer to the contents; it is the caller's responsibility to ensure that you
+/// don't access a buffer that you're not allowed to. Inappropriate access to the buffer doesn't
+/// violate Rust's safety semantics, but it will mess up and crash Postgres.
+///
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct ShmemBuf {
+    // Start address of the buffer. Must point into a buffer in shared memory!
+    pub ptr: *mut u8,
+}
+
+// ShmemBuf holds a raw pointer, so it is neither Send nor Sync automatically. Assert
+// it by hand; coordinating access to the buffer contents is the user's job (see the
+// SAFETY comment on the struct).
+unsafe impl Send for ShmemBuf {}
+unsafe impl Sync for ShmemBuf {}
+
+// Lets a ShmemBuf be used as an IO buffer. The buffer is always reported as exactly
+// BLCKSZ bytes long, all of them initialized.
+unsafe impl uring_common::buf::IoBuf for ShmemBuf {
+    fn stable_ptr(&self) -> *const u8 {
+        self.ptr
+    }
+
+    fn bytes_init(&self) -> usize {
+        crate::BLCKSZ
+    }
+
+    fn bytes_total(&self) -> usize {
+        crate::BLCKSZ
+    }
+}
+
+// Mutable counterpart of the IoBuf implementation.
+unsafe impl uring_common::buf::IoBufMut for ShmemBuf {
+    fn stable_mut_ptr(&mut self) -> *mut u8 {
+        self.ptr
+    }
+
+    /// There is no initialization state to track, because the whole buffer always counts
+    /// as initialized. This only checks that `pos` is within the buffer, and panics
+    /// if it is not.
+    unsafe fn set_init(&mut self, pos: usize) {
+        assert!(
+            pos <= crate::BLCKSZ,
+            "set_init called past end of buffer, pos {}, buffer size {}",
+            pos,
+            crate::BLCKSZ
+        );
+    }
+}
+
+impl ShmemBuf {
+    /// Returns the start address of the buffer as a mutable pointer. Note that this
+    /// only needs `&self`; see the SAFETY comment on the struct.
+    pub fn as_mut_ptr(&self) -> *mut u8 {
+        self.ptr
+    }
+}
+
+/// Payload of `NeonIORequest::RelExists`.
+///
+/// `spc_oid`, `db_oid`, `rel_number` and `fork_number` identify the relation; `reltag()`
+/// turns them into a `page_api::RelTag`.
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CRelExistsRequest {
+    pub request_id: u64,
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+}
+
+/// Payload of `NeonIORequest::RelSize`.
+///
+/// `spc_oid`, `db_oid`, `rel_number` and `fork_number` identify the relation; `reltag()`
+/// turns them into a `page_api::RelTag`.
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CRelSizeRequest {
+    pub request_id: u64,
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+}
+
+/// Payload of `NeonIORequest::GetPageV`.
+///
+/// `spc_oid`, `db_oid`, `rel_number` and `fork_number` identify the relation; `reltag()`
+/// turns them into a `page_api::RelTag`.
+///
+/// NOTE(review): presumably the request covers `nblocks` consecutive blocks starting at
+/// `block_number`, with block i going to `dest[i]`; confirm against the code that
+/// processes the request.
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CGetPageVRequest {
+    pub request_id: u64,
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+    pub block_number: u32,
+    pub nblocks: u8,
+
+    // These fields define where the result is written. Must point into a buffer in shared memory!
+    // There is room for MAX_GETPAGEV_PAGES destination buffers.
+    pub dest: [ShmemBuf; MAX_GETPAGEV_PAGES],
+}
+
+/// Payload of `NeonIORequest::ReadSlruSegment`.
+///
+/// The segment is not returned in memory: it is written to the file named by
+/// `destination_file_path`.
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CReadSlruSegmentRequest {
+    pub request_id: u64,
+    pub slru_kind: SlruKind,
+    pub segment_number: u32,
+    pub request_lsn: CLsn,
+    /// Must be a null-terminated C string containing the file path
+    /// where the communicator will write the SLRU segment.
+    pub destination_file_path: ShmemBuf,
+}
+
+impl CReadSlruSegmentRequest {
+    /// Returns the path of the file that the communicator will write the SLRU
+    /// segment to, as an owned string.
+    ///
+    /// The `destination_file_path` buffer is read as a null-terminated C string. Any
+    /// bytes that are not valid UTF-8 are replaced in the conversion.
+    pub(crate) fn destination_file_path(&self) -> String {
+        let raw_path = self.destination_file_path.as_mut_ptr() as *const _;
+        let c_path = unsafe { CStr::from_ptr(raw_path) };
+        c_path.to_string_lossy().into_owned()
+    }
+}
+
+/// Payload of `NeonIORequest::PrefetchV`.
+///
+/// It has the same fields as `CGetPageVRequest`, except that there are no destination
+/// buffers. `reltag()` turns the relation-identifying fields into a `page_api::RelTag`.
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CPrefetchVRequest {
+    pub request_id: u64,
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+    pub block_number: u32,
+    pub nblocks: u8,
+}
+
+/// Payload of `NeonIORequest::DbSize`. The database is identified by `db_oid` alone.
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CDbSizeRequest {
+    pub request_id: u64,
+    pub db_oid: COid,
+}
+
+/// Payload of `NeonIORequest::WritePage`, one of the write requests.
+///
+/// `spc_oid`, `db_oid`, `rel_number` and `fork_number` identify the relation; `reltag()`
+/// turns them into a `page_api::RelTag`.
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CWritePageRequest {
+    pub request_id: u64,
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+    pub block_number: u32,
+    pub lsn: CLsn,
+
+    // This field defines the page contents, like the same field in CRelExtendRequest.
+    // Must point into a buffer in shared memory!
+    // NOTE(review): the comment here used to describe this as the place where "the
+    // result is written", which looks like a leftover from CGetPageVRequest; confirm.
+    pub src: ShmemBuf,
+}
+
+/// Payload of `NeonIORequest::RelExtend`, one of the write requests.
+///
+/// `spc_oid`, `db_oid`, `rel_number` and `fork_number` identify the relation; `reltag()`
+/// turns them into a `page_api::RelTag`.
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CRelExtendRequest {
+    pub request_id: u64,
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+    pub block_number: u32,
+    pub lsn: CLsn,
+
+    // This field defines the page contents. Must point into a buffer in shared memory!
+    pub src: ShmemBuf,
+}
+
+/// Payload of `NeonIORequest::RelZeroExtend`, one of the write requests.
+///
+/// `spc_oid`, `db_oid`, `rel_number` and `fork_number` identify the relation; `reltag()`
+/// turns them into a `page_api::RelTag`.
+///
+/// NOTE(review): presumably this describes extending the relation with `nblocks`
+/// zero-filled pages starting at `block_number`; confirm against the code that
+/// processes the request.
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CRelZeroExtendRequest {
+    pub request_id: u64,
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+    pub block_number: u32,
+    pub nblocks: u32,
+    pub lsn: CLsn,
+}
+
+/// Payload of `NeonIORequest::RelCreate`, one of the write requests.
+///
+/// `spc_oid`, `db_oid`, `rel_number` and `fork_number` identify the relation; `reltag()`
+/// turns them into a `page_api::RelTag`.
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CRelCreateRequest {
+    pub request_id: u64,
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+    pub lsn: CLsn,
+}
+
+/// Payload of `NeonIORequest::RelTruncate`, one of the write requests.
+///
+/// `spc_oid`, `db_oid`, `rel_number` and `fork_number` identify the relation; `reltag()`
+/// turns them into a `page_api::RelTag`.
+///
+/// NOTE(review): `nblocks` is presumably the size of the relation after the
+/// truncation; confirm against the code that processes the request.
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CRelTruncateRequest {
+    pub request_id: u64,
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+    pub nblocks: u32,
+    pub lsn: CLsn,
+}
+
+/// Payload of `NeonIORequest::RelUnlink`, one of the write requests.
+///
+/// `spc_oid`, `db_oid`, `rel_number` and `fork_number` identify the relation; `reltag()`
+/// turns them into a `page_api::RelTag`.
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CRelUnlinkRequest {
+    pub request_id: u64,
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+    pub lsn: CLsn,
+}
+
+/// Generates the `reltag()` method for request structs.
+///
+/// All the request structs that refer to a relation identify it with the same four
+/// fields (`spc_oid`, `db_oid`, `rel_number`, `fork_number`), so the conversion to
+/// `page_api::RelTag` is the same for all of them. Generating it from one place keeps
+/// the copies from drifting apart.
+macro_rules! impl_reltag {
+    ($($request:ty),+ $(,)?) => {
+        $(
+            impl $request {
+                /// The relation that this request is about.
+                pub fn reltag(&self) -> page_api::RelTag {
+                    page_api::RelTag {
+                        spcnode: self.spc_oid,
+                        dbnode: self.db_oid,
+                        relnode: self.rel_number,
+                        forknum: self.fork_number,
+                    }
+                }
+            }
+        )+
+    };
+}
+
+impl_reltag!(
+    CRelExistsRequest,
+    CRelSizeRequest,
+    CGetPageVRequest,
+    CPrefetchVRequest,
+    CWritePageRequest,
+    CRelExtendRequest,
+    CRelZeroExtendRequest,
+    CRelCreateRequest,
+    CRelTruncateRequest,
+    CRelUnlinkRequest,
+);
+
+/// Payload of `NeonIORequest::UpdateCachedRelSize`.
+///
+/// `spc_oid`, `db_oid`, `rel_number` and `fork_number` identify the relation; `reltag()`
+/// turns them into a `page_api::RelTag`.
+///
+/// NOTE(review): `nblocks` is presumably the relation size to store in the relation
+/// size cache; confirm against the code that processes the request.
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CUpdateCachedRelSizeRequest {
+    pub request_id: u64,
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+    pub nblocks: u32,
+    pub lsn: CLsn,
+}
+
+impl CUpdateCachedRelSizeRequest {
+    /// The relation that this request is about.
+    pub fn reltag(&self) -> page_api::RelTag {
+        let Self {
+            spc_oid: spcnode,
+            db_oid: dbnode,
+            rel_number: relnode,
+            fork_number: forknum,
+            ..
+        } = *self;
+
+        page_api::RelTag {
+            spcnode,
+            dbnode,
+            relnode,
+            forknum,
+        }
+    }
+}
--- a/pgxn/neon/communicator/src/worker_process/callbacks.rs
+++ b/pgxn/neon/communicator/src/worker_process/callbacks.rs
@@ -0,0 +1,28 @@
+//! C callbacks to PostgreSQL facilities that the neon extension needs
+//! to provide. These are implemented in `neon/pgxn/communicator_new.c`.
+//! The function signatures better match!
+//!
+//! These are called from the communicator threads! Careful what you do, most
+//! Postgres functions are not safe to call in that context.
+
+use utils::lsn::Lsn;
+
+// Declarations of the functions implemented on the C side (see the module comment).
+// The compiler cannot check these signatures against the C definitions, so calling
+// them is unsafe; use the safe wrappers below instead.
+unsafe extern "C" {
+    pub fn notify_proc_unsafe(procno: std::ffi::c_int);
+    pub fn callback_set_my_latch_unsafe();
+    pub fn callback_get_request_lsn_unsafe() -> u64;
+}
+
+// safe wrappers
+
+/// Safe wrapper around the C function `notify_proc_unsafe`; `procno` is passed through
+/// as is.
+///
+/// NOTE(review): presumably this wakes up the backend process with the given proc
+/// number; confirm against the C implementation.
+pub(super) fn notify_proc(procno: std::ffi::c_int) {
+    unsafe { notify_proc_unsafe(procno) };
+}
+
+/// Safe wrapper around the C function `callback_set_my_latch_unsafe`.
+///
+/// NOTE(review): going by the name, this sets the latch of the communicator process,
+/// to wake up its main thread; confirm against the C implementation.
+pub(super) fn callback_set_my_latch() {
+    unsafe { callback_set_my_latch_unsafe() };
+}
+
+/// Safe wrapper around the C function `callback_get_request_lsn_unsafe`. The raw `u64`
+/// that it returns is wrapped in an `Lsn`.
+pub(super) fn get_request_lsn() -> Lsn {
+    Lsn(unsafe { callback_get_request_lsn_unsafe() })
+}
--- a/pgxn/neon/communicator/src/worker_process/in_progress_ios.rs
+++ b/pgxn/neon/communicator/src/worker_process/in_progress_ios.rs
@@ -0,0 +1,95 @@
+//! Lock table to ensure that only one IO request is in flight for a given
+//! block (or relation or database metadata) at a time
+
+use std::cmp::Eq;
+use std::hash::Hash;
+use std::sync::Arc;
+
+use tokio::sync::{Mutex, OwnedMutexGuard};
+
+use clashmap::ClashMap;
+use clashmap::Entry;
+
+use pageserver_page_api::RelTag;
+
+/// Identifies the object that an in-progress IO request is about. Used as the key in
+/// the lock table.
+#[derive(Clone, Eq, Hash, PartialEq)]
+pub enum RequestInProgressKey {
+    // A database; the u32 is presumably the database OID (TODO confirm)
+    Db(u32),
+    // A relation
+    Rel(RelTag),
+    // A block of a relation; the u32 is presumably the block number (TODO confirm)
+    Block(RelTag, u32),
+}
+
+type RequestId = u64;
+
+pub type RequestInProgressTable = MutexHashMap<RequestInProgressKey, RequestId>;
+
+// more primitive locking thingie:
+
+/// A table of locks, one per key. A key only has an entry in the table while somebody
+/// holds the lock on it.
+pub struct MutexHashMap<K, V>
+where
+    K: Clone + Eq + Hash,
+{
+    // For each locked key: the value that the holder passed to lock(), and the mutex
+    // that the holder keeps locked for as long as it holds the key.
+    lock_table: ClashMap<K, (V, Arc<Mutex<()>>)>,
+}
+
+/// Represents a held lock on one key of a `MutexHashMap`. The lock is released when
+/// the guard is dropped.
+pub struct MutexHashMapGuard<'a, K, V>
+where
+    K: Clone + Eq + Hash,
+{
+    // The key that is locked
+    pub key: K,
+    // The table that the lock belongs to; needed to remove the entry on drop
+    map: &'a MutexHashMap<K, V>,
+    // The mutex that was put in the table for the key
+    mutex: Arc<Mutex<()>>,
+    // Keeps 'mutex' locked until the guard is dropped
+    _guard: OwnedMutexGuard<()>,
+}
+
+impl<'a, K, V> Drop for MutexHashMapGuard<'a, K, V>
+where
+    K: Clone + Eq + Hash,
+{
+    /// Releases the lock: the key's entry is removed from the table, and then the
+    /// mutex is unlocked, which lets any waiters proceed.
+    ///
+    /// Panics if the key has no entry in the table, or if the entry holds a different
+    /// mutex than this guard's. Neither should happen while the guard exists.
+    fn drop(&mut self) {
+        let (_old_key, old_val) = self.map.lock_table.remove(&self.key).unwrap();
+        assert!(Arc::ptr_eq(&old_val.1, &self.mutex));
+
+        // the guard will be dropped as we return
+    }
+}
+
+impl<K, V> MutexHashMap<K, V>
+where
+    K: Clone + Eq + Hash,
+    V: std::fmt::Display + Copy,
+{
+    /// Creates an empty table.
+    pub fn new() -> MutexHashMap<K, V> {
+        MutexHashMap {
+            lock_table: ClashMap::new(),
+        }
+    }
+
+    /// Locks `key`, waiting first if somebody else is holding it.
+    ///
+    /// `val` is stored in the table for as long as the lock is held. Here it is only
+    /// used for naming the holder in the log messages of those who have to wait.
+    ///
+    /// The lock is released when the returned guard is dropped.
+    pub async fn lock<'a>(&'a self, key: K, val: V) -> MutexHashMapGuard<'a, K, V> {
+        // Create our mutex and lock it before making it visible in the table, so that
+        // whoever finds it there will block on it until our guard is dropped.
+        let my_mutex = Arc::new(Mutex::new(()));
+        let my_guard = Arc::clone(&my_mutex).lock_owned().await;
+
+        loop {
+            let (request_id, lock) = match self.lock_table.entry(key.clone()) {
+                Entry::Occupied(e) => {
+                    // Somebody else holds the key. Take a reference to their mutex, so
+                    // that we can wait on it after letting go of the table entry.
+                    let e = e.get();
+                    (e.0, Arc::clone(&e.1))
+                }
+                Entry::Vacant(e) => {
+                    // The key is free: publish our mutex, and we are done
+                    e.insert((val, Arc::clone(&my_mutex)));
+                    break;
+                }
+            };
+            tracing::info!("waiting for conflicting IO {request_id} to complete");
+            // Wait for the holder to release the mutex. We don't want to keep holding
+            // it ourselves, so it is released again right away. Then retry from the
+            // top, because somebody else may have grabbed the key in the meantime.
+            let _ = lock.lock().await;
+            tracing::info!("conflicting IO {request_id} completed");
+        }
+
+        MutexHashMapGuard {
+            key,
+            map: self,
+            mutex: my_mutex,
+            _guard: my_guard,
+        }
+    }
+}
--- a/pgxn/neon/communicator/src/worker_process/logging.rs
+++ b/pgxn/neon/communicator/src/worker_process/logging.rs
@@ -0,0 +1,231 @@
+//! Glue code to hook up Rust logging with the `tracing` crate to the PostgreSQL log
+//!
+//! In the Rust threads, the log messages are written to a mpsc Channel, and the Postgres
+//! process latch is raised. That wakes up the loop in the main thread. It reads the
+//! message from the channel and ereport()s it. This ensures that only one thread, the main
+//! thread, calls the PostgreSQL logging routines at any time.
+
+use std::sync::mpsc::sync_channel;
+use std::sync::mpsc::{Receiver, SyncSender};
+use std::sync::mpsc::{TryRecvError, TrySendError};
+
+use tracing::info;
+use tracing::{Event, Level, Metadata, Subscriber};
+use tracing_subscriber::filter::LevelFilter;
+use tracing_subscriber::fmt::FmtContext;
+use tracing_subscriber::fmt::FormatEvent;
+use tracing_subscriber::fmt::FormatFields;
+use tracing_subscriber::fmt::FormattedFields;
+use tracing_subscriber::fmt::MakeWriter;
+use tracing_subscriber::fmt::format::Writer;
+use tracing_subscriber::registry::LookupSpan;
+
+use crate::worker_process::callbacks::callback_set_my_latch;
+
+/// State of the logging system. Created by `configure_logging`, and used by
+/// `pump_logging`.
+pub struct LoggingState {
+    // Receiving end of the channel that the formatted log messages are sent through
+    receiver: Receiver<FormattedEventWithMeta>,
+}
+
+/// Set up logging. Called once, at worker process startup.
+///
+/// This installs a `tracing` subscriber that formats each event at level INFO or above
+/// with `SimpleFormatter`, and puts it in a channel that has room for 1000 messages.
+///
+/// The returned LoggingState holds the receiving end of the channel. It is opaque to the
+/// C code, which just passes it back in the subsequent calls to `pump_logging`.
+#[unsafe(no_mangle)]
+pub extern "C" fn configure_logging() -> Box<LoggingState> {
+    use tracing_subscriber::prelude::*;
+
+    let (sender, receiver) = sync_channel(1000);
+
+    let fmt_layer = tracing_subscriber::fmt::layer()
+        .with_ansi(false)
+        .event_format(SimpleFormatter::new())
+        .with_writer(Maker { channel: sender })
+        // TODO: derive this from log_min_messages?
+        .with_filter(LevelFilter::from_level(Level::INFO));
+    tracing_subscriber::registry().with(fmt_layer).init();
+
+    info!("communicator process logging started");
+
+    Box::new(LoggingState { receiver })
+}
+
+/// Read one message from the logging queue. This is essentially a wrapper to Receiver,
+/// with a C-friendly signature.
+///
+/// The message is copied into *errbuf, which is a caller-supplied buffer of size `errbuf_len`.
+/// If the message doesn't fit in the buffer, it is truncated. It is always NULL-terminated.
+/// The exception is a buffer that has no room even for the terminator, i.e. `errbuf` is
+/// NULL or `errbuf_len` is 0: then nothing is written to it, and the message text is lost.
+///
+/// The error level is returned in *elevel_p. It's one of the PostgreSQL error levels, see elog.h
+///
+/// Returns 1 if a message was read, 0 if the queue was empty, and -1 if the sending side
+/// of the queue is gone.
+#[unsafe(no_mangle)]
+pub extern "C" fn pump_logging(
+    state: &mut LoggingState,
+    errbuf: *mut u8,
+    errbuf_len: u32,
+    elevel_p: &mut i32,
+) -> i32 {
+    let msg = match state.receiver.try_recv() {
+        Err(TryRecvError::Empty) => return 0,
+        Err(TryRecvError::Disconnected) => return -1,
+        Ok(msg) => msg,
+    };
+
+    let src: &[u8] = &msg.message;
+    let capacity = errbuf_len as usize;
+    // Without this check, a zero-length buffer would make 'capacity - 1' underflow, and
+    // we would end up writing past the end of the buffer.
+    if !errbuf.is_null() && capacity > 0 {
+        // one byte is reserved for the terminator
+        let len = std::cmp::min(src.len(), capacity - 1);
+        unsafe {
+            std::ptr::copy_nonoverlapping(src.as_ptr(), errbuf, len);
+            *(errbuf.add(len)) = b'\0'; // NULL terminator
+        }
+    }
+
+    // XXX: these levels are copied from PostgreSQL's elog.h. Introduce another enum
+    // to hide these?
+    *elevel_p = match msg.level {
+        Level::TRACE => 10, // DEBUG5
+        Level::DEBUG => 14, // DEBUG1
+        Level::INFO => 17,  // INFO
+        Level::WARN => 19,  // WARNING
+        Level::ERROR => 21, // ERROR
+    };
+
+    1
+}
+
+//---- The following functions can be called from any thread ----
+
+/// A formatted log message, together with its level. This is what gets sent through
+/// the channel to the main thread.
+#[derive(Clone)]
+struct FormattedEventWithMeta {
+    // The formatted text of the message, as bytes
+    message: Vec<u8>,
+    // The level of the event; pump_logging() maps it to a PostgreSQL error level
+    level: tracing::Level,
+}
+
+impl Default for FormattedEventWithMeta {
+    /// An event with no text, at DEBUG level.
+    fn default() -> Self {
+        Self {
+            level: Level::DEBUG,
+            message: vec![],
+        }
+    }
+}
+
+/// Writer that collects the formatted text of one log event, and sends it to the
+/// channel of the `Maker` that created it.
+///
+/// The text is sent when the writer is flushed or dropped, whichever comes first. Text
+/// that has been sent once is not sent again: a flush followed by the drop results in
+/// one message, not two. Nothing is sent if nothing has been written.
+struct EventBuilder<'a> {
+    event: FormattedEventWithMeta,
+
+    maker: &'a Maker,
+}
+
+impl EventBuilder<'_> {
+    /// Send the text collected so far, if any, leaving the buffer empty. The level is
+    /// retained, in case more text is written after this.
+    fn send_pending(&mut self) {
+        if self.event.message.is_empty() {
+            return;
+        }
+        let pending = FormattedEventWithMeta {
+            // take() moves the text out without copying it
+            message: std::mem::take(&mut self.event.message),
+            level: self.event.level,
+        };
+        self.maker.send_event(pending);
+    }
+}
+
+impl std::io::Write for EventBuilder<'_> {
+    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+        self.event.message.write(buf)
+    }
+    fn flush(&mut self) -> std::io::Result<()> {
+        self.send_pending();
+        Ok(())
+    }
+}
+
+impl Drop for EventBuilder<'_> {
+    fn drop(&mut self) {
+        // Send whatever has not been sent by a flush already
+        self.send_pending();
+    }
+}
+
+/// Creates the writers (`EventBuilder`s) that the log events are formatted into, and
+/// holds the sending end of the channel that the finished messages go to.
+struct Maker {
+    channel: SyncSender<FormattedEventWithMeta>,
+}
+
+impl<'a> MakeWriter<'a> for Maker {
+    type Writer = EventBuilder<'a>;
+
+    /// Not supported: a writer cannot be created without the event's metadata, because
+    /// the level of the event is taken from it. Panics if called.
+    fn make_writer(&'a self) -> Self::Writer {
+        panic!("not expected to be called when make_writer_for is implemented");
+    }
+
+    /// Creates a writer for one event. It starts out with no text, and with the level
+    /// taken from the event's metadata.
+    fn make_writer_for(&'a self, meta: &Metadata<'_>) -> Self::Writer {
+        EventBuilder {
+            event: FormattedEventWithMeta {
+                message: Vec::new(),
+                level: *meta.level(),
+            },
+            maker: self,
+        }
+    }
+}
+
+impl Maker {
+    /// Put a message in the channel, without blocking. If that succeeds, the main
+    /// thread is notified by setting the latch.
+    ///
+    /// The message is silently dropped if the channel is full, or if the receiving
+    /// end is gone.
+    fn send_event(&self, e: FormattedEventWithMeta) {
+        if self.channel.try_send(e).is_ok() {
+            // notify the main thread
+            callback_set_my_latch();
+        }
+        // Nothing is done about a failure to send.
+        // TODO: record that some messages were lost
+    }
+}
+
+/// Simple formatter implementation for tracing_subscriber, which prints the log
+/// spans and message part like the default formatter, but no timestamp or error
+/// level. The error level is captured separately by `FormattedEventWithMeta`,
+/// and when the error is printed by the main thread, with PostgreSQL ereport(),
+/// it gets a timestamp at that point. (The timestamp printed will therefore lag
+/// behind the timestamp on the event here, if the main thread doesn't process
+/// the log message promptly)
+struct SimpleFormatter;
+
+impl<S, N> FormatEvent<S, N> for SimpleFormatter
+where
+    S: Subscriber + for<'a> LookupSpan<'a>,
+    N: for<'a> FormatFields<'a> + 'static,
+{
+    /// Writes the event as one line: first the spans that the event is in, from the
+    /// outermost to the innermost, each printed as `name{fields}: ` (the braces are
+    /// left out if the span has no fields), and then the fields of the event itself.
+    fn format_event(
+        &self,
+        ctx: &FmtContext<'_, S, N>,
+        mut writer: Writer<'_>,
+        event: &Event<'_>,
+    ) -> std::fmt::Result {
+        // Format all the spans in the event's span context.
+        if let Some(scope) = ctx.event_scope() {
+            for span in scope.from_root() {
+                write!(writer, "{}", span.name())?;
+
+                // `FormattedFields` is a formatted representation of the span's
+                // fields, which is stored in its extensions by the `fmt` layer's
+                // `new_span` method. The fields will have been formatted
+                // by the same field formatter that's provided to the event
+                // formatter in the `FmtContext`.
+                let ext = span.extensions();
+                let fields = &ext
+                    .get::<FormattedFields<N>>()
+                    .expect("will never be `None`");
+
+                // Skip formatting the fields if the span had no fields.
+                if !fields.is_empty() {
+                    write!(writer, "{{{fields}}}")?;
+                }
+                write!(writer, ": ")?;
+            }
+        }
+
+        // Write fields on the event
+        ctx.field_format().format_fields(writer.by_ref(), event)?;
+
+        // Terminate the line
+        writeln!(writer)
+    }
+}
+
+impl SimpleFormatter {
+    /// Creates the formatter. It has no state or options.
+    fn new() -> Self {
+        Self
+    }
+}
--- a/pgxn/neon/communicator/src/worker_process/main_loop.rs
+++ b/pgxn/neon/communicator/src/worker_process/main_loop.rs
@@ -0,0 +1,820 @@
+use std::collections::HashMap;
+use std::os::fd::AsRawFd;
+use std::os::fd::OwnedFd;
+use std::path::PathBuf;
+use std::str::FromStr as _;
+
+use crate::backend_comms::NeonIORequestSlot;
+use crate::file_cache::FileCache;
+use crate::global_allocator::MyAllocatorCollector;
+use crate::init::CommunicatorInitStruct;
+use crate::integrated_cache::{CacheResult, IntegratedCacheWriteAccess};
+use crate::neon_request::{CGetPageVRequest, CPrefetchVRequest};
+use crate::neon_request::{NeonIORequest, NeonIOResult};
+use crate::worker_process::in_progress_ios::{RequestInProgressKey, RequestInProgressTable};
+use pageserver_client_grpc::{PageserverClient, ShardSpec, ShardStripeSize};
+use pageserver_page_api as page_api;
+
+use metrics::{IntCounter, IntCounterVec};
+
+use tokio::io::AsyncReadExt;
+use tokio_pipe::PipeRead;
+use uring_common::buf::IoBuf;
+use utils::id::{TenantId, TimelineId};
+
+use super::callbacks::{get_request_lsn, notify_proc};
+
+use tracing::{debug, error, info, info_span, trace};
+
+use utils::lsn::Lsn;
+
+pub struct CommunicatorWorkerProcessStruct<'a> {
+    /// Tokio runtime that the main loop and any other related tasks run in.
+    runtime: tokio::runtime::Handle,
+
+    /// Client to communicate with the pageserver
+    client: PageserverClient,
+
+    /// Request slots that backends use to send IO requests to the communicator.
+    neon_request_slots: &'a [NeonIORequestSlot],
+
+    /// Notification pipe. Backends use this to notify the communicator that a request is waiting to
+    /// be processed in one of the request slots.
+    submission_pipe_read_fd: OwnedFd,
+
+    /// Locking table for all in-progress IO requests.
+    in_progress_table: RequestInProgressTable,
+
+    /// Local File Cache, relation size tracking, last-written LSN tracking
+    pub(crate) cache: IntegratedCacheWriteAccess<'a>,
+
+    /*** Static configuration ***/
+    /// Stripe size doesn't change after startup. (The shard map is not stored here, it's passed
+    /// directly to the client)
+    stripe_size: Option<ShardStripeSize>,
+
+    /*** Metrics ***/
+    request_counters: IntCounterVec,
+    request_rel_exists_counter: IntCounter,
+    request_rel_size_counter: IntCounter,
+    request_get_pagev_counter: IntCounter,
+    request_read_slru_segment_counter: IntCounter,
+    request_prefetchv_counter: IntCounter,
+    request_db_size_counter: IntCounter,
+    request_write_page_counter: IntCounter,
+    request_rel_extend_counter: IntCounter,
+    request_rel_zero_extend_counter: IntCounter,
+    request_rel_create_counter: IntCounter,
+    request_rel_truncate_counter: IntCounter,
+    request_rel_unlink_counter: IntCounter,
+
+    getpage_cache_misses_counter: IntCounter,
+    getpage_cache_hits_counter: IntCounter,
+
+    request_nblocks_counters: IntCounterVec,
+    request_get_pagev_nblocks_counter: IntCounter,
+    request_prefetchv_nblocks_counter: IntCounter,
+    request_rel_zero_extend_nblocks_counter: IntCounter,
+
+    allocator_metrics: MyAllocatorCollector,
+}
+
+pub(super) async fn init(
+    cis: Box<CommunicatorInitStruct>,
+    tenant_id: String,
+    timeline_id: String,
+    auth_token: Option<String>,
+    shard_map: HashMap<utils::shard::ShardIndex, String>,
+    stripe_size: Option<ShardStripeSize>,
+    initial_file_cache_size: u64,
+    file_cache_path: Option<PathBuf>,
+) -> CommunicatorWorkerProcessStruct<'static> {
+    info!("Test log message");
+    let last_lsn = get_request_lsn();
+
+    let file_cache = if let Some(path) = file_cache_path {
+        Some(FileCache::new(&path, initial_file_cache_size).expect("could not create cache file"))
+    } else {
+        // FIXME: temporarily for testing, use LFC even if disabled
+        Some(
+            FileCache::new(&PathBuf::from("new_filecache"), 1000)
+                .expect("could not create cache file"),
+        )
+    };
+
+    // Initialize subsystems
+    let cache = cis
+        .integrated_cache_init_struct
+        .worker_process_init(last_lsn, file_cache);
+
+    debug!("Initialised integrated cache: {cache:?}");
+
+    let tenant_id = TenantId::from_str(&tenant_id).expect("invalid tenant ID");
+    let timeline_id = TimelineId::from_str(&timeline_id).expect("invalid timeline ID");
+    let shard_spec = ShardSpec::new(shard_map, stripe_size).expect("invalid shard spec");
+    let client = PageserverClient::new(tenant_id, timeline_id, shard_spec, auth_token, None)
+        .expect("could not create client");
+
+    let request_counters = IntCounterVec::new(
+        metrics::core::Opts::new(
+            "backend_requests_total",
+            "Number of requests from backends.",
+        ),
+        &["request_kind"],
+    )
+    .unwrap();
+    let request_rel_exists_counter = request_counters.with_label_values(&["rel_exists"]);
+    let request_rel_size_counter = request_counters.with_label_values(&["rel_size"]);
+    let request_get_pagev_counter = request_counters.with_label_values(&["get_pagev"]);
+    let request_read_slru_segment_counter =
+        request_counters.with_label_values(&["read_slru_segment"]);
+    let request_prefetchv_counter = request_counters.with_label_values(&["prefetchv"]);
+    let request_db_size_counter = request_counters.with_label_values(&["db_size"]);
+    let request_write_page_counter = request_counters.with_label_values(&["write_page"]);
+    let request_rel_extend_counter = request_counters.with_label_values(&["rel_extend"]);
+    let request_rel_zero_extend_counter = request_counters.with_label_values(&["rel_zero_extend"]);
+    let request_rel_create_counter = request_counters.with_label_values(&["rel_create"]);
+    let request_rel_truncate_counter = request_counters.with_label_values(&["rel_truncate"]);
+    let request_rel_unlink_counter = request_counters.with_label_values(&["rel_unlink"]);
+
+    let getpage_cache_misses_counter = IntCounter::new(
+        "getpage_cache_misses",
+        "Number of file cache misses in get_pagev requests.",
+    )
+    .unwrap();
+    let getpage_cache_hits_counter = IntCounter::new(
+        "getpage_cache_hits",
+        "Number of file cache hits in get_pagev requests.",
+    )
+    .unwrap();
+
+    // For the requests that affect multiple blocks, have separate counters for the # of blocks affected
+    let request_nblocks_counters = IntCounterVec::new(
+        metrics::core::Opts::new(
+            "request_nblocks_total",
+            "Number of blocks in backend requests.",
+        ),
+        &["request_kind"],
+    )
+    .unwrap();
+    let request_get_pagev_nblocks_counter =
+        request_nblocks_counters.with_label_values(&["get_pagev"]);
+    let request_prefetchv_nblocks_counter =
+        request_nblocks_counters.with_label_values(&["prefetchv"]);
+    let request_rel_zero_extend_nblocks_counter =
+        request_nblocks_counters.with_label_values(&["rel_zero_extend"]);
+
+    CommunicatorWorkerProcessStruct {
+        runtime: tokio::runtime::Handle::current(),
+        stripe_size,
+        neon_request_slots: cis.neon_request_slots,
+        client,
+        cache,
+        submission_pipe_read_fd: cis.submission_pipe_read_fd,
+        in_progress_table: RequestInProgressTable::new(),
+
+        // metrics
+        request_counters,
+        request_rel_exists_counter,
+        request_rel_size_counter,
+        request_get_pagev_counter,
+        request_read_slru_segment_counter,
+        request_prefetchv_counter,
+        request_db_size_counter,
+        request_write_page_counter,
+        request_rel_extend_counter,
+        request_rel_zero_extend_counter,
+        request_rel_create_counter,
+        request_rel_truncate_counter,
+        request_rel_unlink_counter,
+
+        getpage_cache_misses_counter,
+        getpage_cache_hits_counter,
+
+        request_nblocks_counters,
+        request_get_pagev_nblocks_counter,
+        request_prefetchv_nblocks_counter,
+        request_rel_zero_extend_nblocks_counter,
+
+        allocator_metrics: MyAllocatorCollector::new(),
+    }
+}
+
+impl<'t> CommunicatorWorkerProcessStruct<'t> {
+    /// Rebuild the shard spec from `new_shard_map` and pass it to the pageserver client
+    pub(super) fn update_shard_map(
+        &self,
+        new_shard_map: HashMap<utils::shard::ShardIndex, String>,
+    ) {
+        let shard_spec =
+            ShardSpec::new(new_shard_map, self.stripe_size.clone()).expect("invalid shard spec");
+
+        {
+            let _in_runtime = self.runtime.enter();
+            if let Err(err) = self.client.update_shards(shard_spec) {
+                tracing::error!("could not update shard map: {err:?}");
+            }
+        }
+    }
+
+    /// Main loop of the worker process. Receive requests from the backends and process them.
+    pub(super) async fn run(&'static self) {
+        let mut idxbuf: [u8; 4] = [0; 4];
+
+        let mut submission_pipe_read =
+            PipeRead::try_from(self.submission_pipe_read_fd.as_raw_fd()).expect("invalid pipe fd");
+
+        loop {
+            // Wait for a backend to ring the doorbell
+            match submission_pipe_read.read(&mut idxbuf).await {
+                Ok(4) => {}
+                Ok(nbytes) => panic!("short read ({nbytes} bytes) on communicator pipe"),
+                Err(e) => panic!("error reading from communicator pipe: {e}"),
+            }
+            let slot_idx = u32::from_ne_bytes(idxbuf) as usize;
+
+            // Read the IO request from the slot indicated in the wakeup
+            let Some(slot) = self.neon_request_slots[slot_idx].start_processing_request() else {
+                // This currently should not happen. But if we had multiple threads picking up
+                // requests, and without waiting for the notifications, it could.
+                panic!("no request in slot");
+            };
+
+            // Ok, we have ownership of this request now. We must process it now, there's no going
+            // back.
+            //
+            // Spawn a separate task for every request. That's a little excessive for requests that
+            // can be quickly satisfied from the cache, but we expect that to be rare, because the
+            // requesting backend would have already checked the cache.
+            tokio::spawn(async move {
+                use tracing::Instrument;
+
+                let request_id = slot.get_request().request_id();
+                let owner_procno = slot.get_owner_procno();
+
+                let span = info_span!(
+                    "processing",
+                    request_id = request_id,
+                    slot_idx = slot_idx,
+                    procno = owner_procno,
+                );
+                async {
+                    // FIXME: as a temporary hack, abort the request if we don't get a response
+                    // promptly.
+                    //
+                    // Lots of regression tests are getting stuck and failing at the moment,
+                    // this makes them fail a little faster, which makes it faster to iterate.
+                    // This needs to be removed once more regression tests are passing.
+                    // See also similar hack in the backend code, in wait_request_completion()
+                    let result = tokio::time::timeout(
+                        tokio::time::Duration::from_secs(30),
+                        self.handle_request(slot.get_request()),
+                    )
+                    .await
+                    .unwrap_or_else(|_elapsed| {
+                        info!("request {request_id} timed out");
+                        NeonIOResult::Error(libc::ETIMEDOUT)
+                    });
+                    trace!("request {request_id} at slot {slot_idx} completed");
+
+                    // Ok, we have completed the IO. Mark the request as completed. After that,
+                    // we no longer have ownership of the slot, and must not modify it.
+                    slot.completed(result);
+
+                    // Notify the backend about the completion. (Note that the backend might see
+                    // the completed status even before this; this is just a wakeup)
+                    notify_proc(owner_procno);
+                }
+                .instrument(span)
+                .await
+            });
+        }
+    }
+
+    /// Compute the 'request_lsn' to use for a pageserver request
+    fn request_lsns(&self, not_modified_since_lsn: Lsn) -> page_api::ReadLsn {
+        let mut request_lsn = get_request_lsn();
+
+        // Is it possible that the last-written LSN is ahead of last flush LSN? Generally not, we
+        // shouldn't evict a page from the buffer cache before all its modifications have been
+        // safely flushed. That's the "WAL before data" rule. However, there are a few exceptions:
+        //
+        // - when creating an index: _bt_blwritepage logs the full page without flushing WAL before
+        // smgrextend (files are fsynced before build ends).
+        //
+        // XXX: If we make a request LSN greater than the current WAL flush LSN, the pageserver would
+        // block waiting for the WAL to arrive, until we flush it and it propagates through the
+        // safekeepers to the pageserver. If there's nothing that forces the WAL to be flushed,
+        // the pageserver would get stuck waiting forever. To avoid that, all the write-
+        // functions in communicator_new.c call XLogSetAsyncXactLSN(). That nudges the WAL writer to
+        // perform the flush relatively soon.
+        //
+        // It would perhaps be nicer to do the WAL flush here, but it's tricky to call back into
+        // Postgres code to do that from here. That's why we rely on communicator_new.c to do the
+        // calls "pre-emptively".
+        //
+        // FIXME: Because of the above, it can still happen that not_modified_since is ahead of
+        // the flush LSN, if the WAL writer hasn't done the flush yet. It would be nice to know
+        // if there are other cases like that that we have missed, but unfortunately we cannot turn
+        // this into an assertion because of that legit case.
+        //
+        // See also the old logic in neon_get_request_lsns() C function
+        if not_modified_since_lsn > request_lsn {
+            tracing::info!(
+                "not_modified_since_lsn {} is ahead of last flushed LSN {}",
+                not_modified_since_lsn,
+                request_lsn
+            );
+            request_lsn = not_modified_since_lsn;
+        }
+
+        page_api::ReadLsn {
+            request_lsn,
+            not_modified_since_lsn: Some(not_modified_since_lsn),
+        }
+    }
+
+    /// Handle one IO request
+    async fn handle_request(&'static self, req: &'_ NeonIORequest) -> NeonIOResult {
+        match req {
+            NeonIORequest::Empty => {
+                error!("unexpected Empty IO request");
+                NeonIOResult::Error(0)
+            }
+            NeonIORequest::RelExists(req) => {
+                self.request_rel_exists_counter.inc();
+                let rel = req.reltag();
+
+                let _in_progress_guard = self
+                    .in_progress_table
+                    .lock(RequestInProgressKey::Rel(rel), req.request_id)
+                    .await;
+
+                // Check the cache first
+                let not_modified_since = match self.cache.get_rel_exists(&rel) {
+                    CacheResult::Found(exists) => return NeonIOResult::RelExists(exists),
+                    CacheResult::NotFound(lsn) => lsn,
+                };
+
+                match self
+                    .client
+                    .check_rel_exists(page_api::CheckRelExistsRequest {
+                        read_lsn: self.request_lsns(not_modified_since),
+                        rel,
+                    })
+                    .await
+                {
+                    Ok(exists) => NeonIOResult::RelExists(exists),
+                    Err(err) => {
+                        info!("tonic error: {err:?}");
+                        NeonIOResult::Error(0)
+                    }
+                }
+            }
+
+            NeonIORequest::RelSize(req) => {
+                self.request_rel_size_counter.inc();
+                let rel = req.reltag();
+
+                let _in_progress_guard = self
+                    .in_progress_table
+                    .lock(RequestInProgressKey::Rel(rel), req.request_id)
+                    .await;
+
+                // Check the cache first
+                let not_modified_since = match self.cache.get_rel_size(&rel) {
+                    CacheResult::Found(nblocks) => {
+                        tracing::trace!("found relsize for {:?} in cache: {}", rel, nblocks);
+                        return NeonIOResult::RelSize(nblocks);
+                    }
+                    CacheResult::NotFound(lsn) => lsn,
+                };
+
+                let read_lsn = self.request_lsns(not_modified_since);
+                match self
+                    .client
+                    .get_rel_size(page_api::GetRelSizeRequest { read_lsn, rel })
+                    .await
+                {
+                    Ok(nblocks) => {
+                        // update the cache
+                        tracing::info!(
+                            "updated relsize for {:?} in cache: {}, lsn {}",
+                            rel,
+                            nblocks,
+                            read_lsn
+                        );
+                        self.cache
+                            .remember_rel_size(&rel, nblocks, not_modified_since);
+
+                        NeonIOResult::RelSize(nblocks)
+                    }
+                    Err(err) => {
+                        info!("tonic error: {err:?}");
+                        NeonIOResult::Error(0)
+                    }
+                }
+            }
+            NeonIORequest::GetPageV(req) => {
+                self.request_get_pagev_counter.inc();
+                self.request_get_pagev_nblocks_counter
+                    .inc_by(req.nblocks as u64);
+                match self.handle_get_pagev_request(req).await {
+                    Ok(()) => NeonIOResult::GetPageV,
+                    Err(errno) => NeonIOResult::Error(errno),
+                }
+            }
+            NeonIORequest::ReadSlruSegment(req) => {
+                self.request_read_slru_segment_counter.inc();
+                let lsn = Lsn(req.request_lsn);
+                let file_path = req.destination_file_path();
+
+                match self
+                    .client
+                    .get_slru_segment(page_api::GetSlruSegmentRequest {
+                        read_lsn: self.request_lsns(lsn),
+                        kind: req.slru_kind,
+                        segno: req.segment_number,
+                    })
+                    .await
+                {
+                    Ok(slru_bytes) => {
+                        if let Err(e) = tokio::fs::write(&file_path, &slru_bytes).await {
+                            info!("could not write slru segment to file {file_path}: {e}");
+                            return NeonIOResult::Error(e.raw_os_error().unwrap_or(libc::EIO));
+                        }
+
+                        let blocks_count = slru_bytes.len() / crate::BLCKSZ;
+
+                        NeonIOResult::ReadSlruSegment(blocks_count as _)
+                    }
+                    Err(err) => {
+                        info!("tonic error: {err:?}");
+                        NeonIOResult::Error(0)
+                    }
+                }
+            }
+            NeonIORequest::PrefetchV(req) => {
+                self.request_prefetchv_counter.inc();
+                self.request_prefetchv_nblocks_counter
+                    .inc_by(req.nblocks as u64);
+                let req = *req;
+                tokio::spawn(async move { self.handle_prefetchv_request(&req).await });
+                NeonIOResult::PrefetchVLaunched
+            }
+            NeonIORequest::DbSize(req) => {
+                self.request_db_size_counter.inc();
+                let _in_progress_guard = self
+                    .in_progress_table
+                    .lock(RequestInProgressKey::Db(req.db_oid), req.request_id)
+                    .await;
+
+                // Check the cache first
+                let not_modified_since = match self.cache.get_db_size(req.db_oid) {
+                    CacheResult::Found(db_size) => {
+                        // Cache hit: answer from the cached value without asking the pageserver
+                        return NeonIOResult::DbSize(db_size);
+                    }
+                    CacheResult::NotFound(lsn) => lsn,
+                };
+
+                match self
+                    .client
+                    .get_db_size(page_api::GetDbSizeRequest {
+                        read_lsn: self.request_lsns(not_modified_since),
+                        db_oid: req.db_oid,
+                    })
+                    .await
+                {
+                    Ok(db_size) => NeonIOResult::DbSize(db_size),
+                    Err(err) => {
+                        info!("tonic error: {err:?}");
+                        NeonIOResult::Error(0)
+                    }
+                }
+            }
+
+            // Write requests
+            NeonIORequest::WritePage(req) => {
+                self.request_write_page_counter.inc();
+
+                let rel = req.reltag();
+                let _in_progress_guard = self
+                    .in_progress_table
+                    .lock(
+                        RequestInProgressKey::Block(rel, req.block_number),
+                        req.request_id,
+                    )
+                    .await;
+
+                // We must at least update the last-written LSN on the page, but also store the page
+                // image in the LFC while we still have it
+                self.cache
+                    .remember_page(&rel, req.block_number, req.src, Lsn(req.lsn), true)
+                    .await;
+                NeonIOResult::WriteOK
+            }
+            NeonIORequest::RelExtend(req) => {
+                self.request_rel_extend_counter.inc();
+
+                let rel = req.reltag();
+                let _in_progress_guard = self
+                    .in_progress_table
+                    .lock(
+                        RequestInProgressKey::Block(rel, req.block_number),
+                        req.request_id,
+                    )
+                    .await;
+
+                // We must at least update the last-written LSN on the page and the relation size,
+                // but also store the page image in the LFC while we still have it
+                self.cache
+                    .remember_page(&rel, req.block_number, req.src, Lsn(req.lsn), true)
+                    .await;
+                self.cache
+                    .remember_rel_size(&req.reltag(), req.block_number + 1, Lsn(req.lsn));
+                NeonIOResult::WriteOK
+            }
+            NeonIORequest::RelZeroExtend(req) => {
+                self.request_rel_zero_extend_counter.inc();
+                self.request_rel_zero_extend_nblocks_counter
+                    .inc_by(req.nblocks as u64);
+
+                // TODO: need to grab an io-in-progress lock for this? I guess not
+                // TODO: We could put the empty pages to the cache. Maybe have
+                // a marker on the block entries for all-zero pages, instead of
+                // actually storing the empty pages.
+                self.cache.remember_rel_size(
+                    &req.reltag(),
+                    req.block_number + req.nblocks,
+                    Lsn(req.lsn),
+                );
+                NeonIOResult::WriteOK
+            }
+            NeonIORequest::RelCreate(req) => {
+                self.request_rel_create_counter.inc();
+
+                // TODO: need to grab an io-in-progress lock for this? I guess not
+                self.cache.remember_rel_size(&req.reltag(), 0, Lsn(req.lsn));
+                NeonIOResult::WriteOK
+            }
+            NeonIORequest::RelTruncate(req) => {
+                self.request_rel_truncate_counter.inc();
+
+                // TODO: need to grab an io-in-progress lock for this? I guess not
+                self.cache
+                    .remember_rel_size(&req.reltag(), req.nblocks, Lsn(req.lsn));
+                NeonIOResult::WriteOK
+            }
+            NeonIORequest::RelUnlink(req) => {
+                self.request_rel_unlink_counter.inc();
+
+                // TODO: need to grab an io-in-progress lock for this? I guess not
+                self.cache.forget_rel(&req.reltag(), None, Lsn(req.lsn));
+                NeonIOResult::WriteOK
+            }
+            NeonIORequest::UpdateCachedRelSize(req) => {
+                // TODO: need to grab an io-in-progress lock for this? I guess not
+                self.cache
+                    .remember_rel_size(&req.reltag(), req.nblocks, Lsn(req.lsn));
+                NeonIOResult::WriteOK
+            }
+        }
+    }
+
+    /// Subroutine to handle a GetPageV request, since it's a little more complicated than
+    /// others.
+    async fn handle_get_pagev_request(&'t self, req: &CGetPageVRequest) -> Result<(), i32> {
+        let rel = req.reltag();
+
+        // Check the cache first
+        //
+        // Note: Because the backends perform a direct lookup in the cache before sending
+        // the request to the communicator process, we expect the pages to almost never
+        // be already in cache. It could happen if:
+        // 1. two backends try to read the same page at the same time, but that should never
+        //    happen because there's higher level locking in the Postgres buffer manager, or
+        // 2. a prefetch request finished at the same time as a backend requested the
+        //    page. That's much more likely.
+        let mut cache_misses = Vec::with_capacity(req.nblocks as usize);
+        for i in 0..req.nblocks {
+            let blkno = req.block_number + i as u32;
+
+            // note: this is deadlock-safe even though we hold multiple locks at the same time,
+            // because they're always acquired in the same order.
+            let in_progress_guard = self
+                .in_progress_table
+                .lock(RequestInProgressKey::Block(rel, blkno), req.request_id)
+                .await;
+
+            let dest = req.dest[i as usize];
+            let not_modified_since = match self.cache.get_page(&rel, blkno, dest).await {
+                Ok(CacheResult::Found(_)) => {
+                    // get_page already copied the block content to the destination
+                    trace!("found blk {} in rel {:?} in LFC", blkno, rel);
+                    continue;
+                }
+                Ok(CacheResult::NotFound(lsn)) => lsn,
+                Err(_io_error) => return Err(-1), // FIXME errno?
+            };
+            cache_misses.push((blkno, not_modified_since, dest, in_progress_guard));
+        }
+        self.getpage_cache_misses_counter
+            .inc_by(cache_misses.len() as u64);
+        self.getpage_cache_hits_counter
+            .inc_by(req.nblocks as u64 - cache_misses.len() as u64);
+
+        if cache_misses.is_empty() {
+            return Ok(());
+        }
+        let not_modified_since = cache_misses
+            .iter()
+            .map(|(_blkno, lsn, _dest, _guard)| *lsn)
+            .max()
+            .unwrap();
+
+        // Construct a pageserver request for the cache misses
+        let block_numbers: Vec<u32> = cache_misses
+            .iter()
+            .map(|(blkno, _lsn, _dest, _guard)| *blkno)
+            .collect();
+        let read_lsn = self.request_lsns(not_modified_since);
+        info!(
+            "sending getpage request for blocks {:?} in rel {:?} lsns {}",
+            block_numbers, rel, read_lsn
+        );
+        match self
+            .client
+            .get_page(page_api::GetPageRequest {
+                request_id: req.request_id.into(),
+                request_class: page_api::GetPageClass::Normal,
+                read_lsn,
+                rel,
+                block_numbers: block_numbers.clone(),
+            })
+            .await
+        {
+            Ok(resp) => {
+                // Write the received page images directly to the shared memory location
+                // that the backend requested.
+                if resp.pages.len() != block_numbers.len() {
+                    error!(
+                        "received unexpected response with {} page images from pageserver for a request for {} pages",
+                        resp.pages.len(),
+                        block_numbers.len(),
+                    );
+                    return Err(-1);
+                }
+
+                info!(
+                    "received getpage response for blocks {:?} in rel {:?} lsns {}",
+                    block_numbers, rel, read_lsn
+                );
+
+                for (page, (blkno, _lsn, dest, _guard)) in resp.pages.into_iter().zip(cache_misses)
+                {
+                    let src: &[u8] = page.image.as_ref();
+                    let len = std::cmp::min(src.len(), dest.bytes_total());
+                    unsafe {
+                        std::ptr::copy_nonoverlapping(src.as_ptr(), dest.as_mut_ptr(), len);
+                    };
+
+                    // Also store it in the LFC while we have it
+                    self.cache
+                        .remember_page(
+                            &rel,
+                            blkno,
+                            page.image,
+                            read_lsn.not_modified_since_lsn.unwrap(),
+                            false,
+                        )
+                        .await;
+                }
+            }
+            Err(err) => {
+                info!("tonic error: {err:?}");
+                return Err(-1);
+            }
+        }
+        Ok(())
+    }
+
+    /// Subroutine to handle a PrefetchV request, since it's a little more complicated than
+    /// others.
+    ///
+    /// This is very similar to a GetPageV request, but the results are only stored in the cache.
+    async fn handle_prefetchv_request(&'static self, req: &CPrefetchVRequest) -> Result<(), i32> {
+        let rel = req.reltag();
+
+        // Check the cache first
+        let mut cache_misses = Vec::with_capacity(req.nblocks as usize);
+        for i in 0..req.nblocks {
+            let blkno = req.block_number + i as u32;
+
+            // note: this is deadlock-safe even though we hold multiple locks at the same time,
+            // because they're always acquired in the same order.
+            let in_progress_guard = self
+                .in_progress_table
+                .lock(RequestInProgressKey::Block(rel, blkno), req.request_id)
+                .await;
+
+            let not_modified_since = match self.cache.page_is_cached(&rel, blkno).await {
+                Ok(CacheResult::Found(_)) => {
+                    trace!("found blk {} in rel {:?} in LFC", blkno, rel);
+                    continue;
+                }
+                Ok(CacheResult::NotFound(lsn)) => lsn,
+                Err(_io_error) => return Err(-1), // FIXME errno?
+            };
+            cache_misses.push((blkno, not_modified_since, in_progress_guard));
+        }
+        if cache_misses.is_empty() {
+            return Ok(());
+        }
+        let not_modified_since = cache_misses
+            .iter()
+            .map(|(_blkno, lsn, _guard)| *lsn)
+            .max()
+            .unwrap();
+        let block_numbers: Vec<u32> = cache_misses
+            .iter()
+            .map(|(blkno, _lsn, _guard)| *blkno)
+            .collect();
+
+        // TODO: spawn separate tasks for these. Use the integrated cache to keep track of the
+        // in-flight requests
+
+        match self
+            .client
+            .get_page(page_api::GetPageRequest {
+                request_id: req.request_id.into(),
+                request_class: page_api::GetPageClass::Prefetch,
+                read_lsn: self.request_lsns(not_modified_since),
+                rel,
+                block_numbers: block_numbers.clone(),
+            })
+            .await
+        {
+            Ok(resp) => {
+                trace!(
+                    "prefetch completed, remembering blocks {:?} in rel {:?} in LFC",
+                    block_numbers, rel
+                );
+                if resp.pages.len() != block_numbers.len() {
+                    error!(
+                        "received unexpected response with {} page images from pageserver for a request for {} pages",
+                        resp.pages.len(),
+                        block_numbers.len(),
+                    );
+                    return Err(-1);
+                }
+
+                for (page, (blkno, _lsn, _guard)) in resp.pages.into_iter().zip(cache_misses) {
+                    self.cache
+                        .remember_page(&rel, blkno, page.image, not_modified_since, false)
+                        .await;
+                }
+            }
+            Err(err) => {
+                info!("tonic error: {err:?}");
+                return Err(-1);
+            }
+        }
+        Ok(())
+    }
+}
+
+impl<'t> metrics::core::Collector for CommunicatorWorkerProcessStruct<'t> {
+    fn desc(&self) -> Vec<&metrics::core::Desc> {
+        let mut descs = Vec::new();
+
+        descs.append(&mut self.request_counters.desc());
+        descs.append(&mut self.getpage_cache_misses_counter.desc());
+        descs.append(&mut self.getpage_cache_hits_counter.desc());
+        descs.append(&mut self.request_nblocks_counters.desc());
+
+        if let Some(file_cache) = &self.cache.file_cache {
+            descs.append(&mut file_cache.desc());
+        }
+        descs.append(&mut self.cache.desc());
+        descs.append(&mut self.allocator_metrics.desc());
+
+        descs
+    }
+    fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
+        let mut values = Vec::new();
+
+        values.append(&mut self.request_counters.collect());
+        values.append(&mut self.getpage_cache_misses_counter.collect());
+        values.append(&mut self.getpage_cache_hits_counter.collect());
+        values.append(&mut self.request_nblocks_counters.collect());
+
+        if let Some(file_cache) = &self.cache.file_cache {
+            values.append(&mut file_cache.collect());
+        }
+        values.append(&mut self.cache.collect());
+        values.append(&mut self.allocator_metrics.collect());
+
+        values
+    }
+}
--- a/pgxn/neon/communicator/src/worker_process/metrics_exporter.rs
+++ b/pgxn/neon/communicator/src/worker_process/metrics_exporter.rs
@@ -0,0 +1,82 @@
+//! Export information about Postgres, the communicator process, file cache etc. as
+//! prometheus metrics.
+
+use axum::Router;
+use axum::body::Body;
+use axum::extract::State;
+use axum::response::Response;
+use http::StatusCode;
+use http::header::CONTENT_TYPE;
+
+use metrics::proto::MetricFamily;
+use metrics::{Encoder, TextEncoder};
+
+use std::path::PathBuf;
+
+use tokio::net::UnixListener;
+
+use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
+
+impl<'a> CommunicatorWorkerProcessStruct<'a> {
+    /// Serve `/metrics` and `/dump_cache_map` over a unix socket; panics if bind fails.
+    pub(crate) async fn launch_exporter_task(&'static self) {
+        let app = Router::new()
+            .route("/metrics", axum::routing::get(get_metrics))
+            .route("/dump_cache_map", axum::routing::get(dump_cache_map))
+            .with_state(self);
+
+        // Listen on unix domain socket, in the data directory. That should be unique.
+        let path = PathBuf::from(".metrics.socket");
+        // Best effort: unlink a stale socket left by a previous run, or bind() fails.
+        let _ = std::fs::remove_file(&path);
+        let listener = UnixListener::bind(&path).unwrap();
+        tokio::spawn(async {
+            tracing::info!("metrics listener spawned");
+            axum::serve(listener, app).await.unwrap()
+        });
+    }
+}
+
+/// Respond with whatever `cache.dump_map()` writes, as the body of a 200 response.
+async fn dump_cache_map(
+    State(state): State<&CommunicatorWorkerProcessStruct<'static>>,
+) -> Response {
+    let mut dump = Vec::<u8>::new();
+    state.cache.dump_map(&mut dump);
+    Response::builder()
+        .status(StatusCode::OK)
+        .header(CONTENT_TYPE, "application/text")
+        .body(Body::from(dump))
+        .unwrap()
+}
+
+/// Expose Prometheus metrics.
+async fn get_metrics(State(state): State<&CommunicatorWorkerProcessStruct<'static>>) -> Response {
+    use metrics::core::Collector;
+
+    // TextEncoder::encode() below immediately returns an error if a metric
+    // family has no metrics, so such families are filtered out up front.
+    let families: Vec<MetricFamily> = state
+        .collect()
+        .into_iter()
+        .filter(|family| !family.get_metric().is_empty())
+        .collect();
+
+    let encoder = TextEncoder::new();
+    let mut encoded = Vec::new();
+
+    // On success reply with the encoded metrics; otherwise reply with the
+    // encoder's error message and a 500 status.
+    match encoder.encode(&families, &mut encoded) {
+        Ok(()) => Response::builder()
+            .status(StatusCode::OK)
+            .header(CONTENT_TYPE, encoder.format_type())
+            .body(Body::from(encoded))
+            .unwrap(),
+        Err(e) => Response::builder()
+            .status(StatusCode::INTERNAL_SERVER_ERROR)
+            .header(CONTENT_TYPE, "application/text")
+            .body(Body::from(e.to_string()))
+            .unwrap(),
+    }
+}
--- a/pgxn/neon/communicator/src/worker_process/mod.rs
+++ b/pgxn/neon/communicator/src/worker_process/mod.rs
@@ -0,0 +1,14 @@
+//! This code runs in the communicator worker process. This provides
+//! the glue code to:
+//!
+//! - launch the 'processor',
+//! - receive IO requests from backends and pass them to the processor,
+//! - write results back to backends.
+
+mod callbacks;
+mod logging;
+mod main_loop;
+mod metrics_exporter;
+mod worker_interface;
+
+mod in_progress_ios;
--- a/pgxn/neon/communicator/src/worker_process/worker_interface.rs
+++ b/pgxn/neon/communicator/src/worker_process/worker_interface.rs
@@ -0,0 +1,134 @@
+//! Functions called from the C code in the worker process
+
+use std::collections::HashMap;
+use std::ffi::{CStr, c_char};
+use std::path::PathBuf;
+
+use tracing::error;
+
+use crate::init::CommunicatorInitStruct;
+use crate::worker_process::main_loop;
+use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
+
+use pageserver_client_grpc::ShardStripeSize;
+
+/// Launch the communicator's tokio tasks, which do most of the work.
+///
+/// The caller has initialized the process as a regular PostgreSQL background
+/// worker process; the shared memory segment used to communicate with the
+/// backends was set up earlier, at postmaster startup, in rcommunicator_shmem_init().
+/// `auth_token` and `file_cache_path` may be NULL; a `stripe_size` of 0 means "not set".
+#[unsafe(no_mangle)]
+pub extern "C" fn communicator_worker_process_launch(
+    cis: Box<CommunicatorInitStruct>,
+    tenant_id: *const c_char,
+    timeline_id: *const c_char,
+    auth_token: *const c_char,
+    shard_map: *mut *mut c_char,
+    nshards: u32,
+    stripe_size: u32,
+    file_cache_path: *const c_char,
+    initial_file_cache_size: u64,
+) -> &'static CommunicatorWorkerProcessStruct<'static> {
+    // Convert the arguments into more convenient Rust types (panics on non-UTF-8 strings)
+    let tenant_id = unsafe { CStr::from_ptr(tenant_id) }.to_str().unwrap();
+    let timeline_id = unsafe { CStr::from_ptr(timeline_id) }.to_str().unwrap();
+    let auth_token = if auth_token.is_null() {
+        None
+    } else {
+        Some(
+            unsafe { CStr::from_ptr(auth_token) }
+                .to_str()
+                .unwrap()
+                .to_string(),
+        )
+    };
+    let file_cache_path = {
+        if file_cache_path.is_null() {
+            None
+        } else {
+            let c_str = unsafe { CStr::from_ptr(file_cache_path) };
+            Some(PathBuf::from(c_str.to_str().unwrap()))
+        }
+    };
+    let shard_map = shard_map_to_hash(nshards, shard_map);
+
+    // Start the main loop on a dedicated multi-threaded tokio runtime
+    let runtime = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .thread_name("communicator thread")
+        .build()
+        .unwrap();
+
+    let worker_struct = runtime.block_on(main_loop::init(
+        cis,
+        tenant_id.to_string(),
+        timeline_id.to_string(),
+        auth_token,
+        shard_map,
+        if stripe_size > 0 {
+            Some(ShardStripeSize(stripe_size))
+        } else {
+            None
+        },
+        initial_file_cache_size,
+        file_cache_path,
+    ));
+    // Leaked on purpose: a 'static reference is handed to the tasks and to the C caller.
+    let worker_struct = Box::leak(Box::new(worker_struct));
+    let main_loop_handle = runtime.spawn(worker_struct.run());
+    // NOTE(review): unwrap_err() logs a failed main loop, but panics if it ever returns Ok.
+    runtime.spawn(async {
+        let err = main_loop_handle.await.unwrap_err();
+        error!("error: {err:?}");
+    });
+
+    runtime.block_on(worker_struct.launch_exporter_task());
+
+    // Leak the runtime too, so that it keeps running after we exit this function
+    Box::leak(Box::new(runtime));
+
+    worker_struct
+}
+
+/// Convert the "shard map", a C array of strings indexed by shard number, into a
+/// Rust HashMap keyed by shard index.
+fn shard_map_to_hash(
+    nshards: u32,
+    shard_map: *mut *mut c_char,
+) -> HashMap<utils::shard::ShardIndex, String> {
+    use utils::shard::*;
+
+    // The shard number and count are narrowed to u8 below, so the count must fit.
+    assert!(nshards <= u8::MAX as u32);
+
+    let mut map: HashMap<ShardIndex, String> = HashMap::new();
+    for shard_no in 0..nshards {
+        // The caller must pass an array of at least `nshards` valid C string pointers.
+        let entry = unsafe { *shard_map.add(shard_no as usize) };
+        let conn = unsafe { CStr::from_ptr(entry) }.to_str().unwrap();
+
+        // A lone shard is keyed as "unsharded" instead of as shard 0 of 1.
+        let index = if nshards <= 1 {
+            ShardIndex::unsharded()
+        } else {
+            ShardIndex::new(ShardNumber(shard_no as u8), ShardCount(nshards as u8))
+        };
+        map.insert(index, conn.into());
+    }
+    map
+}
+
+/// Inform the rust code about a configuration change: resize the file cache and
+/// install the new shard map. `shard_map` must hold `nshards` valid C strings.
+#[unsafe(no_mangle)]
+pub extern "C" fn communicator_worker_config_reload(
+    proc_handle: &'static CommunicatorWorkerProcessStruct<'static>,
+    file_cache_size: u64,
+    shard_map: *mut *mut c_char,
+    nshards: u32,
+) {
+    // Saturate rather than wrap: `as u32` would silently truncate oversized values.
+    proc_handle.cache.resize_file_cache(u32::try_from(file_cache_size).unwrap_or(u32::MAX));
+    proc_handle.update_shard_map(shard_map_to_hash(nshards, shard_map));
+}
--- a/pgxn/neon/communicator_new.c
+++ b/pgxn/neon/communicator_new.c
--- a/pgxn/neon/communicator_new.h
+++ b/pgxn/neon/communicator_new.h
@@ -0,0 +1,64 @@
+/*-------------------------------------------------------------------------
+ *
+ * communicator_new.h
+ *	  new implementation
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef COMMUNICATOR_NEW_H
+#define COMMUNICATOR_NEW_H
+
+#include "neon_pgversioncompat.h"
+
+#include "storage/buf_internals.h"
+
+#include "pagestore_client.h"
+
+/* initialization at postmaster startup */
+extern void pg_init_communicator_new(void);
+extern void communicator_new_shmem_request(void);
+extern void communicator_new_shmem_startup(void);
+
+/* initialization at backend startup */
+extern void communicator_new_init(void);
+
+/* Read requests */
+extern bool communicator_new_rel_exists(NRelFileInfo rinfo, ForkNumber forkNum);
+extern BlockNumber communicator_new_rel_nblocks(NRelFileInfo rinfo, ForkNumber forknum);
+extern int64 communicator_new_dbsize(Oid dbNode);
+extern void communicator_new_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum,
+										  BlockNumber base_blockno,
+										  void **buffers, BlockNumber nblocks);
+extern void communicator_new_prefetch_register_bufferv(NRelFileInfo rinfo, ForkNumber forkNum,
+													   BlockNumber blockno,
+													   BlockNumber nblocks);
+extern bool communicator_new_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum,
+											BlockNumber blockno);
+extern int communicator_new_read_slru_segment(
+	SlruKind kind,
+	uint32_t segno,
+	neon_request_lsns *request_lsns,
+	char *path
+);
+
+/* Write requests, to keep the caches up-to-date */
+extern void communicator_new_write_page(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
+										const void *buffer, XLogRecPtr lsn);
+extern void communicator_new_rel_extend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
+										const void *buffer, XLogRecPtr lsn);
+extern void communicator_new_rel_zeroextend(NRelFileInfo rinfo, ForkNumber forkNum,
+											BlockNumber blockno, BlockNumber nblocks,
+											XLogRecPtr lsn);
+extern void communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum, XLogRecPtr lsn);
+extern void communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks, XLogRecPtr lsn);
+extern void communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum, XLogRecPtr lsn);
+extern void communicator_new_update_cached_rel_size(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks, XLogRecPtr lsn);
+
+/* other functions */
+extern int32 communicator_new_approximate_working_set_size_seconds(time_t duration, bool reset);
+
+#endif							/* COMMUNICATOR_NEW_H */
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -183,13 +183,13 @@ typedef struct FileCacheControl
 static HTAB *lfc_hash;
 static int	lfc_desc = -1;
 static LWLockId lfc_lock;
-static int	lfc_max_size;
-static int	lfc_size_limit;
+int	lfc_max_size;
+int	lfc_size_limit;
 static int	lfc_prewarm_limit;
 static int	lfc_prewarm_batch;
 static int	lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG;
 static int	lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK;
-static char *lfc_path;
+char *lfc_path;
 static uint64 lfc_generation;
 static FileCacheControl *lfc_ctl;
 static bool lfc_do_prewarm;
@@ -230,6 +230,8 @@ lfc_switch_off(void)
 {
 	int			fd;

+	Assert(!neon_use_communicator_worker);
+
 	if (LFC_ENABLED())
 	{
 		HASH_SEQ_STATUS status;
@@ -295,6 +297,8 @@ lfc_maybe_disabled(void)
 static bool
 lfc_ensure_opened(void)
 {
+	Assert(!neon_use_communicator_worker);
+
 	if (lfc_generation != lfc_ctl->generation)
 	{
 		lfc_close_file();
@@ -320,6 +324,8 @@ lfc_shmem_startup(void)
 	bool		found;
 	static HASHCTL info;

+	Assert(!neon_use_communicator_worker);
+
 	if (prev_shmem_startup_hook)
 	{
 		prev_shmem_startup_hook();
@@ -618,6 +624,9 @@ lfc_init(void)
 	if (lfc_max_size == 0)
 		return;

+	if (neon_use_communicator_worker)
+		return;
+
 	prev_shmem_startup_hook = shmem_startup_hook;
 	shmem_startup_hook = lfc_shmem_startup;
 #if PG_VERSION_NUM>=150000
@@ -693,6 +702,7 @@ lfc_prewarm(FileCacheState* fcs, uint32 n_workers)
 	dsm_segment *seg;
 	BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS];

+	Assert(!neon_use_communicator_worker);

 	if (!lfc_ensure_opened())
 		return;
@@ -847,6 +857,8 @@ lfc_prewarm_main(Datum main_arg)
 	PrewarmWorkerState* ws;
 	uint32 worker_id = DatumGetInt32(main_arg);

+	Assert(!neon_use_communicator_worker);
+
 	AmPrewarmWorker = true;

 	pqsignal(SIGTERM, die);
@@ -947,6 +959,8 @@ lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
 	FileCacheEntry *entry;
 	uint32		hash;

+	Assert(!neon_use_communicator_worker);
+
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return;

@@ -992,6 +1006,8 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	bool		found = false;
 	uint32		hash;

+	Assert(!neon_use_communicator_worker);
+
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return false;

@@ -1027,6 +1043,8 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	uint32		hash;
 	int			i = 0;

+	Assert(!neon_use_communicator_worker);
+
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return 0;

@@ -1134,6 +1152,8 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	int			blocks_read = 0;
 	int			buf_offset = 0;

+	Assert(!neon_use_communicator_worker);
+
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return -1;

@@ -1500,6 +1520,8 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,

 	int		chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);

+	Assert(!neon_use_communicator_worker);
+
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return false;

@@ -1645,6 +1667,8 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	uint32		entry_offset;
 	int			buf_offset = 0;

+	Assert(!neon_use_communicator_worker);
+
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return;

@@ -2135,40 +2159,21 @@ local_cache_pages(PG_FUNCTION_ARGS)
 		SRF_RETURN_DONE(funcctx);
 }

-PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds);

-Datum
-approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
+int32
+lfc_approximate_working_set_size_seconds(time_t duration, bool reset)
 {
-	if (lfc_size_limit != 0)
-	{
-		int32 dc;
-		time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0);
-		LWLockAcquire(lfc_lock, LW_SHARED);
-		dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration);
-		LWLockRelease(lfc_lock);
-		PG_RETURN_INT32(dc);
-	}
-	PG_RETURN_NULL();
-}
+	int32		dc;
 
-PG_FUNCTION_INFO_V1(approximate_working_set_size);
+	if (lfc_size_limit == 0)
+		return -1;
 
-Datum
-approximate_working_set_size(PG_FUNCTION_ARGS)
-{
-	if (lfc_size_limit != 0)
-	{
-		int32 dc;
-		bool reset = PG_GETARG_BOOL(0);
-		LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
-		dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1);
-		if (reset)
-			memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
-		LWLockRelease(lfc_lock);
-		PG_RETURN_INT32(dc);
-	}
-	PG_RETURN_NULL();
+	LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);	/* reset writes the registers */
+	dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration);
+	if (reset)
+		memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
+	LWLockRelease(lfc_lock);
+	return dc;
 }

 PG_FUNCTION_INFO_V1(get_local_cache_state);
@@ -2177,7 +2182,13 @@ Datum
 get_local_cache_state(PG_FUNCTION_ARGS)
 {
 	size_t max_entries = PG_ARGISNULL(0) ? lfc_prewarm_limit : PG_GETARG_INT32(0);
-	FileCacheState* fcs = lfc_get_state(max_entries);
+	FileCacheState* fcs;
+
+	if (neon_use_communicator_worker)
+		elog(ERROR, "TODO: not implemented");
+
+	fcs = lfc_get_state(max_entries);
+
 	if (fcs != NULL)
 		PG_RETURN_BYTEA_P((bytea*)fcs);
 	else
@@ -2191,8 +2202,12 @@ prewarm_local_cache(PG_FUNCTION_ARGS)
 {
 	bytea* state = PG_GETARG_BYTEA_PP(0);
 	uint32 n_workers =  PG_GETARG_INT32(1);
-	FileCacheState* fcs = (FileCacheState*)state;
+	FileCacheState* fcs;

+	if (neon_use_communicator_worker)
+		elog(ERROR, "TODO: not implemented");
+
+	fcs = (FileCacheState*)state;
 	lfc_prewarm(fcs, n_workers);

 	PG_RETURN_NULL();
@@ -2212,6 +2227,9 @@ get_prewarm_info(PG_FUNCTION_ARGS)
 	uint32 total_pages;
 	size_t n_workers;

+	if (neon_use_communicator_worker)
+		elog(ERROR, "TODO: not implemented");
+
 	if (lfc_size_limit == 0)
 		PG_RETURN_NULL();

--- a/pgxn/neon/file_cache.h
+++ b/pgxn/neon/file_cache.h
@@ -26,6 +26,9 @@ typedef struct FileCacheState

 /* GUCs */
 extern bool lfc_store_prefetch_result;
+extern int	lfc_max_size;
+extern int	lfc_size_limit;
+extern char *lfc_path;

 /* functions for local file cache */
 extern void lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks);
@@ -49,6 +52,9 @@ extern void lfc_prewarm(FileCacheState* fcs, uint32 n_workers);

 PGDLLEXPORT void lfc_prewarm_main(Datum main_arg);

+extern int32 lfc_approximate_working_set_size_seconds(time_t duration, bool reset);
+
+
 static inline bool
 lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		 void *buffer)
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -69,7 +69,8 @@ char	   *neon_project_id;
 char	   *neon_branch_id;
 char	   *neon_endpoint_id;
 int32		max_cluster_size;
-char	   *page_server_connstring;
+char	   *pageserver_connstring;
+char	   *pageserver_grpc_urls;
 char	   *neon_auth_token;

 int			readahead_buffer_size = 128;
@@ -79,19 +80,13 @@ int         neon_protocol_version = 3;

 static int	neon_compute_mode = 0;
 static int	max_reconnect_attempts = 60;
-static int	stripe_size;
+int		neon_stripe_size;
 static int	max_sockets;

 static int pageserver_response_log_timeout = 10000;
 /* 2.5 minutes. A bit higher than highest default TCP retransmission timeout */
 static int pageserver_response_disconnect_timeout = 150000;

-typedef struct
-{
-	char		connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE];
-	size_t		num_shards;
-} ShardMap;
-
 /*
 * PagestoreShmemState is kept in shared memory. It contains the connection
 * strings for each shard.
@@ -128,7 +123,7 @@ static uint64 pagestore_local_counter = 0;
 typedef enum PSConnectionState {
 	PS_Disconnected,			/* no connection yet */
 	PS_Connecting_Startup,		/* connection starting up */
-	PS_Connecting_PageStream,	/* negotiating pagestream */ 
+	PS_Connecting_PageStream,	/* negotiating pagestream */
 	PS_Connected,				/* connected, pagestream established */
 } PSConnectionState;

@@ -177,6 +172,8 @@ static bool pageserver_flush(shardno_t shard_no);
 static void pageserver_disconnect(shardno_t shard_no);
 static void pageserver_disconnect_shard(shardno_t shard_no);

+static void AssignShardMap(const char *newval);
+
 static bool
 PagestoreShmemIsValid(void)
 {
@@ -190,8 +187,8 @@ PagestoreShmemIsValid(void)
 * not valid, returns false. The contents of *result are undefined in
 * that case, and must not be relied on.
 */
-static bool
-ParseShardMap(const char *connstr, ShardMap *result)
+bool
+parse_shard_map(const char *connstr, ShardMap *result)
 {
 	const char *p;
 	int			nshards = 0;
@@ -239,18 +236,25 @@ ParseShardMap(const char *connstr, ShardMap *result)
 	return true;
 }

+/* GUC hooks for neon.pageserver_connstring */
 static bool
 CheckPageserverConnstring(char **newval, void **extra, GucSource source)
 {
 	char	   *p = *newval;

-	return ParseShardMap(p, NULL);
+	return parse_shard_map(p, NULL);
 }

 static void
 AssignPageserverConnstring(const char *newval, void *extra)
 {
-	ShardMap	shard_map;
+	/*
+	 * 'neon.pageserver_connstring' is ignored if the new communicator is used.
+	 * In that case, the shard map is loaded from 'neon.pageserver_grpc_urls'
+	 * instead, and that happens in the communicator process only.
+	 */
+	if (neon_use_communicator_worker)
+		return;

 	/*
 	 * Only postmaster updates the copy in shared memory.
@@ -258,11 +262,29 @@ AssignPageserverConnstring(const char *newval, void *extra)
 	if (!PagestoreShmemIsValid() || IsUnderPostmaster)
 		return;

-	if (!ParseShardMap(newval, &shard_map))
+	AssignShardMap(newval);
+}
+
+
+/* GUC check hook for neon.pageserver_grpc_urls */
+static bool
+CheckPageserverGrpcUrls(char **newval, void **extra, GucSource source)
+{
+	char	   *p = *newval;
+
+	return parse_shard_map(p, NULL);
+}
+
+static void
+AssignShardMap(const char *newval)
+{
+	ShardMap	shard_map;
+
+	if (!parse_shard_map(newval, &shard_map))
 	{
 		/*
 		 * shouldn't happen, because we already checked the value in
-		 * CheckPageserverConnstring
+		 * CheckPageserverConnstring/CheckPageserverGrpcUrls
 		 */
 		elog(ERROR, "could not parse shard map");
 	}
@@ -363,17 +385,17 @@ get_shard_number(BufferTag *tag)

 #if PG_MAJORVERSION_NUM < 16
 	hash = murmurhash32(tag->rnode.relNode);
-	hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
+	hash = hash_combine(hash, murmurhash32(tag->blockNum / neon_stripe_size));
 #else
 	hash = murmurhash32(tag->relNumber);
-	hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
+	hash = hash_combine(hash, murmurhash32(tag->blockNum / neon_stripe_size));
 #endif

 	return hash % n_shards;
 }

 static inline void
-CLEANUP_AND_DISCONNECT(PageServer *shard) 
+CLEANUP_AND_DISCONNECT(PageServer *shard)
 {
 	if (shard->wes_read)
 	{
@@ -395,7 +417,7 @@ CLEANUP_AND_DISCONNECT(PageServer *shard)
 * complete the connection (e.g. due to receiving an earlier cancellation
 * during connection start).
 * Returns true if successfully connected; false if the connection failed.
- * 
+ *
 * Throws errors in unrecoverable situations, or when this backend's query
 * is canceled.
 */
@@ -1304,7 +1326,7 @@ PagestoreShmemInit(void)
 		pg_atomic_init_u64(&pagestore_shared->begin_update_counter, 0);
 		pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0);
 		memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
-		AssignPageserverConnstring(page_server_connstring, NULL);
+		AssignPageserverConnstring(pageserver_connstring, NULL);
 	}

 	NeonPerfCountersShmemInit();
@@ -1357,12 +1379,21 @@ pg_init_libpagestore(void)
 	DefineCustomStringVariable("neon.pageserver_connstring",
 							   "connection string to the page server",
 							   NULL,
-							   &page_server_connstring,
+							   &pageserver_connstring,
 							   "",
 							   PGC_SIGHUP,
 							   0,	/* no flags required */
 							   CheckPageserverConnstring, AssignPageserverConnstring, NULL);

+	DefineCustomStringVariable("neon.pageserver_grpc_urls",
+							   "list of gRPC URLs for the page servers",
+							   NULL,
+							   &pageserver_grpc_urls,
+							   "",
+							   PGC_SIGHUP,
+							   0,	/* no flags required */
+							   CheckPageserverGrpcUrls, NULL, NULL);
+
 	DefineCustomStringVariable("neon.timeline_id",
 							   "Neon timeline_id the server is running on",
 							   NULL,
@@ -1409,7 +1440,7 @@ pg_init_libpagestore(void)
 	DefineCustomIntVariable("neon.stripe_size",
 							"sharding stripe size",
 							NULL,
-							&stripe_size,
+							&neon_stripe_size,
 							2048, 1, INT_MAX,
 							PGC_SIGHUP,
 							GUC_UNIT_BLOCKS,
@@ -1520,7 +1551,7 @@ pg_init_libpagestore(void)
 	if (neon_auth_token)
 		neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable");

-	if (page_server_connstring && page_server_connstring[0])
+	if (pageserver_connstring[0] || pageserver_grpc_urls[0])
 	{
 		neon_log(PageStoreTrace, "set neon_smgr hook");
 		smgr_hook = smgr_neon;
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -21,6 +21,7 @@
 #include "replication/logicallauncher.h"
 #include "replication/slot.h"
 #include "replication/walsender.h"
+#include "storage/ipc.h"
 #include "storage/proc.h"
 #include "funcapi.h"
 #include "access/htup_details.h"
@@ -30,6 +31,7 @@
 #include "utils/guc_tables.h"

 #include "communicator.h"
+#include "communicator_new.h"
 #include "extension_server.h"
 #include "file_cache.h"
 #include "neon.h"
@@ -59,11 +61,14 @@ static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
 static void neon_ExecutorStart(QueryDesc *queryDesc, int eflags);
 static void neon_ExecutorEnd(QueryDesc *queryDesc);

-#if PG_MAJORVERSION_NUM >= 16
 static shmem_startup_hook_type prev_shmem_startup_hook;
-
-static void neon_shmem_startup_hook(void);
+#if PG_VERSION_NUM>=150000
+static shmem_request_hook_type prev_shmem_request_hook;
 #endif
+
+static void neon_shmem_request(void);
+static void neon_shmem_startup_hook(void);
+
 #if PG_MAJORVERSION_NUM >= 17
 uint32		WAIT_EVENT_NEON_LFC_MAINTENANCE;
 uint32		WAIT_EVENT_NEON_LFC_READ;
@@ -450,13 +455,26 @@ _PG_init(void)
 	 */
 #if PG_VERSION_NUM >= 160000
 	load_file("$libdir/neon_rmgr", false);
+#endif

 	prev_shmem_startup_hook = shmem_startup_hook;
 	shmem_startup_hook = neon_shmem_startup_hook;
+#if PG_VERSION_NUM>=150000
+	prev_shmem_request_hook = shmem_request_hook;
+	shmem_request_hook = neon_shmem_request;
+#else
+	neon_shmem_request();
 #endif

-	/* dummy call to a Rust function in the communicator library, to check that it works */
-	(void) communicator_dummy(123);
+	DefineCustomBoolVariable(
+							"neon.use_communicator_worker",
+							"Uses the communicator worker implementation",
+							NULL,
+							&neon_use_communicator_worker,
+							true,
+							PGC_POSTMASTER,
+							0,
+							NULL, NULL, NULL);

 	pg_init_libpagestore();
 	lfc_init();
@@ -464,6 +482,9 @@ _PG_init(void)
 	init_lwlsncache();

 	pg_init_communicator();
+	if (neon_use_communicator_worker)
+		pg_init_communicator_new();
+
 	Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;

 	InitUnstableExtensionsSupport();
@@ -561,6 +582,8 @@ _PG_init(void)
 PG_FUNCTION_INFO_V1(pg_cluster_size);
 PG_FUNCTION_INFO_V1(backpressure_lsns);
 PG_FUNCTION_INFO_V1(backpressure_throttling_time);
+PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds);
+PG_FUNCTION_INFO_V1(approximate_working_set_size);

 Datum
 pg_cluster_size(PG_FUNCTION_ARGS)
@@ -607,7 +630,51 @@ backpressure_throttling_time(PG_FUNCTION_ARGS)
 	PG_RETURN_UINT64(BackpressureThrottlingTime());
 }

-#if PG_MAJORVERSION_NUM >= 16
+Datum
+approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
+{
+	/* A NULL argument is passed down as (time_t) -1. */
+	time_t		duration = PG_ARGISNULL(0) ? (time_t) -1 : PG_GETARG_INT32(0);
+	int32		wss;
+
+	/* Ask whichever cache implementation is active; this variant never resets. */
+	wss = neon_use_communicator_worker
+		? communicator_new_approximate_working_set_size_seconds(duration, false)
+		: lfc_approximate_working_set_size_seconds(duration, false);
+
+	/* A negative result is reported as SQL NULL. */
+	if (wss < 0)
+		PG_RETURN_NULL();
+	PG_RETURN_INT32(wss);
+}
+
+Datum
+approximate_working_set_size(PG_FUNCTION_ARGS)
+{
+	bool		reset = PG_GETARG_BOOL(0);
+	int32		wss;
+
+	/* Duration -1 is passed through; 'reset' is forwarded to the active cache. */
+	wss = neon_use_communicator_worker
+		? communicator_new_approximate_working_set_size_seconds(-1, reset)
+		: lfc_approximate_working_set_size_seconds(-1, reset);
+	/* A negative result is reported as SQL NULL. */
+	if (wss < 0)
+		PG_RETURN_NULL();
+	PG_RETURN_INT32(wss);
+}
+
+static void
+neon_shmem_request(void)
+{
+#if PG_VERSION_NUM>=150000
+	if (prev_shmem_request_hook)
+		prev_shmem_request_hook();
+#endif
+	/* NOTE(review): on PG < 15 _PG_init() calls this before neon.use_communicator_worker is defined */
+	communicator_new_shmem_request();
+}
+
 static void
 neon_shmem_startup_hook(void)
 {
@@ -627,8 +694,9 @@ neon_shmem_startup_hook(void)
 	WAIT_EVENT_NEON_PS_READ = WaitEventExtensionNew("Neon/PS_ReadIO");
 	WAIT_EVENT_NEON_WAL_DL = WaitEventExtensionNew("Neon/WAL_Download");
 #endif
+
+	communicator_new_shmem_startup();
 }
-#endif

 /*
 * ExecutorStart hook: start up tracking if needed
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -9,6 +9,10 @@
 #include "fmgr.h"
 #include "storage/buf_internals.h"

+#if PG_MAJORVERSION_NUM < 16
+typedef PGAlignedBlock PGIOAlignedBlock;
+#endif
+
 #if PG_MAJORVERSION_NUM < 17
 #define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId)
 #else
@@ -160,6 +164,10 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
 #define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess())
 #endif

+#if PG_MAJORVERSION_NUM < 17
+#define	MyProcNumber (MyProc - &ProcGlobal->allProcs[0])
+#endif
+
 #if PG_MAJORVERSION_NUM < 15
 extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags);
 extern TimeLineID GetWALInsertionTimeLine(void);
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -236,14 +236,23 @@ extern void prefetch_on_ps_disconnect(void);

 extern page_server_api *page_server;

-extern char *page_server_connstring;
+extern char *pageserver_connstring;
+extern char *pageserver_grpc_urls;
 extern int	flush_every_n_requests;
 extern int	readahead_buffer_size;
 extern char *neon_timeline;
 extern char *neon_tenant;
 extern int32 max_cluster_size;
 extern int  neon_protocol_version;
+extern int	neon_stripe_size;

+typedef struct
+{
+	char		connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE];
+	size_t		num_shards;
+} ShardMap;
+
+extern bool parse_shard_map(const char *connstr, ShardMap *result);
 extern shardno_t get_shard_number(BufferTag* tag);

 extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo);
@@ -290,6 +299,7 @@ extern int64 neon_dbsize(Oid dbNode);
 extern void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum,
 								  BlockNumber blkno, neon_request_lsns *output,
 								  BlockNumber nblocks);
+extern XLogRecPtr neon_get_write_lsn(void);

 /* utils for neon relsize cache */
 extern void relsize_hash_init(void);
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -62,6 +62,7 @@

 #include "bitmap.h"
 #include "communicator.h"
+#include "communicator_new.h"
 #include "file_cache.h"
 #include "neon.h"
 #include "neon_lwlsncache.h"
@@ -87,7 +88,7 @@ static char *hexdump_page(char *page);
 		NInfoGetRelNumber(InfoFromSMgrRel(reln)) >= FirstNormalObjectId \
 )

-const int	SmgrTrace = DEBUG5;
+const int	SmgrTrace = DEBUG1;

 /* unlogged relation build states */
 typedef enum
@@ -501,6 +502,60 @@ nm_adjust_lsn(XLogRecPtr lsn)
 	return lsn;
 }

+/*
+ * Get a LSN with which to stamp an operation that is not tied to a single
+ * page, like relation create or truncate. On operations on individual pages
+ * we use the LSN of the page, but when e.g. smgrcreate() is called there is
+ * no page, so we take the replay pointer (in recovery) or the WAL insert
+ * pointer (otherwise) instead.
+ */
+XLogRecPtr
+neon_get_write_lsn(void)
+{
+	XLogRecPtr	lsn;
+	int			segoff;
+
+	if (!RecoveryInProgress())
+		lsn = GetXLogInsertRecPtr();
+	else
+	{
+		/*
+		 * FIXME: v14 doesn't have GetCurrentReplayRecPtr(). Options:
+		 * - add it in our fork
+		 * - store a magic value that means that you must use
+		 *   current latest possible LSN at the time that the request
+		 *   on this thing is made again (or some other recent enough
+		 *   lsn).
+		 */
+#if PG_VERSION_NUM >= 150000
+		lsn = GetCurrentReplayRecPtr(NULL);
+#else
+		lsn = GetXLogReplayRecPtr(NULL); /* FIXME: this is wrong, see above */
+#endif
+	}
+
+	/*
+	 * If the LSN points to just after a page header, round it down to the
+	 * beginning of the page, because the page header might not have been
+	 * inserted to the WAL yet, and if we tried to flush it, the WAL flushing
+	 * code gets upset.
+	 *
+	 * The first page of a segment carries the long header, every other page
+	 * the short one, hence the two cases.
+	 */
+	segoff = XLogSegmentOffset(lsn, wal_segment_size);
+	if (segoff == SizeOfXLogLongPHD)
+		lsn -= segoff;
+	else
+	{
+		int			blockoff = lsn % XLOG_BLCKSZ;
+
+		if (blockoff == SizeOfXLogShortPHD)
+			lsn -= blockoff;
+	}
+
+	return lsn;
+}

 /*
 * Return LSN for requesting pages and number of blocks from page server
@@ -744,11 +799,6 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks))
-	{
-		return true;
-	}
-
 	/*
 	 * \d+ on a view calls smgrexists with 0/0/0 relfilenode. The page server
 	 * will error out if you check that, because the whole dbdir for
@@ -772,10 +822,20 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 		return false;
 	}

-	neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
-						  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
+	if (neon_use_communicator_worker)
+		return communicator_new_rel_exists(InfoFromSMgrRel(reln), forkNum);
+	else
+	{
+		if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks))
+		{
+			return true;
+		}

-	return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns);
+		neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
+							  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
+
+		return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns);
+	}
 }

 /*
@@ -833,16 +893,36 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	 * relation. Currently, we don't call SetLastWrittenLSN() when a new
 	 * relation created, so if we didn't remember the size in the relsize
 	 * cache, we might call smgrnblocks() on the newly-created relation before
-	 * the creation WAL record hass been received by the page server.
+	 * the creation WAL record has been received by the page server.
+	 *
+	 * XXX: with the new communicator, similar considerations apply. However,
+	 * during replay, neon_get_write_lsn() returns the (end-)LSN of the record
+	 * that's being replayed (on v15+; see its FIXME for v14), so we should not
+	 * have the correctness issue mentioned in the previous paragraph.
 	 */
-	if (isRedo)
+	if (neon_use_communicator_worker)
 	{
-		update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
-		get_cached_relsize(InfoFromSMgrRel(reln), forkNum,
-						   &reln->smgr_cached_nblocks[forkNum]);
+		XLogRecPtr	lsn = neon_get_write_lsn();
+
+		if (isRedo)
+		{
+			if (!communicator_new_rel_exists(InfoFromSMgrRel(reln), forkNum))
+				communicator_new_rel_create(InfoFromSMgrRel(reln), forkNum, lsn);
+		}
+		else
+			communicator_new_rel_create(InfoFromSMgrRel(reln), forkNum, lsn);
 	}
 	else
-		set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
+	{
+		if (isRedo)
+		{
+			update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
+			get_cached_relsize(InfoFromSMgrRel(reln), forkNum,
+							   &reln->smgr_cached_nblocks[forkNum]);
+		}
+		else
+			set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
+	}

 	if (debug_compare_local)
 	{
@@ -878,9 +958,17 @@ neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo)
 	 * unlink, it won't do any harm if the file doesn't exist.
 	 */
 	mdunlink(rinfo, forkNum, isRedo);
+
 	if (!NRelFileInfoBackendIsTemp(rinfo))
 	{
-		forget_cached_relsize(InfoFromNInfoB(rinfo), forkNum);
+		if (neon_use_communicator_worker)
+		{
+			XLogRecPtr	lsn = neon_get_write_lsn();
+
+			communicator_new_rel_unlink(InfoFromNInfoB(rinfo), forkNum, lsn);
+		}
+		else
+			forget_cached_relsize(InfoFromNInfoB(rinfo), forkNum);
 	}
 }

@@ -960,7 +1048,6 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 		neon_wallog_page(reln, forkNum, n_blocks++, buffer, true);

 	neon_wallog_page(reln, forkNum, blkno, buffer, false);
-	set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);

 	lsn = PageGetLSN((Page) buffer);
 	neon_log(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
@@ -968,35 +1055,51 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 		 forkNum, blkno,
 		 (uint32) (lsn >> 32), (uint32) lsn);

-	lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
-
-	if (debug_compare_local)
+	if (neon_use_communicator_worker)
 	{
-		if (IS_LOCAL_REL(reln))
-			mdextend(reln, forkNum, blkno, buffer, skipFsync);
-	}
+		/* FIXME: this can pass lsn == InvalidXLogRecPtr. Is that ok? */
+		communicator_new_rel_extend(InfoFromSMgrRel(reln), forkNum, blkno, (const void *) buffer, lsn);

-	/*
-	 * smgr_extend is often called with an all-zeroes page, so
-	 * lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
-	 * later, after it has been initialized with the real page contents, and
-	 * it is eventually evicted from the buffer cache. But we need a valid LSN
-	 * to the relation metadata update now.
-	 */
-	if (lsn == InvalidXLogRecPtr)
-	{
-		lsn = GetXLogInsertRecPtr();
-		neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno);
+		if (debug_compare_local)
+		{
+			if (IS_LOCAL_REL(reln))
+				mdextend(reln, forkNum, blkno, buffer, skipFsync);
+		}
+	}
+	else
+	{
+		set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);
+		lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
+
+		if (debug_compare_local)
+		{
+			if (IS_LOCAL_REL(reln))
+				mdextend(reln, forkNum, blkno, buffer, skipFsync);
+		}
+
+		/*
+		 * smgr_extend is often called with an all-zeroes page, so
+		 * lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
+		 * later, after it has been initialized with the real page contents, and
+		 * it is eventually evicted from the buffer cache. But we need a valid LSN
+		 * to the relation metadata update now.
+		 */
+		if (lsn == InvalidXLogRecPtr)
+		{
+			lsn = GetXLogInsertRecPtr();
+			neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno);
+		}
+		neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
 	}
-	neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
 }

 #if PG_MAJORVERSION_NUM >= 16
 static void
-neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
+neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber start_block,
 				int nblocks, bool skipFsync)
 {
 	const PGIOAlignedBlock buffer = {0};
+	BlockNumber blocknum = start_block;
 	int			remblocks = nblocks;
 	XLogRecPtr	lsn = 0;

@@ -1079,11 +1182,14 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,

 		lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI);

-		for (int i = 0; i < count; i++)
+		if (!neon_use_communicator_worker)
 		{
-			lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
-			neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum,
-									  blocknum + i);
+			for (int i = 0; i < count; i++)
+			{
+				lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
+				neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum,
+									 blocknum + i);
+			}
 		}

 		blocknum += count;
@@ -1092,8 +1198,15 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,

 	Assert(lsn != 0);

-	neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
-	set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
+	if (neon_use_communicator_worker)
+	{
+		communicator_new_rel_zeroextend(InfoFromSMgrRel(reln), forkNum, start_block, nblocks, lsn);
+	}
+	else
+	{
+		neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
+		set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
+	}
 }
 #endif

@@ -1153,6 +1266,12 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

+	if (neon_use_communicator_worker)
+	{
+		communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, nblocks);
+		return false;
+	}
+
 	tag.spcOid = reln->smgr_rlocator.locator.spcOid;
 	tag.dbOid = reln->smgr_rlocator.locator.dbOid;
 	tag.relNumber = reln->smgr_rlocator.locator.relNumber;
@@ -1179,7 +1298,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		blocknum += iterblocks;
 	}

-	communicator_prefetch_pump_state();
+	if (!neon_use_communicator_worker)
+		communicator_prefetch_pump_state();

 	return false;
 }
@@ -1192,8 +1312,6 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 static bool
 neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 {
-	BufferTag	tag;
-
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:					/* probably shouldn't happen, but ignore it */
@@ -1208,17 +1326,25 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
-		return false;
+	if (neon_use_communicator_worker)
+	{
+		communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, 1);
+	}
+	else
+	{
+		BufferTag	tag;

-	tag.forkNum = forknum;
-	tag.blockNum = blocknum;
+		if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
+			return false;

-	CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
+		tag.forkNum = forknum;
+		tag.blockNum = blocknum;

-	communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
+		CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
+		communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);

-	communicator_prefetch_pump_state();
+		communicator_prefetch_pump_state();
+	}

 	return false;
 }
@@ -1262,7 +1388,8 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 	 */
 	neon_log(SmgrTrace, "writeback noop");

-	communicator_prefetch_pump_state();
+	if (!neon_use_communicator_worker)
+		communicator_prefetch_pump_state();

 	if (debug_compare_local)
 	{
@@ -1279,7 +1406,14 @@ void
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				 neon_request_lsns request_lsns, void *buffer)
 {
-	communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
+	if (neon_use_communicator_worker)
+	{
+		// FIXME: request_lsns is ignored. That affects the neon_test_utils callers.
+		// Add the capability to specify the LSNs explicitly, for the sake of neon_test_utils ?
+		communicator_new_read_at_lsnv(rinfo, forkNum, blkno, &buffer, 1);
+	}
+	else
+		communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
 }

 static void
@@ -1405,47 +1539,55 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	/* Try to read PS results if they are available */
-	communicator_prefetch_pump_state();
-
-	neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
-
-	present = 0;
-	bufferp = buffer;
-	if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
+	if (neon_use_communicator_worker)
 	{
-		/* Prefetch hit */
-		if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH)
-		{
-			compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
-		}
-		if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH)
-		{
-			return;
-		}
+		communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forkNum, blkno,
+									  (void *) &buffer, 1);
 	}
-
-	/* Try to read from local file cache */
-	if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
+	else
 	{
-		MyNeonCounters->file_cache_hits_total++;
-		if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC)
+		/* Try to read PS results if they are available */
+		communicator_prefetch_pump_state();
+
+		neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
+
+		present = 0;
+		bufferp = buffer;
+		if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
 		{
-			compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
+			/* Prefetch hit */
+			if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH)
+			{
+				compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
+			}
+			if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH)
+			{
+				return;
+			}
 		}
-		if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC)
+
+		/* Try to read from local file cache */
+		if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
 		{
-			return;
+			MyNeonCounters->file_cache_hits_total++;
+			if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC)
+			{
+				compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
+			}
+			if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC)
+			{
+				return;
+			}
 		}
+
+		neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
+
+		/*
+		 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
+		 */
+		communicator_prefetch_pump_state();
 	}

-	neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
-
-	/*
-	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
-	 */
-	communicator_prefetch_pump_state();
-
 	if (debug_compare_local)
 	{
 		compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
@@ -1508,59 +1650,67 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 				 nblocks, PG_IOV_MAX);

 	/* Try to read PS results if they are available */
-	communicator_prefetch_pump_state();
-
-	neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
-						  request_lsns, nblocks);
+	if (!neon_use_communicator_worker)
+		communicator_prefetch_pump_state();

 	memset(read_pages, 0, sizeof(read_pages));

-	prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
-													blocknum, request_lsns, nblocks,
-													buffers, read_pages);
-
-	if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH)
+	if (neon_use_communicator_worker)
 	{
-		compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
+		communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum,
+									  buffers, nblocks);
 	}
-	if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH && prefetch_result == nblocks)
+	else
 	{
-		return;
+		neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
+							  request_lsns, nblocks);
+
+		prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
+														blocknum, request_lsns, nblocks,
+														buffers, read_pages);
+
+		if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH)
+		{
+			compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
+		}
+		if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH && prefetch_result == nblocks)
+		{
+			return;
+		}
+		if (debug_compare_local > DEBUG_COMPARE_LOCAL_PREFETCH)
+		{
+			memset(read_pages, 0, sizeof(read_pages));
+		}
+
+		/* Try to read from local file cache */
+		lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
+									  nblocks, read_pages);
+
+		if (lfc_result > 0)
+			MyNeonCounters->file_cache_hits_total += lfc_result;
+
+		if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC)
+		{
+			compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
+		}
+		if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC && prefetch_result + lfc_result == nblocks)
+		{
+			/* Read all blocks from LFC, so we're done */
+			return;
+		}
+		if (debug_compare_local > DEBUG_COMPARE_LOCAL_LFC)
+		{
+			memset(read_pages, 0, sizeof(read_pages));
+		}
+
+		communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
+								  buffers, nblocks, read_pages);
+
+		/*
+		 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
+		 */
+		communicator_prefetch_pump_state();
 	}
-	if (debug_compare_local > DEBUG_COMPARE_LOCAL_PREFETCH)
-	{
-		memset(read_pages, 0, sizeof(read_pages));
-	}
-
-
-	/* Try to read from local file cache */
-	lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
-								  nblocks, read_pages);
-
-	if (lfc_result > 0)
-		MyNeonCounters->file_cache_hits_total += lfc_result;
-
-	if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC)
-	{
-		compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
-	}
-	if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC && prefetch_result + lfc_result == nblocks)
-	{
-		/* Read all blocks from LFC, so we're done */
-		return;
-	}
-	if (debug_compare_local > DEBUG_COMPARE_LOCAL_LFC)
-	{
-		memset(read_pages, 0, sizeof(read_pages));
-	}
-
-	communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
-							  buffers, nblocks, read_pages);
-
-	/*
-	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
-	 */
-	communicator_prefetch_pump_state();

 	if (debug_compare_local)
 	{
@@ -1661,9 +1811,16 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 		 forknum, blocknum,
 		 (uint32) (lsn >> 32), (uint32) lsn);

-	lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
+	if (neon_use_communicator_worker)
+	{
+		communicator_new_write_page(InfoFromSMgrRel(reln), forknum, blocknum, buffer, lsn);
+	}
+	else
+	{
+		lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);

-	communicator_prefetch_pump_state();
+		communicator_prefetch_pump_state();
+	}

 	if (debug_compare_local)
 	{
@@ -1724,9 +1881,21 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,

 	neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false);

-	lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
+	if (neon_use_communicator_worker)
+	{
+		for (int i = 0; i < nblocks; i++)
+		{
+			XLogRecPtr lsn = PageGetLSN((Page) buffers[i]);

-	communicator_prefetch_pump_state();
+			communicator_new_write_page(InfoFromSMgrRel(reln), forknum, blkno + i, buffers[i], lsn);
+		}
+	}
+	else
+	{
+		lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
+
+		communicator_prefetch_pump_state();
+	}

 	if (debug_compare_local)
 	{
@@ -1767,19 +1936,26 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
+	if (neon_use_communicator_worker)
 	{
-		neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
-			 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-			 forknum, n_blocks);
-		return n_blocks;
+		n_blocks = communicator_new_rel_nblocks(InfoFromSMgrRel(reln), forknum);
 	}
+	else
+	{
+		if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
+		{
+			neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
+					 RelFileInfoFmt(InfoFromSMgrRel(reln)),
+					 forknum, n_blocks);
+			return n_blocks;
+		}

-	neon_get_request_lsns(InfoFromSMgrRel(reln), forknum,
-						  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
+		neon_get_request_lsns(InfoFromSMgrRel(reln), forknum,
+							  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);

-	n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns);
-	update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
+		n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns);
+		update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
+	}

 	neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
 			 RelFileInfoFmt(InfoFromSMgrRel(reln)),
@@ -1800,10 +1976,17 @@ neon_dbsize(Oid dbNode)
 	neon_request_lsns request_lsns;
 	NRelFileInfo dummy_node = {0};

-	neon_get_request_lsns(dummy_node, MAIN_FORKNUM,
-						  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
+	if (neon_use_communicator_worker)
+	{
+		db_size = communicator_new_dbsize(dbNode);
+	}
+	else
+	{
+		neon_get_request_lsns(dummy_node, MAIN_FORKNUM,
+							  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);

-	db_size = communicator_dbsize(dbNode, &request_lsns);
+		db_size = communicator_dbsize(dbNode, &request_lsns);
+	}

 	neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
 			 dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
@@ -1817,8 +2000,6 @@ neon_dbsize(Oid dbNode)
 static void
 neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks)
 {
-	XLogRecPtr	lsn;
-
 	switch (reln->smgr_relpersistence)
 	{
 		case 0:
@@ -1842,34 +2023,45 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}

-	set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);
+	if (neon_use_communicator_worker)
+	{
+		XLogRecPtr	lsn = neon_get_write_lsn();

-	/*
-	 * Truncating a relation drops all its buffers from the buffer cache
-	 * without calling smgrwrite() on them. But we must account for that in
-	 * our tracking of last-written-LSN all the same: any future smgrnblocks()
-	 * request must return the new size after the truncation. We don't know
-	 * what the LSN of the truncation record was, so be conservative and use
-	 * the most recently inserted WAL record's LSN.
-	 */
-	lsn = GetXLogInsertRecPtr();
-	lsn = nm_adjust_lsn(lsn);
+		communicator_new_rel_truncate(InfoFromSMgrRel(reln), forknum, nblocks, lsn);
+	}
+	else
+	{
+		XLogRecPtr	lsn;

-	/*
-	 * Flush it, too. We don't actually care about it here, but let's uphold
-	 * the invariant that last-written LSN <= flush LSN.
-	 */
-	XLogFlush(lsn);
+		set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);

-	/*
-	 * Truncate may affect several chunks of relations. So we should either
-	 * update last written LSN for all of them, or update LSN for "dummy"
-	 * metadata block. Second approach seems more efficient. If the relation
-	 * is extended again later, the extension will update the last-written LSN
-	 * for the extended pages, so there's no harm in leaving behind obsolete
-	 * entries for the truncated chunks.
-	 */
-	neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum);
+		/*
+		 * Truncating a relation drops all its buffers from the buffer cache
+		 * without calling smgrwrite() on them. But we must account for that in
+		 * our tracking of last-written-LSN all the same: any future smgrnblocks()
+		 * request must return the new size after the truncation. We don't know
+		 * what the LSN of the truncation record was, so be conservative and use
+		 * the most recently inserted WAL record's LSN.
+		 */
+		lsn = GetXLogInsertRecPtr();
+		lsn = nm_adjust_lsn(lsn);
+
+		/*
+		 * Flush it, too. We don't actually care about it here, but let's uphold
+		 * the invariant that last-written LSN <= flush LSN.
+		 */
+		XLogFlush(lsn);
+
+		/*
+		 * Truncate may affect several chunks of relations. So we should either
+		 * update last written LSN for all of them, or update LSN for "dummy"
+		 * metadata block. Second approach seems more efficient. If the relation
+		 * is extended again later, the extension will update the last-written LSN
+		 * for the extended pages, so there's no harm in leaving behind obsolete
+		 * entries for the truncated chunks.
+		 */
+		neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum);
+	}

 	if (debug_compare_local)
 	{
@@ -1912,7 +2104,8 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)

 	neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");

-	communicator_prefetch_pump_state();
+	if (!neon_use_communicator_worker)
+		communicator_prefetch_pump_state();

 	if (debug_compare_local)
 	{
@@ -2098,12 +2291,15 @@ neon_end_unlogged_build(SMgrRelation reln)
 		nblocks = mdnblocks(reln, MAIN_FORKNUM);
 		recptr = GetXLogInsertRecPtr();

-		neon_set_lwlsn_block_range(recptr,
-								   InfoFromNInfoB(rinfob),
-								   MAIN_FORKNUM, 0, nblocks);
-		neon_set_lwlsn_relation(recptr,
-								InfoFromNInfoB(rinfob),
-								MAIN_FORKNUM);
+		if (!neon_use_communicator_worker)
+		{
+			neon_set_lwlsn_block_range(recptr,
+									   InfoFromNInfoB(rinfob),
+									   MAIN_FORKNUM, 0, nblocks);
+			neon_set_lwlsn_relation(recptr,
+									InfoFromNInfoB(rinfob),
+									MAIN_FORKNUM);
+		}

 		/* Remove local copy */
 		for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
@@ -2112,8 +2308,15 @@ neon_end_unlogged_build(SMgrRelation reln)
 				 RelFileInfoFmt(InfoFromNInfoB(rinfob)),
 				 forknum);

-			forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
-			lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
+			if (neon_use_communicator_worker)
+			{
+				communicator_new_update_cached_rel_size(InfoFromSMgrRel(reln), forknum, nblocks, recptr);
+			}
+			else
+			{
+				forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
+				lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
+			}

 			mdclose(reln, forknum);
 			if (!debug_compare_local)
@@ -2181,7 +2384,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 	request_lsns.not_modified_since = not_modified_since;
 	request_lsns.effective_request_lsn = request_lsn;

-	n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer);
+	if (neon_use_communicator_worker)
+		n_blocks = communicator_new_read_slru_segment(kind, (uint32_t)segno, &request_lsns, path);
+	else
+		n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer);

 	return n_blocks;
 }
@@ -2218,7 +2424,8 @@ AtEOXact_neon(XactEvent event, void *arg)
 			}
 			break;
 	}
-	communicator_reconfigure_timeout_if_needed();
+	if (!neon_use_communicator_worker)
+		communicator_reconfigure_timeout_if_needed();
 }

 static const struct f_smgr neon_smgr =
@@ -2276,7 +2483,10 @@ smgr_init_neon(void)

 	smgr_init_standard();
 	neon_init();
-	communicator_init();
+	if (neon_use_communicator_worker)
+		communicator_new_init();
+	else
+		communicator_init();
 }


@@ -2288,6 +2498,20 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 	/* This is only used in WAL replay */
 	Assert(RecoveryInProgress());

+	if (neon_use_communicator_worker)
+	{
+		relsize = communicator_new_rel_nblocks(rinfo, forknum);
+
+		if (blkno >= relsize)
+			communicator_new_rel_zeroextend(rinfo, forknum, relsize, (blkno - relsize) + 1, end_recptr);
+
+		/*
+		 * FIXME: does this need to update the last-written LSN too, like the
+		 * old implementation?
+		 */
+		return;
+	}
+
 	/* Extend the relation if we know its size */
 	if (get_cached_relsize(rinfo, forknum, &relsize))
 	{
@@ -2453,7 +2677,15 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 		 * We should perform this check after assigning LwLSN to prevent
 		 * prefetching of some older version of the page by some other backend.
 		 */
-		no_redo_needed = !lfc_cache_contains(rinfo, forknum, blkno);
+		/*
+		 * Redo may be skipped only if the page is not cached, in both modes.
+		 * NOTE(review): assumes communicator_new_cache_contains() returns true
+		 * when the page is cached, like lfc_cache_contains() -- confirm.
+		 */
+		if (neon_use_communicator_worker)
+			no_redo_needed = !communicator_new_cache_contains(rinfo, forknum, blkno);
+		else
+			no_redo_needed = !lfc_cache_contains(rinfo, forknum, blkno);
 	}

 	LWLockRelease(partitionLock);
--- a/pgxn/neon/relsize_cache.c
+++ b/pgxn/neon/relsize_cache.c
@@ -10,6 +10,7 @@
 */
 #include "postgres.h"

+#include "neon.h"
 #include "neon_pgversioncompat.h"

 #include "pagestore_client.h"
@@ -99,6 +100,8 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
 {
 	bool		found = false;

+	Assert(!neon_use_communicator_worker);
+
 	if (relsize_hash_size > 0)
 	{
 		RelTag		tag;
@@ -130,6 +133,8 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
 void
 set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
 {
+	Assert(!neon_use_communicator_worker);
+
 	if (relsize_hash_size > 0)
 	{
 		RelTag		tag;
@@ -178,6 +183,8 @@ set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
 void
 update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
 {
+	Assert(!neon_use_communicator_worker);
+
 	if (relsize_hash_size > 0)
 	{
 		RelTag		tag;
@@ -212,6 +219,8 @@ update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
 void
 forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum)
 {
+	Assert(!neon_use_communicator_worker);
+
 	if (relsize_hash_size > 0)
 	{
 		RelTag		tag;
--- a/storage_controller/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -5,8 +5,9 @@ use std::sync::Arc;
 use std::time::Duration;

 use anyhow::Context;
-use compute_api::spec::PageserverProtocol;
-use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
+use control_plane::endpoint::{
+    ComputeControlPlane, EndpointStatus, PageserverConnectionInfo, PageserverShardConnectionInfo,
+};
 use control_plane::local_env::LocalEnv;
 use futures::StreamExt;
 use hyper::StatusCode;
@@ -506,27 +507,40 @@ impl ApiMethod for ComputeHookTenant {
            if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running {
                tracing::info!("Reconfiguring pageservers for endpoint {endpoint_name}");

-                let pageservers = shards
-                    .iter()
-                    .map(|shard| {
-                        let ps_conf = env
-                            .get_pageserver_conf(shard.node_id)
-                            .expect("Unknown pageserver");
-                        if endpoint.grpc {
-                            let addr = ps_conf.listen_grpc_addr.as_ref().expect("no gRPC address");
-                            let (host, port) = parse_host_port(addr).expect("invalid gRPC address");
-                            let port = port.unwrap_or(DEFAULT_GRPC_LISTEN_PORT);
-                            (PageserverProtocol::Grpc, host, port)
-                        } else {
-                            let (host, port) = parse_host_port(&ps_conf.listen_pg_addr)
-                                .expect("Unable to parse listen_pg_addr");
-                            (PageserverProtocol::Libpq, host, port.unwrap_or(5432))
-                        }
-                    })
-                    .collect::<Vec<_>>();
+                let mut shard_conninfos = HashMap::new();
+                for shard in shards.iter() {
+                    let ps_conf = env
+                        .get_pageserver_conf(shard.node_id)
+                        .expect("Unknown pageserver");
+
+                    let libpq_url = Some({
+                        let (host, port) = parse_host_port(&ps_conf.listen_pg_addr)
+                            .expect("Unable to parse listen_pg_addr");
+                        let port = port.unwrap_or(5432);
+                        format!("postgres://no_user@{host}:{port}")
+                    });
+                    let grpc_url = if let Some(grpc_addr) = &ps_conf.listen_grpc_addr {
+                        let (host, port) =
+                            parse_host_port(grpc_addr).expect("invalid gRPC address");
+                        let port = port.unwrap_or(DEFAULT_GRPC_LISTEN_PORT);
+                        Some(format!("grpc://no_user@{host}:{port}"))
+                    } else {
+                        None
+                    };
+                    let pageserver = PageserverShardConnectionInfo {
+                        libpq_url,
+                        grpc_url,
+                    };
+                    shard_conninfos.insert(shard.shard_number.0 as u32, pageserver);
+                }
+
+                let pageserver_conninfo = PageserverConnectionInfo {
+                    shards: shard_conninfos,
+                    prefer_grpc: endpoint.grpc,
+                };

                endpoint
-                    .reconfigure_pageservers(pageservers, *stripe_size)
+                    .reconfigure_pageservers(pageserver_conninfo, *stripe_size)
                    .await
                    .map_err(NotifyError::NeonLocal)?;
            }
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4344,7 +4344,18 @@ class Endpoint(PgProtocol, LogUtils):

        # set small 'max_replication_write_lag' to enable backpressure
        # and make tests more stable.
-        config_lines = ["max_replication_write_lag=15MB"] + config_lines
+        config_lines += ["max_replication_write_lag=15MB"]
+
+        # If gRPC is enabled, use the new communicator worker too.
+        #
+        # NB: the communicator is enabled by default, so force it off otherwise.
+        #
+        # XXX: Because `grpc is None` is treated the same as `grpc=True` here, the
+        # new communicator is enabled for all tests by default.
+        if grpc or grpc is None:
+            config_lines += ["neon.use_communicator_worker=on"]
+        else:
+            config_lines += ["neon.use_communicator_worker=off"]

        # Delete file cache if it exists (and we're recreating the endpoint)
        if USE_LFC:
@@ -5402,6 +5413,7 @@ SKIP_FILES = frozenset(
        "postmaster.pid",
        "pg_control",
        "pg_dynshmem",
+        ".metrics.socket",
    )
 )

--- a/test_runner/regress/test_gin_redo.py
+++ b/test_runner/regress/test_gin_redo.py
@@ -16,6 +16,7 @@ def test_gin_redo(neon_simple_env: NeonEnv):
    secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")
    con = primary.connect()
    cur = con.cursor()
+    cur.execute("select pg_switch_wal()")
    cur.execute("create table gin_test_tbl(id integer, i int4[])")
    cur.execute("create index gin_test_idx on gin_test_tbl using gin (i)")
    cur.execute("insert into gin_test_tbl select g,array[3, 1, g] from generate_series(1, 10000) g")
--- a/test_runner/regress/test_normal_work.py
+++ b/test_runner/regress/test_normal_work.py
@@ -17,7 +17,9 @@ def check_tenant(
    config_lines = [
        f"neon.safekeeper_proto_version = {safekeeper_proto_version}",
    ]
-    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id, config_lines=config_lines)
+    endpoint = env.endpoints.create_start(
+        "main", tenant_id=tenant_id, config_lines=config_lines, grpc=True
+    )
    # we rely upon autocommit after each statement
    res_1 = endpoint.safe_psql_many(
        queries=[
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17