New communicator, with "integrated" cache accessible from all processes

Heikki Linnakangas
2025-04-29 11:52:44 +03:00
parent 11f6044338
commit e58d0fece1
57 changed files with 8418 additions and 385 deletions

Cargo.lock generated

@@ -253,6 +253,17 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
[[package]]
name = "atomic_enum"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99e1aca718ea7b89985790c94aad72d77533063fe00bc497bb79a7c2dae6a661"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]]
name = "autocfg"
version = "1.1.0"
@@ -687,13 +698,40 @@ dependencies = [
"tracing",
]
[[package]]
name = "axum"
version = "0.7.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
dependencies = [
"async-trait",
"axum-core 0.4.5",
"bytes",
"futures-util",
"http 1.1.0",
"http-body 1.0.0",
"http-body-util",
"itoa",
"matchit 0.7.3",
"memchr",
"mime",
"percent-encoding",
"pin-project-lite",
"rustversion",
"serde",
"sync_wrapper 1.0.1",
"tower 0.5.2",
"tower-layer",
"tower-service",
]
[[package]]
name = "axum"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d6fd624c75e18b3b4c6b9caf42b1afe24437daaee904069137d8bab077be8b8"
dependencies = [
"axum-core",
"axum-core 0.5.0",
"base64 0.22.1",
"bytes",
"form_urlencoded",
@@ -704,7 +742,7 @@ dependencies = [
"hyper 1.4.1",
"hyper-util",
"itoa",
"matchit",
"matchit 0.8.4",
"memchr",
"mime",
"percent-encoding",
@@ -724,6 +762,26 @@ dependencies = [
"tracing",
]
[[package]]
name = "axum-core"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199"
dependencies = [
"async-trait",
"bytes",
"futures-util",
"http 1.1.0",
"http-body 1.0.0",
"http-body-util",
"mime",
"pin-project-lite",
"rustversion",
"sync_wrapper 1.0.1",
"tower-layer",
"tower-service",
]
[[package]]
name = "axum-core"
version = "0.5.0"
@@ -750,8 +808,8 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b"
dependencies = [
"axum",
"axum-core",
"axum 0.8.1",
"axum-core 0.5.0",
"bytes",
"futures-util",
"headers",
@@ -1086,6 +1144,25 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "cbindgen"
version = "0.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eadd868a2ce9ca38de7eeafdcec9c7065ef89b42b32f0839278d55f35c54d1ff"
dependencies = [
"clap",
"heck 0.4.1",
"indexmap 2.9.0",
"log",
"proc-macro2",
"quote",
"serde",
"serde_json",
"syn 2.0.100",
"tempfile",
"toml",
]
[[package]]
name = "cc"
version = "1.2.16"
@@ -1206,7 +1283,7 @@ version = "4.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
dependencies = [
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.100",
@@ -1264,13 +1341,40 @@ dependencies = [
"unicode-width",
]
[[package]]
name = "communicator"
version = "0.1.0"
dependencies = [
"atomic_enum",
"bytes",
"cbindgen",
"http 1.1.0",
"libc",
"neonart",
"nix 0.27.1",
"pageserver_client_grpc",
"pageserver_data_api",
"prost 0.13.3",
"thiserror 1.0.69",
"tokio",
"tokio-epoll-uring",
"tokio-pipe",
"tonic",
"tracing",
"tracing-subscriber",
"uring-common",
"utils",
"zerocopy 0.8.24",
"zerocopy-derive 0.8.24",
]
[[package]]
name = "compute_api"
version = "0.1.0"
dependencies = [
"anyhow",
"chrono",
"indexmap 2.0.1",
"indexmap 2.9.0",
"jsonwebtoken",
"regex",
"remote_storage",
@@ -1288,7 +1392,7 @@ dependencies = [
"aws-sdk-kms",
"aws-sdk-s3",
"aws-smithy-types",
"axum",
"axum 0.8.1",
"axum-extra",
"base64 0.13.1",
"bytes",
@@ -1301,7 +1405,7 @@ dependencies = [
"flate2",
"futures",
"http 1.1.0",
"indexmap 2.0.1",
"indexmap 2.9.0",
"jsonwebtoken",
"metrics",
"nix 0.27.1",
@@ -1927,7 +2031,7 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc"
dependencies = [
"darling",
"either",
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.100",
@@ -2041,7 +2145,7 @@ name = "endpoint_storage"
version = "0.0.1"
dependencies = [
"anyhow",
"axum",
"axum 0.8.1",
"axum-extra",
"camino",
"camino-tempfile",
@@ -2588,7 +2692,7 @@ dependencies = [
"futures-sink",
"futures-util",
"http 0.2.9",
"indexmap 2.0.1",
"indexmap 2.9.0",
"slab",
"tokio",
"tokio-util",
@@ -2607,7 +2711,7 @@ dependencies = [
"futures-sink",
"futures-util",
"http 1.1.0",
"indexmap 2.0.1",
"indexmap 2.9.0",
"slab",
"tokio",
"tokio-util",
@@ -2703,6 +2807,12 @@ dependencies = [
"http 1.1.0",
]
[[package]]
name = "heck"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "heck"
version = "0.5.0"
@@ -3191,12 +3301,12 @@ dependencies = [
[[package]]
name = "indexmap"
version = "2.0.1"
version = "2.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e"
dependencies = [
"equivalent",
"hashbrown 0.14.5",
"hashbrown 0.15.2",
"serde",
]
@@ -3219,7 +3329,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88"
dependencies = [
"ahash",
"indexmap 2.0.1",
"indexmap 2.9.0",
"is-terminal",
"itoa",
"log",
@@ -3242,7 +3352,7 @@ dependencies = [
"crossbeam-utils",
"dashmap 6.1.0",
"env_logger",
"indexmap 2.0.1",
"indexmap 2.9.0",
"itoa",
"log",
"num-format",
@@ -3594,6 +3704,12 @@ dependencies = [
"regex-automata 0.1.10",
]
[[package]]
name = "matchit"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
[[package]]
name = "matchit"
version = "0.8.4"
@@ -3639,7 +3755,7 @@ version = "0.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
dependencies = [
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.100",
@@ -3785,6 +3901,15 @@ version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
[[package]]
name = "neonart"
version = "0.1.0"
dependencies = [
"rand 0.8.5",
"tracing",
"zerocopy 0.8.24",
]
[[package]]
name = "never-say-never"
version = "6.6.666"
@@ -4208,6 +4333,8 @@ dependencies = [
"humantime-serde",
"pageserver_api",
"pageserver_client",
"pageserver_client_grpc",
"pageserver_data_api",
"rand 0.8.5",
"reqwest",
"serde",
@@ -4284,6 +4411,8 @@ dependencies = [
"pageserver_api",
"pageserver_client",
"pageserver_compaction",
"pageserver_data_api",
"peekable",
"pem",
"pin-project-lite",
"postgres-protocol",
@@ -4295,6 +4424,7 @@ dependencies = [
"pprof",
"pq_proto",
"procfs",
"prost 0.13.3",
"rand 0.8.5",
"range-set-blaze",
"regex",
@@ -4326,6 +4456,7 @@ dependencies = [
"tokio-tar",
"tokio-util",
"toml_edit",
"tonic",
"tracing",
"tracing-utils",
"url",
@@ -4390,6 +4521,18 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "pageserver_client_grpc"
version = "0.1.0"
dependencies = [
"bytes",
"http 1.1.0",
"pageserver_data_api",
"thiserror 1.0.69",
"tonic",
"tracing",
]
[[package]]
name = "pageserver_compaction"
version = "0.1.0"
@@ -4413,6 +4556,17 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "pageserver_data_api"
version = "0.1.0"
dependencies = [
"prost 0.13.3",
"thiserror 1.0.69",
"tonic",
"tonic-build",
"utils",
]
[[package]]
name = "papaya"
version = "0.2.1"
@@ -4539,6 +4693,15 @@ dependencies = [
"sha2",
]
[[package]]
name = "peekable"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "225f9651e475709164f871dc2f5724956be59cb9edb055372ffeeab01ec2d20b"
dependencies = [
"smallvec",
]
[[package]]
name = "pem"
version = "3.0.3"
@@ -5010,7 +5173,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
dependencies = [
"bytes",
"heck",
"heck 0.5.0",
"itertools 0.12.1",
"log",
"multimap",
@@ -5031,7 +5194,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
dependencies = [
"bytes",
"heck",
"heck 0.5.0",
"itertools 0.12.1",
"log",
"multimap",
@@ -5134,7 +5297,7 @@ dependencies = [
"hyper 0.14.30",
"hyper 1.4.1",
"hyper-util",
"indexmap 2.0.1",
"indexmap 2.9.0",
"ipnet",
"itertools 0.10.5",
"itoa",
@@ -5645,7 +5808,7 @@ dependencies = [
"async-trait",
"getrandom 0.2.11",
"http 1.1.0",
"matchit",
"matchit 0.8.4",
"opentelemetry",
"reqwest",
"reqwest-middleware",
@@ -6806,7 +6969,7 @@ version = "0.26.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
dependencies = [
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"rustversion",
@@ -7231,6 +7394,16 @@ dependencies = [
"syn 2.0.100",
]
[[package]]
name = "tokio-pipe"
version = "0.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f213a84bffbd61b8fa0ba8a044b4bbe35d471d0b518867181e82bd5c15542784"
dependencies = [
"libc",
"tokio",
]
[[package]]
name = "tokio-postgres"
version = "0.7.10"
@@ -7413,7 +7586,7 @@ version = "0.22.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38"
dependencies = [
"indexmap 2.0.1",
"indexmap 2.9.0",
"serde",
"serde_spanned",
"toml_datetime",
@@ -7426,9 +7599,13 @@ version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52"
dependencies = [
"async-stream",
"async-trait",
"axum 0.7.9",
"base64 0.22.1",
"bytes",
"flate2",
"h2 0.4.4",
"http 1.1.0",
"http-body 1.0.0",
"http-body-util",
@@ -7440,6 +7617,7 @@ dependencies = [
"prost 0.13.3",
"rustls-native-certs 0.8.0",
"rustls-pemfile 2.1.1",
"socket2",
"tokio",
"tokio-rustls 0.26.0",
"tokio-stream",
@@ -7939,7 +8117,7 @@ name = "vm_monitor"
version = "0.1.0"
dependencies = [
"anyhow",
"axum",
"axum 0.8.1",
"cgroups-rs",
"clap",
"futures",
@@ -8449,7 +8627,7 @@ dependencies = [
"hyper 1.4.1",
"hyper-util",
"indexmap 1.9.3",
"indexmap 2.0.1",
"indexmap 2.9.0",
"itertools 0.12.1",
"lazy_static",
"libc",


@@ -8,6 +8,7 @@ members = [
"pageserver/compaction",
"pageserver/ctl",
"pageserver/client",
"pageserver/client_grpc",
"pageserver/pagebench",
"proxy",
"safekeeper",
@@ -29,6 +30,7 @@ members = [
"libs/pq_proto",
"libs/tenant_size_model",
"libs/metrics",
"libs/neonart",
"libs/postgres_connection",
"libs/remote_storage",
"libs/tracing-utils",
@@ -41,6 +43,7 @@ members = [
"libs/proxy/postgres-types2",
"libs/proxy/tokio-postgres2",
"endpoint_storage",
"pgxn/neon/communicator",
]
[workspace.package]
@@ -142,6 +145,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pem = "3.0.3"
peekable = "0.3.0"
pin-project-lite = "0.2"
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
procfs = "0.16"
@@ -187,7 +191,6 @@ thiserror = "1.0"
tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
tokio = { version = "1.43.1", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.12.0"
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
@@ -196,7 +199,7 @@ tokio-tar = "0.3"
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
toml = "0.8"
toml_edit = "0.22"
tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]}
tonic = {version = "0.12.3", default-features = false, features = ["channel", "server", "tls", "tls-roots", "gzip"]}
tower = { version = "0.5.2", default-features = false }
tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }
@@ -228,6 +231,9 @@ x509-cert = { version = "0.2.5" }
env_logger = "0.11"
log = "0.4"
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
@@ -245,9 +251,12 @@ compute_api = { version = "0.1", path = "./libs/compute_api/" }
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
http-utils = { version = "0.1", path = "./libs/http-utils/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
neonart = { version = "0.1", path = "./libs/neonart/" }
pageserver = { path = "./pageserver" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
pageserver_client = { path = "./pageserver/client" }
pageserver_client_grpc = { path = "./pageserver/client_grpc" }
pageserver_data_api = { path = "./pageserver/data_api" }
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
@@ -271,6 +280,7 @@ wal_decoder = { version = "0.1", path = "./libs/wal_decoder" }
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
## Build dependencies
cbindgen = "0.28.0"
criterion = "0.5.1"
rcgen = "0.13"
rstest = "0.18"


@@ -18,10 +18,12 @@ ifeq ($(BUILD_TYPE),release)
PG_LDFLAGS = $(LDFLAGS)
# Unfortunately, `--profile=...` is a nightly feature
CARGO_BUILD_FLAGS += --release
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release
else ifeq ($(BUILD_TYPE),debug)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
PG_CFLAGS += -O0 -g3 $(CFLAGS)
PG_LDFLAGS = $(LDFLAGS)
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug
else
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
endif
@@ -180,11 +182,16 @@ postgres-check-%: postgres-%
.PHONY: neon-pg-ext-%
neon-pg-ext-%: postgres-%
+@echo "Compiling communicator $*"
$(CARGO_CMD_PREFIX) cargo build -p communicator $(CARGO_BUILD_FLAGS)
+@echo "Compiling neon $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
LIBCOMMUNICATOR_PATH=$(NEON_CARGO_ARTIFACT_TARGET_DIR) \
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
+@echo "Compiling neon_walredo $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \

libs/neonart/Cargo.toml Normal file

@@ -0,0 +1,11 @@
[package]
name = "neonart"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
tracing.workspace = true
rand.workspace = true # for tests
zerocopy = "0.8"


@@ -0,0 +1,377 @@
mod lock_and_version;
mod node_ptr;
mod node_ref;
use std::vec::Vec;
use crate::algorithm::lock_and_version::ResultOrRestart;
use crate::algorithm::node_ptr::{MAX_PREFIX_LEN, NodePtr};
use crate::algorithm::node_ref::ChildOrValue;
use crate::algorithm::node_ref::{NodeRef, ReadLockedNodeRef, WriteLockedNodeRef};
use crate::epoch::EpochPin;
use crate::{Allocator, Key, Value};
pub(crate) type RootPtr<V> = node_ptr::NodePtr<V>;
pub fn new_root<V: Value>(allocator: &Allocator) -> RootPtr<V> {
node_ptr::new_root(allocator)
}
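// Look up 'key' in the tree. Retries internally until a lookup attempt completes
// without being disturbed by a concurrent writer.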
pub(crate) fn search<'e, K: Key, V: Value>(
key: &K,
root: RootPtr<V>,
epoch_pin: &'e EpochPin,
) -> Option<V> {
loop {
let root_ref = NodeRef::from_root_ptr(root);
if let Ok(result) = lookup_recurse(key.as_bytes(), root_ref, None, epoch_pin) {
break result;
}
// retry
}
}
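// Insert or update the entry for 'key'. 'value_fn' is called with the existing value (if any);
// if it returns Some, that value is stored, otherwise the tree is left unchanged. Like search(),
// this restarts internally on conflicts with concurrent writers.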
pub(crate) fn update_fn<'e, K: Key, V: Value, F>(
key: &K,
value_fn: F,
root: RootPtr<V>,
allocator: &Allocator,
epoch_pin: &'e EpochPin,
) where
F: FnOnce(Option<&V>) -> Option<V>,
{
let value_fn_cell = std::cell::Cell::new(Some(value_fn));
loop {
let root_ref = NodeRef::from_root_ptr(root);
let this_value_fn = |arg: Option<&V>| value_fn_cell.take().unwrap()(arg);
let key_bytes = key.as_bytes();
if let Ok(()) = update_recurse(
key_bytes,
this_value_fn,
root_ref,
None,
allocator,
epoch_pin,
0,
key_bytes,
) {
break;
}
// retry
}
}
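// Debug helper: prints the tree structure to stderr.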
pub(crate) fn dump_tree<'e, V: Value + std::fmt::Debug>(root: RootPtr<V>, epoch_pin: &'e EpochPin) {
let root_ref = NodeRef::from_root_ptr(root);
let _ = dump_recurse(&[], root_ref, &epoch_pin, 0);
}
// Error means you must retry.
//
// This corresponds to the 'lookupOpt' function in the paper
fn lookup_recurse<'e, V: Value>(
key: &[u8],
node: NodeRef<'e, V>,
parent: Option<ReadLockedNodeRef<V>>,
epoch_pin: &'e EpochPin,
) -> ResultOrRestart<Option<V>> {
let rnode = node.read_lock_or_restart()?;
if let Some(parent) = parent {
parent.read_unlock_or_restart()?;
}
// check if prefix matches, may increment level
let prefix_len = if let Some(prefix_len) = rnode.prefix_matches(key) {
prefix_len
} else {
rnode.read_unlock_or_restart()?;
return Ok(None);
};
let key = &key[prefix_len..];
// find child (or leaf value)
let next_node = rnode.find_child_or_value_or_restart(key[0])?;
match next_node {
None => Ok(None), // key not found
Some(ChildOrValue::Value(vptr)) => {
// safety: It's OK to follow the pointer because we checked the version.
let v = unsafe { (*vptr).clone() };
Ok(Some(v))
}
Some(ChildOrValue::Child(v)) => lookup_recurse(&key[1..], v, Some(rnode), epoch_pin),
}
}
// This corresponds to the 'insertOpt' function in the paper
pub(crate) fn update_recurse<'e, V: Value, F>(
key: &[u8],
value_fn: F,
node: NodeRef<'e, V>,
rparent: Option<(ReadLockedNodeRef<V>, u8)>,
allocator: &Allocator,
epoch_pin: &'e EpochPin,
level: usize,
orig_key: &[u8],
) -> ResultOrRestart<()>
where
F: FnOnce(Option<&V>) -> Option<V>,
{
let rnode = node.read_lock_or_restart()?;
let prefix_match_len = rnode.prefix_matches(key);
if prefix_match_len.is_none() {
let (rparent, parent_key) = rparent.expect("direct children of the root have no prefix");
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
if let Some(new_value) = value_fn(None) {
insert_split_prefix(
key,
new_value,
&mut wnode,
&mut wparent,
parent_key,
allocator,
);
}
wnode.write_unlock();
wparent.write_unlock();
return Ok(());
}
let prefix_match_len = prefix_match_len.unwrap();
let key = &key[prefix_match_len as usize..];
let level = level + prefix_match_len as usize;
let next_node = rnode.find_child_or_value_or_restart(key[0])?;
if next_node.is_none() {
if rnode.is_full() {
let (rparent, parent_key) = rparent.expect("root node cannot become full");
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
let wnode = rnode.upgrade_to_write_lock_or_restart()?;
if let Some(new_value) = value_fn(None) {
insert_and_grow(key, new_value, &wnode, &mut wparent, parent_key, allocator);
wnode.write_unlock_obsolete();
wparent.write_unlock();
} else {
wnode.write_unlock();
wparent.write_unlock();
}
} else {
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
if let Some((rparent, _)) = rparent {
rparent.read_unlock_or_restart()?;
}
if let Some(new_value) = value_fn(None) {
insert_to_node(&mut wnode, key, new_value, allocator);
}
wnode.write_unlock();
}
return Ok(());
} else {
let next_node = next_node.unwrap(); // checked above it's not None
if let Some((rparent, _)) = rparent {
rparent.read_unlock_or_restart()?;
}
match next_node {
ChildOrValue::Value(existing_value_ptr) => {
assert!(key.len() == 1);
let wnode = rnode.upgrade_to_write_lock_or_restart()?;
// safety: Now that we have acquired the write lock, we have exclusive access to the
// value
let vmut = unsafe { existing_value_ptr.cast_mut().as_mut() }.unwrap();
if let Some(new_value) = value_fn(Some(vmut)) {
*vmut = new_value;
} else {
// TODO: Treat this as deletion?
}
wnode.write_unlock();
Ok(())
}
ChildOrValue::Child(next_child) => {
// recurse to next level
update_recurse(
&key[1..],
value_fn,
next_child,
Some((rnode, key[0])),
allocator,
epoch_pin,
level + 1,
orig_key,
)
}
}
}
}
#[derive(Clone)]
enum PathElement {
Prefix(Vec<u8>),
KeyByte(u8),
}
impl std::fmt::Debug for PathElement {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
match self {
PathElement::Prefix(prefix) => write!(fmt, "{:?}", prefix),
PathElement::KeyByte(key_byte) => write!(fmt, "{}", key_byte),
}
}
}
fn dump_recurse<'e, V: Value + std::fmt::Debug>(
path: &[PathElement],
node: NodeRef<'e, V>,
epoch_pin: &'e EpochPin,
level: usize,
) -> ResultOrRestart<()> {
let indent = str::repeat(" ", level);
let rnode = node.read_lock_or_restart()?;
let mut path = Vec::from(path);
let prefix = rnode.get_prefix();
if prefix.len() != 0 {
path.push(PathElement::Prefix(Vec::from(prefix)));
}
for key_byte in 0..=u8::MAX {
match rnode.find_child_or_value_or_restart(key_byte)? {
None => continue,
Some(ChildOrValue::Child(child_ref)) => {
let rchild = child_ref.read_lock_or_restart()?;
eprintln!(
"{} {:?}, {}: prefix {:?}",
indent,
&path,
key_byte,
rchild.get_prefix()
);
let mut child_path = path.clone();
child_path.push(PathElement::KeyByte(key_byte));
dump_recurse(&child_path, child_ref, epoch_pin, level + 1)?;
}
Some(ChildOrValue::Value(val)) => {
eprintln!("{} {:?}, {}: {:?}", indent, path, key_byte, unsafe {
val.as_ref().unwrap()
});
}
}
}
Ok(())
}
///```text
/// [fooba]r -> value
///
/// [foo]b -> [a]r -> value
/// e -> [ls]e -> value
///```
fn insert_split_prefix<'a, V: Value>(
key: &[u8],
value: V,
node: &mut WriteLockedNodeRef<V>,
parent: &mut WriteLockedNodeRef<V>,
parent_key: u8,
allocator: &Allocator,
) {
let old_node = node;
let old_prefix = old_node.get_prefix();
let common_prefix_len = common_prefix(key, old_prefix);
// Allocate a node for the new value.
let new_value_node = allocate_node_for_value(&key[common_prefix_len + 1..], value, allocator);
// Allocate a new internal node with the common prefix
let mut prefix_node = node_ref::new_internal(&key[..common_prefix_len], allocator);
// Add the old node and the new nodes to the new internal node
prefix_node.insert_child(old_prefix[common_prefix_len], old_node.as_ptr());
prefix_node.insert_child(key[common_prefix_len], new_value_node);
// Modify the prefix of the old child in place
old_node.truncate_prefix(old_prefix.len() - common_prefix_len - 1);
// replace the pointer in the parent
parent.replace_child(parent_key, prefix_node.into_ptr());
}
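// Insert 'value' into 'wnode', which is known to have free space. A leaf stores the value
// directly; an internal node gets a new child node holding the remaining key bytes.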
fn insert_to_node<V: Value>(
wnode: &mut WriteLockedNodeRef<V>,
key: &[u8],
value: V,
allocator: &Allocator,
) {
if wnode.is_leaf() {
wnode.insert_value(key[0], value);
} else {
let value_child = allocate_node_for_value(&key[1..], value, allocator);
wnode.insert_child(key[0], value_child);
}
}
// On entry: 'parent' and 'node' are locked
fn insert_and_grow<V: Value>(
key: &[u8],
value: V,
wnode: &WriteLockedNodeRef<V>,
parent: &mut WriteLockedNodeRef<V>,
parent_key_byte: u8,
allocator: &Allocator,
) {
let mut bigger_node = wnode.grow(allocator);
if wnode.is_leaf() {
bigger_node.insert_value(key[0], value);
} else {
let value_child = allocate_node_for_value(&key[1..], value, allocator);
bigger_node.insert_child(key[0], value_child);
}
// Replace the pointer in the parent
parent.replace_child(parent_key_byte, bigger_node.into_ptr());
}
// Allocate a new leaf node to hold 'value'. If the key is long, we may also need to allocate
// new internal nodes to hold it.
fn allocate_node_for_value<V: Value>(key: &[u8], value: V, allocator: &Allocator) -> NodePtr<V> {
let mut prefix_off = key.len().saturating_sub(MAX_PREFIX_LEN + 1);
let mut leaf_node = node_ref::new_leaf(&key[prefix_off..key.len() - 1], allocator);
leaf_node.insert_value(*key.last().unwrap(), value);
let mut node = leaf_node;
while prefix_off > 0 {
// Need another internal node
let remain_prefix = &key[0..prefix_off];
prefix_off = remain_prefix.len().saturating_sub(MAX_PREFIX_LEN + 1);
let mut internal_node = node_ref::new_internal(
&remain_prefix[prefix_off..remain_prefix.len() - 1],
allocator,
);
internal_node.insert_child(*remain_prefix.last().unwrap(), node.into_ptr());
node = internal_node;
}
node.into_ptr()
}
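// Length of the common prefix of 'a' and 'b'. Callers only invoke this when the two prefixes
// are known to diverge within MAX_PREFIX_LEN bytes, hence the panic if they are equal.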
fn common_prefix(a: &[u8], b: &[u8]) -> usize {
for i in 0..MAX_PREFIX_LEN {
if a[i] != b[i] {
return i;
}
}
panic!("prefixes are equal");
}


@@ -0,0 +1,85 @@
use std::sync::atomic::{AtomicU64, Ordering};
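// Lock word layout, as manipulated by the functions below:
//   bit 0: obsolete flag (the node has been replaced and must not be used anymore)
//   bit 1: write-lock bit
//   bits 2..: version counter, bumped on every write-unlock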
pub(crate) struct AtomicLockAndVersion {
inner: AtomicU64,
}
impl AtomicLockAndVersion {
pub(crate) fn new() -> AtomicLockAndVersion {
AtomicLockAndVersion {
inner: AtomicU64::new(0),
}
}
}
pub(crate) type ResultOrRestart<T> = Result<T, ()>;
const fn restart<T>() -> ResultOrRestart<T> {
Err(())
}
impl AtomicLockAndVersion {
pub(crate) fn read_lock_or_restart(&self) -> ResultOrRestart<u64> {
let version = self.await_node_unlocked();
if is_obsolete(version) {
return restart();
}
Ok(version)
}
pub(crate) fn check_or_restart(&self, version: u64) -> ResultOrRestart<()> {
self.read_unlock_or_restart(version)
}
pub(crate) fn read_unlock_or_restart(&self, version: u64) -> ResultOrRestart<()> {
if self.inner.load(Ordering::Acquire) != version {
return restart();
}
Ok(())
}
pub(crate) fn upgrade_to_write_lock_or_restart(&self, version: u64) -> ResultOrRestart<()> {
if self
.inner
.compare_exchange(
version,
set_locked_bit(version),
Ordering::Acquire,
Ordering::Relaxed,
)
.is_err()
{
return restart();
}
Ok(())
}
pub(crate) fn write_unlock(&self) {
// reset locked bit and overflow into version
self.inner.fetch_add(2, Ordering::Release);
}
pub(crate) fn write_unlock_obsolete(&self) {
// set obsolete, reset locked, overflow into version
self.inner.fetch_add(3, Ordering::Release);
}
// Helper functions
fn await_node_unlocked(&self) -> u64 {
let mut version = self.inner.load(Ordering::Acquire);
while (version & 2) == 2 {
// spinlock
std::thread::yield_now();
version = self.inner.load(Ordering::Acquire)
}
version
}
}
fn set_locked_bit(version: u64) -> u64 {
version + 2
}
fn is_obsolete(version: u64) -> bool {
(version & 1) == 1
}
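For orientation, a minimal sketch (not part of this commit; the name optimistic_read is illustrative) of the optimistic-read protocol that the helpers above implement: snapshot the version word, perform the read, then verify that the word is unchanged. Unlike read_lock_or_restart() above, the sketch simply restarts when the node is locked instead of spinning.
use std::sync::atomic::{AtomicU64, Ordering};
// Illustrative only. Bit 0 = obsolete, bit 1 = locked, as in the module above.
fn optimistic_read<T: Copy>(lock_word: &AtomicU64, data: *const T) -> Result<T, ()> {
    let before = lock_word.load(Ordering::Acquire);
    if before & 0b11 != 0 {
        return Err(()); // locked or obsolete: the caller restarts from the root
    }
    // Safety (in the real tree): the EpochPin keeps the node alive, and the version
    // re-check below detects any concurrent modification.
    let value = unsafe { *data };
    if lock_word.load(Ordering::Acquire) != before {
        return Err(()); // a writer intervened: the caller restarts
    }
    Ok(value)
}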


@@ -0,0 +1,983 @@
use std::marker::PhantomData;
use std::ptr::NonNull;
use super::lock_and_version::AtomicLockAndVersion;
use crate::Allocator;
use crate::Value;
pub(crate) const MAX_PREFIX_LEN: usize = 8;
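// The four node sizes from the ART paper (4, 16, 48 and 256 entries), duplicated as
// internal and leaf variants because values are stored directly in the leaf nodes.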
enum NodeTag {
Internal4,
Internal16,
Internal48,
Internal256,
Leaf4,
Leaf16,
Leaf48,
Leaf256,
}
#[repr(C)]
struct NodeBase {
tag: NodeTag,
lock_and_version: AtomicLockAndVersion,
}
pub(crate) struct NodePtr<V> {
ptr: *mut NodeBase,
phantom_value: PhantomData<V>,
}
impl<V> std::fmt::Debug for NodePtr<V> {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
write!(fmt, "0x{}", self.ptr.addr())
}
}
impl<V> Copy for NodePtr<V> {}
impl<V> Clone for NodePtr<V> {
fn clone(&self) -> NodePtr<V> {
NodePtr {
ptr: self.ptr,
phantom_value: PhantomData,
}
}
}
enum NodeVariant<'a, V> {
Internal4(&'a NodeInternal4<V>),
Internal16(&'a NodeInternal16<V>),
Internal48(&'a NodeInternal48<V>),
Internal256(&'a NodeInternal256<V>),
Leaf4(&'a NodeLeaf4<V>),
Leaf16(&'a NodeLeaf16<V>),
Leaf48(&'a NodeLeaf48<V>),
Leaf256(&'a NodeLeaf256<V>),
}
enum NodeVariantMut<'a, V> {
Internal4(&'a mut NodeInternal4<V>),
Internal16(&'a mut NodeInternal16<V>),
Internal48(&'a mut NodeInternal48<V>),
Internal256(&'a mut NodeInternal256<V>),
Leaf4(&'a mut NodeLeaf4<V>),
Leaf16(&'a mut NodeLeaf16<V>),
Leaf48(&'a mut NodeLeaf48<V>),
Leaf256(&'a mut NodeLeaf256<V>),
}
pub(crate) enum ChildOrValuePtr<V> {
Child(NodePtr<V>),
Value(*const V),
}
#[repr(C)]
struct NodeInternal4<V> {
tag: NodeTag,
lock_and_version: AtomicLockAndVersion,
prefix: [u8; MAX_PREFIX_LEN],
prefix_len: u8,
num_children: u8,
child_keys: [u8; 4],
child_ptrs: [NodePtr<V>; 4],
}
#[repr(C)]
struct NodeInternal16<V> {
tag: NodeTag,
lock_and_version: AtomicLockAndVersion,
prefix: [u8; MAX_PREFIX_LEN],
prefix_len: u8,
num_children: u8,
child_keys: [u8; 16],
child_ptrs: [NodePtr<V>; 16],
}
const INVALID_CHILD_INDEX: u8 = u8::MAX;
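// In the 48-entry variants, 'child_indexes' maps a key byte to a slot in 'child_ptrs' /
// 'child_values'; INVALID_CHILD_INDEX marks key bytes with no entry.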
#[repr(C)]
struct NodeInternal48<V> {
tag: NodeTag,
lock_and_version: AtomicLockAndVersion,
prefix: [u8; MAX_PREFIX_LEN],
prefix_len: u8,
num_children: u8,
child_indexes: [u8; 256],
child_ptrs: [NodePtr<V>; 48],
}
#[repr(C)]
pub(crate) struct NodeInternal256<V> {
tag: NodeTag,
lock_and_version: AtomicLockAndVersion,
prefix: [u8; MAX_PREFIX_LEN],
prefix_len: u8,
num_children: u16,
child_ptrs: [NodePtr<V>; 256],
}
#[repr(C)]
struct NodeLeaf4<V> {
tag: NodeTag,
lock_and_version: AtomicLockAndVersion,
prefix: [u8; MAX_PREFIX_LEN],
prefix_len: u8,
num_values: u8,
child_keys: [u8; 4],
child_values: [Option<V>; 4],
}
#[repr(C)]
struct NodeLeaf16<V> {
tag: NodeTag,
lock_and_version: AtomicLockAndVersion,
prefix: [u8; MAX_PREFIX_LEN],
prefix_len: u8,
num_values: u8,
child_keys: [u8; 16],
child_values: [Option<V>; 16],
}
#[repr(C)]
struct NodeLeaf48<V> {
tag: NodeTag,
lock_and_version: AtomicLockAndVersion,
prefix: [u8; MAX_PREFIX_LEN],
prefix_len: u8,
num_values: u8,
child_indexes: [u8; 256],
child_values: [Option<V>; 48],
}
#[repr(C)]
struct NodeLeaf256<V> {
tag: NodeTag,
lock_and_version: AtomicLockAndVersion,
prefix: [u8; MAX_PREFIX_LEN],
prefix_len: u8,
num_values: u16,
child_values: [Option<V>; 256],
}
impl<V> NodePtr<V> {
pub(crate) fn is_leaf(&self) -> bool {
match self.variant() {
NodeVariant::Internal4(_) => false,
NodeVariant::Internal16(_) => false,
NodeVariant::Internal48(_) => false,
NodeVariant::Internal256(_) => false,
NodeVariant::Leaf4(_) => true,
NodeVariant::Leaf16(_) => true,
NodeVariant::Leaf48(_) => true,
NodeVariant::Leaf256(_) => true,
}
}
pub(crate) fn lockword(&self) -> &AtomicLockAndVersion {
match self.variant() {
NodeVariant::Internal4(n) => &n.lock_and_version,
NodeVariant::Internal16(n) => &n.lock_and_version,
NodeVariant::Internal48(n) => &n.lock_and_version,
NodeVariant::Internal256(n) => &n.lock_and_version,
NodeVariant::Leaf4(n) => &n.lock_and_version,
NodeVariant::Leaf16(n) => &n.lock_and_version,
NodeVariant::Leaf48(n) => &n.lock_and_version,
NodeVariant::Leaf256(n) => &n.lock_and_version,
}
}
pub(crate) fn is_null(&self) -> bool {
self.ptr.is_null()
}
pub(crate) const fn null() -> NodePtr<V> {
NodePtr {
ptr: std::ptr::null_mut(),
phantom_value: PhantomData,
}
}
fn variant(&self) -> NodeVariant<V> {
unsafe {
match (*self.ptr).tag {
NodeTag::Internal4 => NodeVariant::Internal4(
NonNull::new_unchecked(self.ptr.cast::<NodeInternal4<V>>()).as_ref(),
),
NodeTag::Internal16 => NodeVariant::Internal16(
NonNull::new_unchecked(self.ptr.cast::<NodeInternal16<V>>()).as_ref(),
),
NodeTag::Internal48 => NodeVariant::Internal48(
NonNull::new_unchecked(self.ptr.cast::<NodeInternal48<V>>()).as_ref(),
),
NodeTag::Internal256 => NodeVariant::Internal256(
NonNull::new_unchecked(self.ptr.cast::<NodeInternal256<V>>()).as_ref(),
),
NodeTag::Leaf4 => NodeVariant::Leaf4(
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf4<V>>()).as_ref(),
),
NodeTag::Leaf16 => NodeVariant::Leaf16(
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf16<V>>()).as_ref(),
),
NodeTag::Leaf48 => NodeVariant::Leaf48(
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf48<V>>()).as_ref(),
),
NodeTag::Leaf256 => NodeVariant::Leaf256(
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf256<V>>()).as_ref(),
),
}
}
}
fn variant_mut(&mut self) -> NodeVariantMut<V> {
unsafe {
match (*self.ptr).tag {
NodeTag::Internal4 => NodeVariantMut::Internal4(
NonNull::new_unchecked(self.ptr.cast::<NodeInternal4<V>>()).as_mut(),
),
NodeTag::Internal16 => NodeVariantMut::Internal16(
NonNull::new_unchecked(self.ptr.cast::<NodeInternal16<V>>()).as_mut(),
),
NodeTag::Internal48 => NodeVariantMut::Internal48(
NonNull::new_unchecked(self.ptr.cast::<NodeInternal48<V>>()).as_mut(),
),
NodeTag::Internal256 => NodeVariantMut::Internal256(
NonNull::new_unchecked(self.ptr.cast::<NodeInternal256<V>>()).as_mut(),
),
NodeTag::Leaf4 => NodeVariantMut::Leaf4(
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf4<V>>()).as_mut(),
),
NodeTag::Leaf16 => NodeVariantMut::Leaf16(
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf16<V>>()).as_mut(),
),
NodeTag::Leaf48 => NodeVariantMut::Leaf48(
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf48<V>>()).as_mut(),
),
NodeTag::Leaf256 => NodeVariantMut::Leaf256(
NonNull::new_unchecked(self.ptr.cast::<NodeLeaf256<V>>()).as_mut(),
),
}
}
}
}
impl<V: Value> NodePtr<V> {
pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
let node_prefix = self.get_prefix();
assert!(node_prefix.len() <= key.len()); // because we only use fixed-size keys
if &key[0..node_prefix.len()] != node_prefix {
None
} else {
Some(node_prefix.len())
}
}
pub(crate) fn get_prefix(&self) -> &[u8] {
match self.variant() {
NodeVariant::Internal4(n) => n.get_prefix(),
NodeVariant::Internal16(n) => n.get_prefix(),
NodeVariant::Internal48(n) => n.get_prefix(),
NodeVariant::Internal256(n) => n.get_prefix(),
NodeVariant::Leaf4(n) => n.get_prefix(),
NodeVariant::Leaf16(n) => n.get_prefix(),
NodeVariant::Leaf48(n) => n.get_prefix(),
NodeVariant::Leaf256(n) => n.get_prefix(),
}
}
pub(crate) fn is_full(&self) -> bool {
match self.variant() {
NodeVariant::Internal4(n) => n.is_full(),
NodeVariant::Internal16(n) => n.is_full(),
NodeVariant::Internal48(n) => n.is_full(),
NodeVariant::Internal256(n) => n.is_full(),
NodeVariant::Leaf4(n) => n.is_full(),
NodeVariant::Leaf16(n) => n.is_full(),
NodeVariant::Leaf48(n) => n.is_full(),
NodeVariant::Leaf256(n) => n.is_full(),
}
}
pub(crate) fn find_child_or_value(&self, key_byte: u8) -> Option<ChildOrValuePtr<V>> {
match self.variant() {
NodeVariant::Internal4(n) => n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c)),
NodeVariant::Internal16(n) => n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c)),
NodeVariant::Internal48(n) => n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c)),
NodeVariant::Internal256(n) => {
n.find_child(key_byte).map(|c| ChildOrValuePtr::Child(c))
}
NodeVariant::Leaf4(n) => n
.get_leaf_value(key_byte)
.map(|v| ChildOrValuePtr::Value(v)),
NodeVariant::Leaf16(n) => n
.get_leaf_value(key_byte)
.map(|v| ChildOrValuePtr::Value(v)),
NodeVariant::Leaf48(n) => n
.get_leaf_value(key_byte)
.map(|v| ChildOrValuePtr::Value(v)),
NodeVariant::Leaf256(n) => n
.get_leaf_value(key_byte)
.map(|v| ChildOrValuePtr::Value(v)),
}
}
pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
match self.variant_mut() {
NodeVariantMut::Internal4(n) => n.truncate_prefix(new_prefix_len),
NodeVariantMut::Internal16(n) => n.truncate_prefix(new_prefix_len),
NodeVariantMut::Internal48(n) => n.truncate_prefix(new_prefix_len),
NodeVariantMut::Internal256(n) => n.truncate_prefix(new_prefix_len),
NodeVariantMut::Leaf4(n) => n.truncate_prefix(new_prefix_len),
NodeVariantMut::Leaf16(n) => n.truncate_prefix(new_prefix_len),
NodeVariantMut::Leaf48(n) => n.truncate_prefix(new_prefix_len),
NodeVariantMut::Leaf256(n) => n.truncate_prefix(new_prefix_len),
}
}
pub(crate) fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
match self.variant() {
NodeVariant::Internal4(n) => n.grow(allocator),
NodeVariant::Internal16(n) => n.grow(allocator),
NodeVariant::Internal48(n) => n.grow(allocator),
NodeVariant::Internal256(_) => panic!("cannot grow Internal256 node"),
NodeVariant::Leaf4(n) => n.grow(allocator),
NodeVariant::Leaf16(n) => n.grow(allocator),
NodeVariant::Leaf48(n) => n.grow(allocator),
NodeVariant::Leaf256(_) => panic!("cannot grow Leaf256 node"),
}
}
pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
match self.variant_mut() {
NodeVariantMut::Internal4(n) => n.insert_child(key_byte, child),
NodeVariantMut::Internal16(n) => n.insert_child(key_byte, child),
NodeVariantMut::Internal48(n) => n.insert_child(key_byte, child),
NodeVariantMut::Internal256(n) => n.insert_child(key_byte, child),
NodeVariantMut::Leaf4(_)
| NodeVariantMut::Leaf16(_)
| NodeVariantMut::Leaf48(_)
| NodeVariantMut::Leaf256(_) => panic!("insert_child called on leaf node"),
}
}
pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
match self.variant_mut() {
NodeVariantMut::Internal4(n) => n.replace_child(key_byte, replacement),
NodeVariantMut::Internal16(n) => n.replace_child(key_byte, replacement),
NodeVariantMut::Internal48(n) => n.replace_child(key_byte, replacement),
NodeVariantMut::Internal256(n) => n.replace_child(key_byte, replacement),
NodeVariantMut::Leaf4(_)
| NodeVariantMut::Leaf16(_)
| NodeVariantMut::Leaf48(_)
| NodeVariantMut::Leaf256(_) => panic!("replace_child called on leaf node"),
}
}
pub(crate) fn insert_value(&mut self, key_byte: u8, value: V) {
match self.variant_mut() {
NodeVariantMut::Internal4(_)
| NodeVariantMut::Internal16(_)
| NodeVariantMut::Internal48(_)
| NodeVariantMut::Internal256(_) => panic!("insert_value called on internal node"),
NodeVariantMut::Leaf4(n) => n.insert_value(key_byte, value),
NodeVariantMut::Leaf16(n) => n.insert_value(key_byte, value),
NodeVariantMut::Leaf48(n) => n.insert_value(key_byte, value),
NodeVariantMut::Leaf256(n) => n.insert_value(key_byte, value),
}
}
}
pub fn new_root<V: Value>(allocator: &Allocator) -> NodePtr<V> {
NodePtr {
ptr: allocator.alloc(NodeInternal256::<V>::new()).as_ptr().cast(),
phantom_value: PhantomData,
}
}
pub fn new_internal<V: Value>(prefix: &[u8], allocator: &Allocator) -> NodePtr<V> {
let mut node = allocator.alloc(NodeInternal4 {
tag: NodeTag::Internal4,
lock_and_version: AtomicLockAndVersion::new(),
prefix: [8; MAX_PREFIX_LEN],
prefix_len: prefix.len() as u8,
num_children: 0,
child_keys: [0; 4],
child_ptrs: [const { NodePtr::null() }; 4],
});
node.prefix[0..prefix.len()].copy_from_slice(prefix);
node.as_ptr().into()
}
pub fn new_leaf<V: Value>(prefix: &[u8], allocator: &Allocator) -> NodePtr<V> {
let mut node = allocator.alloc(NodeLeaf4 {
tag: NodeTag::Leaf4,
lock_and_version: AtomicLockAndVersion::new(),
prefix: [8; MAX_PREFIX_LEN],
prefix_len: prefix.len() as u8,
num_values: 0,
child_keys: [0; 4],
child_values: [const { None }; 4],
});
node.prefix[0..prefix.len()].copy_from_slice(prefix);
node.as_ptr().into()
}
impl<V: Value> NodeInternal4<V> {
fn get_prefix(&self) -> &[u8] {
&self.prefix[0..self.prefix_len as usize]
}
fn truncate_prefix(&mut self, new_prefix_len: usize) {
assert!(new_prefix_len < self.prefix_len as usize);
let prefix = &mut self.prefix;
let offset = self.prefix_len as usize - new_prefix_len;
for i in 0..new_prefix_len {
prefix[i] = prefix[i + offset];
}
self.prefix_len = new_prefix_len as u8;
}
fn find_child(&self, key: u8) -> Option<NodePtr<V>> {
for i in 0..self.num_children as usize {
if self.child_keys[i] == key {
return Some(self.child_ptrs[i]);
}
}
None
}
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
for i in 0..self.num_children as usize {
if self.child_keys[i] == key_byte {
self.child_ptrs[i] = replacement;
return;
}
}
panic!("could not re-find parent with key {}", key_byte);
}
fn is_full(&self) -> bool {
self.num_children == 4
}
fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
assert!(self.num_children < 4);
let idx = self.num_children as usize;
self.child_keys[idx] = key_byte;
self.child_ptrs[idx] = child;
self.num_children += 1;
}
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
let mut node16 = allocator.alloc(NodeInternal16 {
tag: NodeTag::Internal16,
lock_and_version: AtomicLockAndVersion::new(),
prefix: self.prefix.clone(),
prefix_len: self.prefix_len,
num_children: self.num_children,
child_keys: [0; 16],
child_ptrs: [const { NodePtr::null() }; 16],
});
for i in 0..self.num_children as usize {
node16.child_keys[i] = self.child_keys[i];
node16.child_ptrs[i] = self.child_ptrs[i];
}
node16.as_ptr().into()
}
}
impl<V: Value> NodeInternal16<V> {
fn get_prefix(&self) -> &[u8] {
&self.prefix[0..self.prefix_len as usize]
}
fn truncate_prefix(&mut self, new_prefix_len: usize) {
assert!(new_prefix_len < self.prefix_len as usize);
let prefix = &mut self.prefix;
let offset = self.prefix_len as usize - new_prefix_len;
for i in 0..new_prefix_len {
prefix[i] = prefix[i + offset];
}
self.prefix_len = new_prefix_len as u8;
}
fn find_child(&self, key_byte: u8) -> Option<NodePtr<V>> {
for i in 0..self.num_children as usize {
if self.child_keys[i] == key_byte {
return Some(self.child_ptrs[i]);
}
}
None
}
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
for i in 0..self.num_children as usize {
if self.child_keys[i] == key_byte {
self.child_ptrs[i] = replacement;
return;
}
}
panic!("could not re-find parent with key {}", key_byte);
}
fn is_full(&self) -> bool {
self.num_children == 16
}
fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
assert!(self.num_children < 16);
let idx = self.num_children as usize;
self.child_keys[idx] = key_byte;
self.child_ptrs[idx] = child;
self.num_children += 1;
}
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
let mut node48 = allocator.alloc(NodeInternal48 {
tag: NodeTag::Internal48,
lock_and_version: AtomicLockAndVersion::new(),
prefix: self.prefix.clone(),
prefix_len: self.prefix_len,
num_children: self.num_children,
child_indexes: [INVALID_CHILD_INDEX; 256],
child_ptrs: [const { NodePtr::null() }; 48],
});
for i in 0..self.num_children as usize {
let idx = self.child_keys[i] as usize;
node48.child_indexes[idx] = i as u8;
node48.child_ptrs[i] = self.child_ptrs[i];
}
node48.as_ptr().into()
}
}
impl<V: Value> NodeInternal48<V> {
fn get_prefix(&self) -> &[u8] {
&self.prefix[0..self.prefix_len as usize]
}
fn truncate_prefix(&mut self, new_prefix_len: usize) {
assert!(new_prefix_len < self.prefix_len as usize);
let prefix = &mut self.prefix;
let offset = self.prefix_len as usize - new_prefix_len;
for i in 0..new_prefix_len {
prefix[i] = prefix[i + offset];
}
self.prefix_len = new_prefix_len as u8;
}
fn find_child(&self, key_byte: u8) -> Option<NodePtr<V>> {
let idx = self.child_indexes[key_byte as usize];
if idx != INVALID_CHILD_INDEX {
Some(self.child_ptrs[idx as usize])
} else {
None
}
}
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
let idx = self.child_indexes[key_byte as usize];
if idx != INVALID_CHILD_INDEX {
self.child_ptrs[idx as usize] = replacement
} else {
panic!("could not re-find parent with key {}", key_byte);
}
}
fn is_full(&self) -> bool {
self.num_children == 48
}
fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
assert!(self.num_children < 48);
assert!(self.child_indexes[key_byte as usize] == INVALID_CHILD_INDEX);
let idx = self.num_children;
self.child_indexes[key_byte as usize] = idx;
self.child_ptrs[idx as usize] = child;
self.num_children += 1;
}
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
let mut node256 = allocator.alloc(NodeInternal256 {
tag: NodeTag::Internal256,
lock_and_version: AtomicLockAndVersion::new(),
prefix: self.prefix.clone(),
prefix_len: self.prefix_len,
num_children: self.num_children as u16,
child_ptrs: [const { NodePtr::null() }; 256],
});
for i in 0..256 {
let idx = self.child_indexes[i];
if idx != INVALID_CHILD_INDEX {
node256.child_ptrs[i] = self.child_ptrs[idx as usize];
}
}
node256.as_ptr().into()
}
}
impl<V: Value> NodeInternal256<V> {
fn get_prefix(&self) -> &[u8] {
&self.prefix[0..self.prefix_len as usize]
}
fn truncate_prefix(&mut self, new_prefix_len: usize) {
assert!(new_prefix_len < self.prefix_len as usize);
let prefix = &mut self.prefix;
let offset = self.prefix_len as usize - new_prefix_len;
for i in 0..new_prefix_len {
prefix[i] = prefix[i + offset];
}
self.prefix_len = new_prefix_len as u8;
}
fn find_child(&self, key_byte: u8) -> Option<NodePtr<V>> {
let idx = key_byte as usize;
if !self.child_ptrs[idx].is_null() {
Some(self.child_ptrs[idx])
} else {
None
}
}
fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
let idx = key_byte as usize;
if !self.child_ptrs[idx].is_null() {
self.child_ptrs[idx] = replacement
} else {
panic!("could not re-find parent with key {}", key_byte);
}
}
fn is_full(&self) -> bool {
self.num_children == 256
}
fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
assert!(self.num_children < 256);
assert!(self.child_ptrs[key_byte as usize].is_null());
self.child_ptrs[key_byte as usize] = child;
self.num_children += 1;
}
}
impl<V: Value> NodeLeaf4<V> {
fn get_prefix(&self) -> &[u8] {
&self.prefix[0..self.prefix_len as usize]
}
fn truncate_prefix(&mut self, new_prefix_len: usize) {
assert!(new_prefix_len < self.prefix_len as usize);
let prefix = &mut self.prefix;
let offset = self.prefix_len as usize - new_prefix_len;
for i in 0..new_prefix_len {
prefix[i] = prefix[i + offset];
}
self.prefix_len = new_prefix_len as u8;
}
fn get_leaf_value<'a: 'b, 'b>(&'a self, key: u8) -> Option<&'b V> {
for i in 0..self.num_values {
if self.child_keys[i as usize] == key {
assert!(self.child_values[i as usize].is_some());
return self.child_values[i as usize].as_ref();
}
}
None
}
fn is_full(&self) -> bool {
self.num_values == 4
}
fn insert_value(&mut self, key_byte: u8, value: V) {
assert!(self.num_values < 4);
let idx = self.num_values as usize;
self.child_keys[idx] = key_byte;
self.child_values[idx] = Some(value);
self.num_values += 1;
}
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
let mut node16 = allocator.alloc(NodeLeaf16 {
tag: NodeTag::Leaf16,
lock_and_version: AtomicLockAndVersion::new(),
prefix: self.prefix.clone(),
prefix_len: self.prefix_len,
num_values: self.num_values,
child_keys: [0; 16],
child_values: [const { None }; 16],
});
for i in 0..self.num_values as usize {
node16.child_keys[i] = self.child_keys[i];
node16.child_values[i] = self.child_values[i].clone();
}
node16.as_ptr().into()
}
}
impl<V: Value> NodeLeaf16<V> {
fn get_prefix(&self) -> &[u8] {
&self.prefix[0..self.prefix_len as usize]
}
fn truncate_prefix(&mut self, new_prefix_len: usize) {
assert!(new_prefix_len < self.prefix_len as usize);
let prefix = &mut self.prefix;
let offset = self.prefix_len as usize - new_prefix_len;
for i in 0..new_prefix_len {
prefix[i] = prefix[i + offset];
}
self.prefix_len = new_prefix_len as u8;
}
fn get_leaf_value(&self, key: u8) -> Option<&V> {
for i in 0..self.num_values {
if self.child_keys[i as usize] == key {
assert!(self.child_values[i as usize].is_some());
return self.child_values[i as usize].as_ref();
}
}
None
}
fn is_full(&self) -> bool {
self.num_values == 16
}
fn insert_value(&mut self, key_byte: u8, value: V) {
assert!(self.num_values < 16);
let idx = self.num_values as usize;
self.child_keys[idx] = key_byte;
self.child_values[idx] = Some(value);
self.num_values += 1;
}
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
let mut node48 = allocator.alloc(NodeLeaf48 {
tag: NodeTag::Leaf48,
lock_and_version: AtomicLockAndVersion::new(),
prefix: self.prefix.clone(),
prefix_len: self.prefix_len,
num_values: self.num_values,
child_indexes: [INVALID_CHILD_INDEX; 256],
child_values: [const { None }; 48],
});
for i in 0..self.num_values {
let idx = self.child_keys[i as usize];
node48.child_indexes[idx as usize] = i;
node48.child_values[i as usize] = self.child_values[i as usize].clone();
}
node48.as_ptr().into()
}
}
impl<V: Value> NodeLeaf48<V> {
fn get_prefix(&self) -> &[u8] {
&self.prefix[0..self.prefix_len as usize]
}
fn truncate_prefix(&mut self, new_prefix_len: usize) {
assert!(new_prefix_len < self.prefix_len as usize);
let prefix = &mut self.prefix;
let offset = self.prefix_len as usize - new_prefix_len;
for i in 0..new_prefix_len {
prefix[i] = prefix[i + offset];
}
self.prefix_len = new_prefix_len as u8;
}
fn get_leaf_value(&self, key: u8) -> Option<&V> {
let idx = self.child_indexes[key as usize];
if idx != INVALID_CHILD_INDEX {
assert!(self.child_values[idx as usize].is_some());
self.child_values[idx as usize].as_ref()
} else {
None
}
}
fn is_full(&self) -> bool {
self.num_values == 48
}
fn insert_value(&mut self, key_byte: u8, value: V) {
assert!(self.num_values < 48);
assert!(self.child_indexes[key_byte as usize] == INVALID_CHILD_INDEX);
let idx = self.num_values;
self.child_indexes[key_byte as usize] = idx;
self.child_values[idx as usize] = Some(value);
self.num_values += 1;
}
fn grow(&self, allocator: &Allocator) -> NodePtr<V> {
let mut node256 = allocator.alloc(NodeLeaf256 {
tag: NodeTag::Leaf256,
lock_and_version: AtomicLockAndVersion::new(),
prefix: self.prefix.clone(),
prefix_len: self.prefix_len,
num_values: self.num_values as u16,
child_values: [const { None }; 256],
});
for i in 0..256 {
let idx = self.child_indexes[i];
if idx != INVALID_CHILD_INDEX {
node256.child_values[i] = self.child_values[idx as usize].clone();
}
}
node256.as_ptr().into()
}
}
impl<V: Value> NodeLeaf256<V> {
fn get_prefix(&self) -> &[u8] {
&self.prefix[0..self.prefix_len as usize]
}
fn truncate_prefix(&mut self, new_prefix_len: usize) {
assert!(new_prefix_len < self.prefix_len as usize);
let prefix = &mut self.prefix;
let offset = self.prefix_len as usize - new_prefix_len;
for i in 0..new_prefix_len {
prefix[i] = prefix[i + offset];
}
self.prefix_len = new_prefix_len as u8;
}
fn get_leaf_value(&self, key: u8) -> Option<&V> {
let idx = key as usize;
self.child_values[idx].as_ref()
}
fn is_full(&self) -> bool {
self.num_values == 256
}
fn insert_value(&mut self, key_byte: u8, value: V) {
assert!(self.num_values < 256);
assert!(self.child_values[key_byte as usize].is_none());
self.child_values[key_byte as usize] = Some(value);
self.num_values += 1;
}
}
impl<V: Value> NodeInternal256<V> {
pub(crate) fn new() -> NodeInternal256<V> {
NodeInternal256 {
tag: NodeTag::Internal256,
lock_and_version: AtomicLockAndVersion::new(),
prefix: [0; MAX_PREFIX_LEN],
prefix_len: 0,
num_children: 0,
child_ptrs: [const { NodePtr::null() }; 256],
}
}
}
impl<V: Value> From<*mut NodeInternal4<V>> for NodePtr<V> {
fn from(val: *mut NodeInternal4<V>) -> NodePtr<V> {
NodePtr {
ptr: val.cast(),
phantom_value: PhantomData,
}
}
}
impl<V: Value> From<*mut NodeInternal16<V>> for NodePtr<V> {
fn from(val: *mut NodeInternal16<V>) -> NodePtr<V> {
NodePtr {
ptr: val.cast(),
phantom_value: PhantomData,
}
}
}
impl<V: Value> From<*mut NodeInternal48<V>> for NodePtr<V> {
fn from(val: *mut NodeInternal48<V>) -> NodePtr<V> {
NodePtr {
ptr: val.cast(),
phantom_value: PhantomData,
}
}
}
impl<V: Value> From<*mut NodeInternal256<V>> for NodePtr<V> {
fn from(val: *mut NodeInternal256<V>) -> NodePtr<V> {
NodePtr {
ptr: val.cast(),
phantom_value: PhantomData,
}
}
}
impl<V: Value> From<*mut NodeLeaf4<V>> for NodePtr<V> {
fn from(val: *mut NodeLeaf4<V>) -> NodePtr<V> {
NodePtr {
ptr: val.cast(),
phantom_value: PhantomData,
}
}
}
impl<V: Value> From<*mut NodeLeaf16<V>> for NodePtr<V> {
fn from(val: *mut NodeLeaf16<V>) -> NodePtr<V> {
NodePtr {
ptr: val.cast(),
phantom_value: PhantomData,
}
}
}
impl<V: Value> From<*mut NodeLeaf48<V>> for NodePtr<V> {
fn from(val: *mut NodeLeaf48<V>) -> NodePtr<V> {
NodePtr {
ptr: val.cast(),
phantom_value: PhantomData,
}
}
}
impl<V: Value> From<*mut NodeLeaf256<V>> for NodePtr<V> {
fn from(val: *mut NodeLeaf256<V>) -> NodePtr<V> {
NodePtr {
ptr: val.cast(),
phantom_value: PhantomData,
}
}
}


@@ -0,0 +1,202 @@
use std::fmt::Debug;
use std::marker::PhantomData;
use super::lock_and_version::ResultOrRestart;
use super::node_ptr;
use super::node_ptr::ChildOrValuePtr;
use super::node_ptr::NodePtr;
use crate::EpochPin;
use crate::algorithm::lock_and_version::AtomicLockAndVersion;
use crate::{Allocator, Value};
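// Wrappers around NodePtr that track the lock state: a plain NodeRef is unlocked,
// read_lock_or_restart() yields a ReadLockedNodeRef (optimistic, version-checked), and
// upgrade_to_write_lock_or_restart() yields a WriteLockedNodeRef (exclusive). The 'e
// lifetime ties all of them to an EpochPin.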
pub struct NodeRef<'e, V> {
ptr: NodePtr<V>,
phantom: PhantomData<&'e EpochPin>,
}
impl<'e, V> Debug for NodeRef<'e, V> {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
write!(fmt, "{:?}", self.ptr)
}
}
impl<'e, V: Value> NodeRef<'e, V> {
pub(crate) fn from_root_ptr(root_ptr: NodePtr<V>) -> NodeRef<'e, V> {
NodeRef {
ptr: root_ptr,
phantom: PhantomData,
}
}
pub(crate) fn read_lock_or_restart(&self) -> ResultOrRestart<ReadLockedNodeRef<'e, V>> {
let version = self.lockword().read_lock_or_restart()?;
Ok(ReadLockedNodeRef {
ptr: self.ptr,
version,
phantom: self.phantom,
})
}
fn lockword(&self) -> &AtomicLockAndVersion {
self.ptr.lockword()
}
}
/// A reference to a node that has been optimistically read-locked. The functions re-check
/// the version after each read.
pub struct ReadLockedNodeRef<'e, V> {
ptr: NodePtr<V>,
version: u64,
phantom: PhantomData<&'e EpochPin>,
}
pub(crate) enum ChildOrValue<'e, V> {
Child(NodeRef<'e, V>),
Value(*const V),
}
impl<'e, V: Value> ReadLockedNodeRef<'e, V> {
pub(crate) fn is_full(&self) -> bool {
self.ptr.is_full()
}
pub(crate) fn get_prefix(&self) -> &[u8] {
self.ptr.get_prefix()
}
/// Note: because we're only holding a read lock, the prefix can change concurrently.
/// You must be prepared to restart if read_unlock() returns an error later.
///
/// Returns the length of the prefix, or None if it's not a match
pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
self.ptr.prefix_matches(key)
}
pub(crate) fn find_child_or_value_or_restart(
&self,
key_byte: u8,
) -> ResultOrRestart<Option<ChildOrValue<'e, V>>> {
let child_or_value = self.ptr.find_child_or_value(key_byte);
self.ptr.lockword().check_or_restart(self.version)?;
match child_or_value {
None => Ok(None),
Some(ChildOrValuePtr::Value(vptr)) => Ok(Some(ChildOrValue::Value(vptr))),
Some(ChildOrValuePtr::Child(child_ptr)) => Ok(Some(ChildOrValue::Child(NodeRef {
ptr: child_ptr,
phantom: self.phantom,
}))),
}
}
pub(crate) fn upgrade_to_write_lock_or_restart(
self,
) -> ResultOrRestart<WriteLockedNodeRef<'e, V>> {
self.ptr
.lockword()
.upgrade_to_write_lock_or_restart(self.version)?;
Ok(WriteLockedNodeRef {
ptr: self.ptr,
phantom: self.phantom,
})
}
pub(crate) fn read_unlock_or_restart(self) -> ResultOrRestart<()> {
self.ptr.lockword().check_or_restart(self.version)?;
Ok(())
}
}
/// A reference to a node that has been write-locked for exclusive modification. The lock is
/// released on drop unless the reference was explicitly unlocked first.
pub struct WriteLockedNodeRef<'e, V> {
ptr: NodePtr<V>,
phantom: PhantomData<&'e EpochPin>,
}
impl<'e, V: Value> WriteLockedNodeRef<'e, V> {
pub(crate) fn is_leaf(&self) -> bool {
self.ptr.is_leaf()
}
pub(crate) fn write_unlock(mut self) {
self.ptr.lockword().write_unlock();
self.ptr = NodePtr::null();
}
pub(crate) fn write_unlock_obsolete(mut self) {
self.ptr.lockword().write_unlock_obsolete();
self.ptr = NodePtr::null();
}
pub(crate) fn get_prefix(&self) -> &[u8] {
self.ptr.get_prefix()
}
pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
self.ptr.truncate_prefix(new_prefix_len)
}
pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
self.ptr.insert_child(key_byte, child)
}
pub(crate) fn insert_value(&mut self, key_byte: u8, value: V) {
self.ptr.insert_value(key_byte, value)
}
pub(crate) fn grow(&self, allocator: &Allocator) -> NewNodeRef<V> {
let new_node = self.ptr.grow(allocator);
NewNodeRef { ptr: new_node }
}
pub(crate) fn as_ptr(&self) -> NodePtr<V> {
self.ptr
}
pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
self.ptr.replace_child(key_byte, replacement);
}
}
impl<'e, V> Drop for WriteLockedNodeRef<'e, V> {
fn drop(&mut self) {
if !self.ptr.is_null() {
self.ptr.lockword().write_unlock();
}
}
}
pub(crate) struct NewNodeRef<V> {
ptr: NodePtr<V>,
}
impl<V: Value> NewNodeRef<V> {
pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
self.ptr.insert_child(key_byte, child)
}
pub(crate) fn insert_value(&mut self, key_byte: u8, value: V) {
self.ptr.insert_value(key_byte, value)
}
pub(crate) fn into_ptr(self) -> NodePtr<V> {
let ptr = self.ptr;
ptr
}
}
pub(crate) fn new_internal<V: Value>(prefix: &[u8], allocator: &Allocator) -> NewNodeRef<V> {
NewNodeRef {
ptr: node_ptr::new_internal(prefix, allocator),
}
}
pub(crate) fn new_leaf<V: Value>(prefix: &[u8], allocator: &Allocator) -> NewNodeRef<V> {
NewNodeRef {
ptr: node_ptr::new_leaf(prefix, allocator),
}
}


@@ -0,0 +1,107 @@
use std::marker::PhantomData;
use std::mem::MaybeUninit;
use std::ops::{Deref, DerefMut};
use std::ptr::NonNull;
use std::sync::atomic::{AtomicUsize, Ordering};
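// A very simple bump allocator over a caller-provided, fixed-size memory area (intended for
// a shared memory segment). Allocations are padded to MAXALIGN and are currently never freed.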
pub struct Allocator {
area: *mut MaybeUninit<u8>,
allocated: AtomicUsize,
size: usize,
}
// FIXME: I don't know if these are really safe...
unsafe impl Send for Allocator {}
unsafe impl Sync for Allocator {}
#[repr(transparent)]
pub struct AllocatedBox<'a, T> {
inner: NonNull<T>,
_phantom: PhantomData<&'a Allocator>,
}
// FIXME: I don't know if these are really safe...
unsafe impl<'a, T> Send for AllocatedBox<'a, T> {}
unsafe impl<'a, T> Sync for AllocatedBox<'a, T> {}
impl<T> Deref for AllocatedBox<'_, T> {
type Target = T;
fn deref(&self) -> &T {
unsafe { self.inner.as_ref() }
}
}
impl<T> DerefMut for AllocatedBox<'_, T> {
fn deref_mut(&mut self) -> &mut T {
unsafe { self.inner.as_mut() }
}
}
impl<T> AsMut<T> for AllocatedBox<'_, T> {
fn as_mut(&mut self) -> &mut T {
unsafe { self.inner.as_mut() }
}
}
impl<T> AllocatedBox<'_, T> {
pub fn as_ptr(&self) -> *mut T {
self.inner.as_ptr()
}
}
const MAXALIGN: usize = std::mem::align_of::<usize>();
impl Allocator {
pub fn new_uninit(area: &'static mut [MaybeUninit<u8>]) -> Allocator {
let ptr = area.as_mut_ptr();
let size = area.len();
Self::new_from_ptr(ptr, size)
}
pub fn new(area: &'static mut [u8]) -> Allocator {
let ptr: *mut MaybeUninit<u8> = area.as_mut_ptr().cast();
let size = area.len();
Self::new_from_ptr(ptr, size)
}
pub fn new_from_ptr(ptr: *mut MaybeUninit<u8>, size: usize) -> Allocator {
let padding = ptr.align_offset(MAXALIGN);
Allocator {
area: ptr,
allocated: AtomicUsize::new(padding),
size,
}
}
pub fn alloc<'a, T: Sized>(&'a self, value: T) -> AllocatedBox<'a, T> {
let sz = std::mem::size_of::<T>();
// pad all allocations to MAXALIGN boundaries
assert!(std::mem::align_of::<T>() <= MAXALIGN);
let sz = sz.next_multiple_of(MAXALIGN);
let offset = self.allocated.fetch_add(sz, Ordering::Relaxed);
if offset + sz > self.size {
panic!("out of memory");
}
let inner = unsafe {
let inner = self.area.offset(offset as isize).cast::<T>();
// The memory is uninitialized, so use ptr::write instead of a plain assignment,
// which would drop the (garbage) previous value.
inner.write(value);
NonNull::new_unchecked(inner)
};
AllocatedBox {
inner,
_phantom: PhantomData,
}
}
pub fn _dealloc_node<T>(&self, _node: AllocatedBox<T>) {
// doesn't free it immediately.
}
}
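// Usage sketch (illustrative; in the communicator the backing buffer is a shared
// memory segment mapped at postmaster startup, not a leaked Box):
//
//     let area = Box::leak(Box::new_uninit_slice(1024 * 1024));
//     let allocator = Allocator::new_uninit(area);
//     let value: AllocatedBox<u64> = allocator.alloc(42u64);
//     assert_eq!(*value, 42);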

23
libs/neonart/src/epoch.rs Normal file
View File

@@ -0,0 +1,23 @@
//! This is similar to the crossbeam_epoch crate, but works in shared memory.
//!
//! FIXME: not implemented yet. (We haven't implemented removing any nodes from the ART
//! tree, which is why we can get away without it for now.)
pub(crate) struct EpochPin {}
pub(crate) fn pin_epoch() -> EpochPin {
EpochPin {}
}
/*
struct CollectorGlobal {
epoch: AtomicU64,
participants: CachePadded<AtomicU64>, // make it an array
}
struct CollectorQueue {
}
*/

301
libs/neonart/src/lib.rs Normal file
View File

@@ -0,0 +1,301 @@
//! Adaptive Radix Tree (ART) implementation, with Optimistic Lock Coupling.
//!
//! The data structure is described in these two papers:
//!
//! [1] Leis, V. & Kemper, Alfons & Neumann, Thomas. (2013).
//! The adaptive radix tree: ARTful indexing for main-memory databases.
//! Proceedings - International Conference on Data Engineering. 38-49. 10.1109/ICDE.2013.6544812.
//! https://db.in.tum.de/~leis/papers/ART.pdf
//!
//! [2] Leis, Viktor & Scheibner, Florian & Kemper, Alfons & Neumann, Thomas. (2016).
//! The ART of practical synchronization.
//! 1-8. 10.1145/2933349.2933352.
//! https://db.in.tum.de/~leis/papers/artsync.pdf
//!
//! [1] describes the base data structure, and [2] describes the Optimistic Lock Coupling that we
//! use.
//!
//! The papers mention a few different variants. We have made the following choices in this
//! implementation:
//!
//! - All keys have the same length
//!
//! - Multi-value leaves. The values are stored directly in one of the four different leaf node
//! types.
//!
//! - For collapsing inner nodes, we use the Pessimistic approach, where each inner node stores a
//! variable length "prefix", which stores the keys of all the one-way nodes which have been
//! removed. However, similar to the "hybrid" approach described in the paper, each node only has
//! space for a constant-size prefix of 8 bytes. If a node would have a longer prefix, then we
//! create create one-way nodes to store them. (There was no particular reason for this choice,
//! the "hybrid" approach described in the paper might be better.)
//!
//! - For concurrency, we use Optimistic Lock Coupling. The paper [2] also describes another method,
//! ROWEX, which generally performs better when there is contention, but that is not important
//! for us, and Optimistic Lock Coupling is simpler to implement.
//!
//! ## Requirements
//!
//! This data structure is currently used for the integrated LFC, relsize and last-written LSN cache
//! in the compute communicator, part of the 'neon' Postgres extension. We have some unique
//! requirements, which is why we had to write our own. Namely:
//!
//! - The data structure has to live in fixed-sized shared memory segment. That rules out any
//! built-in Rust collections and most crates. (Except possibly with the 'allocator_api' rust
//! feature, which is still nightly-only and experimental as of this writing).
//!
//! - The data structure is accessed from multiple processes. Only one process updates the data
//! structure, but other processes perform reads. That rules out using built-in Rust locking
//! primitives like Mutex and RwLock, and most crates too.
//!
//! - Within the one process with write-access, multiple threads can perform updates concurrently.
//! That rules out using PostgreSQL LWLocks for the locking.
//!
//! The implementation is generic, and doesn't depend on any PostgreSQL specifics, but it has been
//! written with that usage and the above constraints in mind. Some noteworthy assumptions:
//!
//! - Contention is assumed to be rare. In the integrated cache in PostgreSQL, there's higher level
//! locking in the PostgreSQL buffer manager, which ensures that two backends should not try to
//! read / write the same page at the same time. (Prefetching can conflict with actual reads,
//! however.)
//!
//! - The keys in the integrated cache are 17 bytes long.
//!
//! ## Usage
//!
//! Because this is designed to be used as a Postgres shared memory data structure, initialization
//! happens in three stages:
//!
//! 0. A fixed area of shared memory is allocated at postmaster startup.
//!
//! 1. TreeInitStruct::new() is called to initialize it, still in Postmaster process, before any
//! other process or thread is running. It returns a TreeInitStruct, which is inherited by all
//! the processes through fork().
//!
//! 2. One process may have write-access to the struct, by calling
//! [TreeInitStruct::attach_writer]. (That process is the communicator process.)
//!
//! 3. Other processes get read-access to the struct, by calling [TreeInitStruct::attach_reader]
//!
//! "Write access" means that you can insert / update / delete values in the tree.
//!
//! NOTE: The Values stored in the tree are sometimes moved, when a leaf node fills up and a new
//! larger node needs to be allocated. The versioning and epoch-based allocator ensure that the data
//! structure stays consistent, but if the Value has interior mutability, like atomic fields,
//! updates to such fields might be lost if the leaf node is concurrently moved! If that becomes a
//! problem, the version check could be passed up to the caller, so that the caller could detect the
//! lost updates and retry the operation.
//!
//! ## Implementation
//!
//! node_ptr.rs: Provides low-level implementations of the four different node types (eight actually,
//! since there is an Internal and Leaf variant of each)
//!
//! lock_and_version.rs: Provides an abstraction for the combined lock and version counter on each
//! node.
//!
//! node_ref.rs: The code in node_ptr.rs deals with raw pointers. node_ref.rs provides more type-safe
//! abstractions on top.
//!
//! algorithm.rs: Contains the functions to implement lookups and updates in the tree
//!
//! allocator.rs: Provides a facility to allocate memory for the tree nodes. (We must provide our
//! own abstraction for that because we need the data structure to live in a pre-allocated shared
//! memory segment).
//!
//! epoch.rs: The data structure requires that when a node is removed from the tree, it is not
//! immediately deallocated, but stays around for as long as concurrent readers might still have
//! pointers to it. This is enforced by an epoch system. This is similar to
//! e.g. crossbeam_epoch, but we couldn't use that either because it has to work across processes
//! communicating over the shared memory segment.
//!
//! ## See also
//!
//! There are some existing Rust ART implementations out there, but none of them filled all
//! the requirements:
//!
//! - https://github.com/XiangpengHao/congee
//! - https://github.com/declanvk/blart
//!
//! ## TODO
//!
//! - Removing values has not been implemented
mod algorithm;
mod allocator;
mod epoch;
use algorithm::RootPtr;
use allocator::AllocatedBox;
use std::fmt::Debug;
use std::marker::PhantomData;
use std::sync::atomic::{AtomicBool, Ordering};
use crate::epoch::EpochPin;
#[cfg(test)]
mod tests;
pub use allocator::Allocator;
/// Fixed-length key type.
///
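/// For example, the integrated cache uses 17-byte keys; a sketch of such an
/// implementation (the wrapper type and field layout are illustrative):
///
/// ```ignore
/// #[derive(Clone, Debug)]
/// struct CacheKey([u8; 17]);
///
/// impl Key for CacheKey {
///     const KEY_LEN: usize = 17;
///     fn as_bytes(&self) -> &[u8] {
///         &self.0
///     }
/// }
/// ```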
pub trait Key: Clone + Debug {
const KEY_LEN: usize;
fn as_bytes(&self) -> &[u8];
}
/// Values stored in the tree
///
/// Values need to be Cloneable, because when a node "grows", the value is copied to a new node and
/// the old one sticks around until all readers that might see the old value are gone.
pub trait Value: Clone {}
struct Tree<K: Key, V: Value> {
root: RootPtr<V>,
writer_attached: AtomicBool,
phantom_key: PhantomData<K>,
}
/// Struct created at postmaster startup
pub struct TreeInitStruct<'t, K: Key, V: Value> {
tree: AllocatedBox<'t, Tree<K, V>>,
allocator: &'t Allocator,
}
/// The worker (communicator) process has a reference to this. Write operations are only
/// safe from that process.
pub struct TreeWriteAccess<'t, K: Key, V: Value>
where
K: Key,
V: Value,
{
tree: AllocatedBox<'t, Tree<K, V>>,
allocator: &'t Allocator,
}
/// The backends have a reference to this. It cannot be used to modify the tree.
pub struct TreeReadAccess<'t, K: Key, V: Value>
where
K: Key,
V: Value,
{
tree: AllocatedBox<'t, Tree<K, V>>,
}
impl<'a, 't: 'a, K: Key, V: Value> TreeInitStruct<'t, K, V> {
pub fn new(allocator: &'t Allocator) -> TreeInitStruct<'t, K, V> {
let tree = allocator.alloc(Tree {
root: algorithm::new_root(allocator),
writer_attached: AtomicBool::new(false),
phantom_key: PhantomData,
});
TreeInitStruct { tree, allocator }
}
pub fn attach_writer(self) -> TreeWriteAccess<'t, K, V> {
let previously_attached = self.tree.writer_attached.swap(true, Ordering::Relaxed);
if previously_attached {
panic!("writer already attached");
}
TreeWriteAccess {
tree: self.tree,
allocator: self.allocator,
}
}
pub fn attach_reader(self) -> TreeReadAccess<'t, K, V> {
TreeReadAccess { tree: self.tree }
}
}
impl<'t, K: Key + Clone, V: Value> TreeWriteAccess<'t, K, V> {
pub fn start_write(&'t self) -> TreeWriteGuard<'t, K, V> {
// TODO: grab epoch guard
TreeWriteGuard {
allocator: self.allocator,
tree: &self.tree,
epoch_pin: epoch::pin_epoch(),
}
}
pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
TreeReadGuard {
tree: &self.tree,
epoch_pin: epoch::pin_epoch(),
}
}
}
impl<'t, K: Key + Clone, V: Value> TreeReadAccess<'t, K, V> {
pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
TreeReadGuard {
tree: &self.tree,
epoch_pin: epoch::pin_epoch(),
}
}
}
pub struct TreeReadGuard<'t, K, V>
where
K: Key,
V: Value,
{
tree: &'t AllocatedBox<'t, Tree<K, V>>,
epoch_pin: EpochPin,
}
impl<'t, K: Key, V: Value> TreeReadGuard<'t, K, V> {
pub fn get(&self, key: &K) -> Option<V> {
algorithm::search(key, self.tree.root, &self.epoch_pin)
}
}
pub struct TreeWriteGuard<'t, K, V>
where
K: Key,
V: Value,
{
tree: &'t AllocatedBox<'t, Tree<K, V>>,
allocator: &'t Allocator,
epoch_pin: EpochPin,
}
impl<'t, K: Key, V: Value> TreeWriteGuard<'t, K, V> {
pub fn insert(&mut self, key: &K, value: V) {
self.update_with_fn(key, |_| Some(value))
}
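/// Insert or update the value for `key` using a caller-supplied function.
///
/// `value_fn` is called with the existing value for the key, if any, and returns the
/// value to store. (What a `None` return means is tied to value removal, which is not
/// implemented yet; see the crate-level TODO.)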
pub fn update_with_fn<F>(&mut self, key: &K, value_fn: F)
where
F: FnOnce(Option<&V>) -> Option<V>,
{
algorithm::update_fn(
key,
value_fn,
self.tree.root,
self.allocator,
&self.epoch_pin,
)
}
pub fn get(&mut self, key: &K) -> Option<V> {
algorithm::search(key, self.tree.root, &self.epoch_pin)
}
}
impl<'t, K: Key, V: Value + Debug> TreeWriteGuard<'t, K, V> {
pub fn dump(&mut self) {
algorithm::dump_tree(self.tree.root, &self.epoch_pin)
}
}

90
libs/neonart/src/tests.rs Normal file
View File

@@ -0,0 +1,90 @@
use std::collections::HashSet;
use crate::Allocator;
use crate::TreeInitStruct;
use crate::{Key, Value};
use rand::seq::SliceRandom;
use rand::thread_rng;
const TEST_KEY_LEN: usize = 16;
#[derive(Clone, Copy, Debug)]
struct TestKey([u8; TEST_KEY_LEN]);
impl Key for TestKey {
const KEY_LEN: usize = TEST_KEY_LEN;
fn as_bytes(&self) -> &[u8] {
&self.0
}
}
impl From<u128> for TestKey {
fn from(val: u128) -> TestKey {
TestKey(val.to_be_bytes())
}
}
impl Value for usize {}
fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
const MEM_SIZE: usize = 10000000;
let area = Box::leak(Box::new_uninit_slice(MEM_SIZE));
let allocator = Box::leak(Box::new(Allocator::new_uninit(area)));
let init_struct = TreeInitStruct::<TestKey, usize>::new(allocator);
let tree_writer = init_struct.attach_writer();
for (idx, k) in keys.iter().enumerate() {
let mut w = tree_writer.start_write();
w.insert(&(*k).into(), idx);
eprintln!("INSERTED {:?}", Into::<TestKey>::into(*k));
}
//tree_writer.start_read().dump();
for (idx, k) in keys.iter().enumerate() {
let r = tree_writer.start_read();
let value = r.get(&(*k).into());
assert_eq!(value, Some(idx));
}
}
#[test]
fn dense() {
// This exercises splitting a node with prefix
let keys: &[u128] = &[0, 1, 2, 3, 256];
test_inserts(keys);
// Dense keys
let mut keys: Vec<u128> = (0..10000).collect();
test_inserts(&keys);
// Do the same in random orders
for _ in 1..10 {
keys.shuffle(&mut thread_rng());
test_inserts(&keys);
}
}
#[test]
fn sparse() {
// sparse keys
let mut keys: Vec<TestKey> = Vec::new();
let mut used_keys = HashSet::new();
for _ in 0..10000 {
loop {
let key = rand::random::<u128>();
if used_keys.get(&key).is_some() {
continue;
}
used_keys.insert(key);
keys.push(key.into());
break;
}
}
test_inserts(&keys);
}

View File

@@ -42,12 +42,14 @@ nix.workspace = true
num_cpus.workspace = true
num-traits.workspace = true
once_cell.workspace = true
peekable.workspace = true
pin-project-lite.workspace = true
postgres_backend.workspace = true
postgres-protocol.workspace = true
postgres-types.workspace = true
postgres_initdb.workspace = true
pprof.workspace = true
prost.workspace = true
rand.workspace = true
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
regex.workspace = true
@@ -60,6 +62,7 @@ serde_path_to_error.workspace = true
serde_with.workspace = true
sysinfo.workspace = true
tokio-tar.workspace = true
tonic.workspace = true
thiserror.workspace = true
tikv-jemallocator.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
@@ -76,6 +79,7 @@ url.workspace = true
walkdir.workspace = true
metrics.workspace = true
pageserver_api.workspace = true
pageserver_data_api.workspace = true
pageserver_client.workspace = true # for ResponseErrorMessageExt, TODO: refactor that
pageserver_compaction.workspace = true
pem.workspace = true

View File

@@ -0,0 +1,13 @@
[package]
name = "pageserver_client_grpc"
version = "0.1.0"
edition = "2024"
[dependencies]
bytes.workspace = true
http.workspace = true
thiserror.workspace = true
tonic.workspace = true
tracing.workspace = true
pageserver_data_api.workspace = true

View File

@@ -0,0 +1,221 @@
//! Pageserver Data API client
//!
//! - Manages connections to the pageserver
//! - Sends requests to the correct shards
//!
use std::collections::HashMap;
use std::sync::RwLock;
use bytes::Bytes;
use http;
use thiserror::Error;
use tonic;
use tonic::metadata::AsciiMetadataValue;
use tonic::transport::Channel;
use pageserver_data_api::model::*;
use pageserver_data_api::proto;
type Shardno = u16;
use pageserver_data_api::client::PageServiceClient;
type MyPageServiceClient = pageserver_data_api::client::PageServiceClient<
tonic::service::interceptor::InterceptedService<tonic::transport::Channel, AuthInterceptor>,
>;
#[derive(Error, Debug)]
pub enum PageserverClientError {
#[error("could not connect to service: {0}")]
ConnectError(#[from] tonic::transport::Error),
#[error("could not perform request: {0}`")]
RequestError(#[from] tonic::Status),
#[error("could not perform request: {0}`")]
InvalidUri(#[from] http::uri::InvalidUri),
}
pub struct PageserverClient {
_tenant_id: String,
_timeline_id: String,
_auth_token: Option<String>,
shard_map: HashMap<Shardno, String>,
channels: RwLock<HashMap<Shardno, Channel>>,
auth_interceptor: AuthInterceptor,
}
impl PageserverClient {
/// TODO: this doesn't currently react to changes in the shard map.
pub fn new(
tenant_id: &str,
timeline_id: &str,
auth_token: &Option<String>,
shard_map: HashMap<Shardno, String>,
) -> Self {
Self {
_tenant_id: tenant_id.to_string(),
_timeline_id: timeline_id.to_string(),
_auth_token: auth_token.clone(),
shard_map,
channels: RwLock::new(HashMap::new()),
auth_interceptor: AuthInterceptor::new(tenant_id, timeline_id, auth_token.as_ref()),
}
}
pub async fn process_rel_exists_request(
&self,
request: &RelExistsRequest,
) -> Result<bool, PageserverClientError> {
// Current sharding model assumes that all metadata is present only at shard 0.
let shard_no = 0;
let mut client = self.get_client(shard_no).await?;
let request = proto::RelExistsRequest::from(request);
let response = client.rel_exists(tonic::Request::new(request)).await?;
Ok(response.get_ref().exists)
}
pub async fn process_rel_size_request(
&self,
request: &RelSizeRequest,
) -> Result<u32, PageserverClientError> {
// Current sharding model assumes that all metadata is present only at shard 0.
let shard_no = 0;
let mut client = self.get_client(shard_no).await?;
let request = proto::RelSizeRequest::from(request);
let response = client.rel_size(tonic::Request::new(request)).await?;
Ok(response.get_ref().num_blocks)
}
pub async fn get_page(&self, request: &GetPageRequest) -> Result<Bytes, PageserverClientError> {
// FIXME: calculate the shard number correctly
let shard_no = 0;
let mut client = self.get_client(shard_no).await?;
let request = proto::GetPageRequest::from(request);
let response = client.get_page(tonic::Request::new(request)).await?;
Ok(response.into_inner().page_image)
}
/// Process a request to get the size of a database.
pub async fn process_dbsize_request(
&self,
request: &DbSizeRequest,
) -> Result<u64, PageserverClientError> {
// Current sharding model assumes that all metadata is present only at shard 0.
let shard_no = 0;
let mut client = self.get_client(shard_no).await?;
let request = proto::DbSizeRequest::from(request);
let response = client.db_size(tonic::Request::new(request)).await?;
Ok(response.get_ref().num_bytes)
}
/// Get a base backup for the timeline, as a stream of tarball chunks.
pub async fn get_base_backup(
&self,
request: &GetBaseBackupRequest,
gzip: bool,
) -> std::result::Result<
tonic::Response<tonic::codec::Streaming<proto::GetBaseBackupResponseChunk>>,
PageserverClientError,
> {
// Current sharding model assumes that all metadata is present only at shard 0.
let shard_no = 0;
let mut client = self.get_client(shard_no).await?;
if gzip {
client = client.accept_compressed(tonic::codec::CompressionEncoding::Gzip);
}
let request = proto::GetBaseBackupRequest::from(request);
let response = client.get_base_backup(tonic::Request::new(request)).await?;
Ok(response)
}
/// Get a client for the given shard.
///
/// This implements very basic caching. If we already have a channel for the given shard,
/// reuse it. If not, establish a new connection and put the channel in the cache.
async fn get_client(
&self,
shard_no: u16,
) -> Result<MyPageServiceClient, PageserverClientError> {
let reused_channel: Option<Channel> = {
let channels = self.channels.read().unwrap();
channels.get(&shard_no).cloned()
};
let channel = if let Some(reused_channel) = reused_channel {
reused_channel
} else {
let endpoint: tonic::transport::Endpoint = self
.shard_map
.get(&shard_no)
.unwrap_or_else(|| panic!("no url for shard {shard_no}"))
.parse()?;
let channel = endpoint.connect().await?;
// Insert it into the cache so that it can be reused on subsequent calls. It's possible
// that another thread did the same concurrently, in which case we overwrite the
// channel it inserted.
{
let mut channels = self.channels.write().unwrap();
channels.insert(shard_no, channel.clone());
}
channel
};
let client = PageServiceClient::with_interceptor(channel, self.auth_interceptor.clone());
Ok(client)
}
}
/// Inject tenant_id, timeline_id and authentication token to all pageserver requests.
#[derive(Clone)]
struct AuthInterceptor {
tenant_id: AsciiMetadataValue,
timeline_id: AsciiMetadataValue,
auth_token: Option<AsciiMetadataValue>,
}
impl AuthInterceptor {
fn new(tenant_id: &str, timeline_id: &str, auth_token: Option<&String>) -> Self {
Self {
tenant_id: tenant_id.parse().expect("could not parse tenant id"),
timeline_id: timeline_id.parse().expect("could not parse timeline id"),
auth_token: auth_token.map(|x| x.parse().expect("could not parse auth token")),
}
}
}
impl tonic::service::Interceptor for AuthInterceptor {
fn call(&mut self, mut req: tonic::Request<()>) -> Result<tonic::Request<()>, tonic::Status> {
req.metadata_mut()
.insert("neon-tenant-id", self.tenant_id.clone());
req.metadata_mut()
.insert("neon-timeline-id", self.timeline_id.clone());
if let Some(auth_token) = &self.auth_token {
req.metadata_mut()
.insert("neon-auth-token", auth_token.clone());
}
Ok(req)
}
}

View File

@@ -0,0 +1,18 @@
[package]
name = "pageserver_data_api"
version = "0.1.0"
edition = "2024"
[dependencies]
# For Lsn.
#
# TODO: move Lsn to a separate crate? This draws in a lot of extra dependencies
utils.workspace = true
prost.workspace = true
thiserror.workspace = true
tonic.workspace = true
[build-dependencies]
tonic-build.workspace = true

View File

@@ -0,0 +1,8 @@
fn main() -> Result<(), Box<dyn std::error::Error>> {
// Generate rust code from .proto protobuf.
tonic_build::configure()
.bytes(&["."])
.compile_protos(&["proto/page_service.proto"], &["proto"])
.unwrap_or_else(|e| panic!("failed to compile protos {:?}", e));
Ok(())
}

View File

@@ -0,0 +1,84 @@
// Page service presented by pageservers, for computes
//
// Each request must come with the following metadata:
// - neon-tenant-id
// - neon-timeline-id
// - neon-auth-token (if auth is enabled)
//
// TODO: what else? Priority? OpenTelemetry tracing?
//
syntax = "proto3";
package page_service;
service PageService {
rpc RelExists(RelExistsRequest) returns (RelExistsResponse);
// Returns size of a relation, as # of blocks
rpc RelSize (RelSizeRequest) returns (RelSizeResponse);
rpc GetPage (GetPageRequest) returns (GetPageResponse);
// Returns total size of a database, as # of bytes
rpc DbSize (DbSizeRequest) returns (DbSizeResponse);
rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk);
}
message RequestCommon {
uint64 request_lsn = 1;
uint64 not_modified_since_lsn = 2;
}
message RelTag {
uint32 spc_oid = 1;
uint32 db_oid = 2;
uint32 rel_number = 3;
uint32 fork_number = 4;
}
message RelExistsRequest {
RequestCommon common = 1;
RelTag rel = 2;
}
message RelExistsResponse {
bool exists = 1;
}
message RelSizeRequest {
RequestCommon common = 1;
RelTag rel = 2;
}
message RelSizeResponse {
uint32 num_blocks = 1;
}
message GetPageRequest {
RequestCommon common = 1;
RelTag rel = 2;
uint32 block_number = 3;
}
message GetPageResponse {
bytes page_image = 1;
}
message DbSizeRequest {
RequestCommon common = 1;
uint32 db_oid = 2;
}
message DbSizeResponse {
uint64 num_bytes = 1;
}
message GetBaseBackupRequest {
RequestCommon common = 1;
bool replica = 2;
}
message GetBaseBackupResponseChunk {
bytes chunk = 1;
}

View File

@@ -0,0 +1,17 @@
//! This crate has two modules related to the Pageserver Data API:
//!
//! proto: code auto-generated from the protobuf definition
//! model: slightly more ergonomic structs representing the same API
//!
//! See the protobuf spec under the proto/ subdirectory.
//!
//! This crate is used by both the client and the server. Try to keep it slim.
//!
pub mod model;
// Code generated by protobuf.
pub mod proto {
tonic::include_proto!("page_service");
}
pub use proto::page_service_client as client;

View File

@@ -0,0 +1,239 @@
//! Structs representing the API
//!
//! These mirror the pageserver APIs and the structs automatically generated
//! from the protobuf specification. The differences are:
//!
//! - Types that are in fact required by the API are not Options. The protobuf "required"
//! attribute is deprecated and 'prost' marks a lot of members as optional because of that.
//! (See https://github.com/tokio-rs/prost/issues/800 for a gripe on this)
//!
//! - Use more precise datatypes, e.g. Lsn and uints shorter than 32 bits.
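//!
//! The conversion pattern, sketched with the request types defined below:
//!
//! ```ignore
//! let req = RelSizeRequest { common, rel };       // ergonomic model type
//! let wire = proto::RelSizeRequest::from(&req);   // model -> generated proto type
//! let back = RelSizeRequest::try_from(&wire)?;    // proto -> model, validating the Options
//! ```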
use utils::lsn::Lsn;
use crate::proto;
#[derive(Clone, Debug)]
pub struct RequestCommon {
pub request_lsn: Lsn,
pub not_modified_since_lsn: Lsn,
}
#[derive(Clone, Debug, Eq, PartialEq, Hash, PartialOrd, Ord)]
pub struct RelTag {
pub spc_oid: u32,
pub db_oid: u32,
pub rel_number: u32,
pub fork_number: u8,
}
#[derive(Clone, Debug)]
pub struct RelExistsRequest {
pub common: RequestCommon,
pub rel: RelTag,
}
#[derive(Clone, Debug)]
pub struct RelSizeRequest {
pub common: RequestCommon,
pub rel: RelTag,
}
#[derive(Clone, Debug)]
pub struct RelSizeResponse {
pub num_blocks: u32,
}
#[derive(Clone, Debug)]
pub struct GetPageRequest {
pub common: RequestCommon,
pub rel: RelTag,
pub block_number: u32,
}
#[derive(Clone, Debug)]
pub struct GetPageResponse {
pub page_image: std::vec::Vec<u8>,
}
#[derive(Clone, Debug)]
pub struct DbSizeRequest {
pub common: RequestCommon,
pub db_oid: u32,
}
#[derive(Clone, Debug)]
pub struct DbSizeResponse {
pub num_bytes: u64,
}
#[derive(Clone, Debug)]
pub struct GetBaseBackupRequest {
pub common: RequestCommon,
pub replica: bool,
}
//--- Conversions to/from the generated proto types
use thiserror::Error;
#[derive(Error, Debug)]
pub enum ProtocolError {
#[error("the value for field `{0}` is invalid")]
InvalidValue(&'static str),
#[error("the required field `{0}` is missing ")]
Missing(&'static str),
}
impl From<ProtocolError> for tonic::Status {
fn from(e: ProtocolError) -> Self {
match e {
ProtocolError::InvalidValue(_field) => tonic::Status::invalid_argument(e.to_string()),
ProtocolError::Missing(_field) => tonic::Status::invalid_argument(e.to_string()),
}
}
}
impl From<&RelTag> for proto::RelTag {
fn from(value: &RelTag) -> proto::RelTag {
proto::RelTag {
spc_oid: value.spc_oid,
db_oid: value.db_oid,
rel_number: value.rel_number,
fork_number: value.fork_number as u32,
}
}
}
impl TryFrom<&proto::RelTag> for RelTag {
type Error = ProtocolError;
fn try_from(value: &proto::RelTag) -> Result<RelTag, ProtocolError> {
Ok(RelTag {
spc_oid: value.spc_oid,
db_oid: value.db_oid,
rel_number: value.rel_number,
fork_number: value
.fork_number
.try_into()
.or(Err(ProtocolError::InvalidValue("fork_number")))?,
})
}
}
impl From<&RequestCommon> for proto::RequestCommon {
fn from(value: &RequestCommon) -> proto::RequestCommon {
proto::RequestCommon {
request_lsn: value.request_lsn.into(),
not_modified_since_lsn: value.not_modified_since_lsn.into(),
}
}
}
impl From<&proto::RequestCommon> for RequestCommon {
fn from(value: &proto::RequestCommon) -> RequestCommon {
RequestCommon {
request_lsn: value.request_lsn.into(),
not_modified_since_lsn: value.not_modified_since_lsn.into(),
}
}
}
impl From<&RelExistsRequest> for proto::RelExistsRequest {
fn from(value: &RelExistsRequest) -> proto::RelExistsRequest {
proto::RelExistsRequest {
common: Some((&value.common).into()),
rel: Some((&value.rel).into()),
}
}
}
impl TryFrom<&proto::RelExistsRequest> for RelExistsRequest {
type Error = ProtocolError;
fn try_from(value: &proto::RelExistsRequest) -> Result<RelExistsRequest, ProtocolError> {
Ok(RelExistsRequest {
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
rel: (&value.rel.ok_or(ProtocolError::Missing("rel"))?).try_into()?,
})
}
}
impl From<&RelSizeRequest> for proto::RelSizeRequest {
fn from(value: &RelSizeRequest) -> proto::RelSizeRequest {
proto::RelSizeRequest {
common: Some((&value.common).into()),
rel: Some((&value.rel).into()),
}
}
}
impl TryFrom<&proto::RelSizeRequest> for RelSizeRequest {
type Error = ProtocolError;
fn try_from(value: &proto::RelSizeRequest) -> Result<RelSizeRequest, ProtocolError> {
Ok(RelSizeRequest {
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
rel: (&value.rel.ok_or(ProtocolError::Missing("rel"))?).try_into()?,
})
}
}
impl From<&GetPageRequest> for proto::GetPageRequest {
fn from(value: &GetPageRequest) -> proto::GetPageRequest {
proto::GetPageRequest {
common: Some((&value.common).into()),
rel: Some((&value.rel).into()),
block_number: value.block_number,
}
}
}
impl TryFrom<&proto::GetPageRequest> for GetPageRequest {
type Error = ProtocolError;
fn try_from(value: &proto::GetPageRequest) -> Result<GetPageRequest, ProtocolError> {
Ok(GetPageRequest {
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
rel: (&value.rel.ok_or(ProtocolError::Missing("rel"))?).try_into()?,
block_number: value.block_number,
})
}
}
impl From<&DbSizeRequest> for proto::DbSizeRequest {
fn from(value: &DbSizeRequest) -> proto::DbSizeRequest {
proto::DbSizeRequest {
common: Some((&value.common).into()),
db_oid: value.db_oid,
}
}
}
impl TryFrom<&proto::DbSizeRequest> for DbSizeRequest {
type Error = ProtocolError;
fn try_from(value: &proto::DbSizeRequest) -> Result<DbSizeRequest, ProtocolError> {
Ok(DbSizeRequest {
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
db_oid: value.db_oid,
})
}
}
impl From<&GetBaseBackupRequest> for proto::GetBaseBackupRequest {
fn from(value: &GetBaseBackupRequest) -> proto::GetBaseBackupRequest {
proto::GetBaseBackupRequest {
common: Some((&value.common).into()),
replica: value.replica,
}
}
}
impl TryFrom<&proto::GetBaseBackupRequest> for GetBaseBackupRequest {
type Error = ProtocolError;
fn try_from(
value: &proto::GetBaseBackupRequest,
) -> Result<GetBaseBackupRequest, ProtocolError> {
Ok(GetBaseBackupRequest {
common: (&value.common.ok_or(ProtocolError::Missing("common"))?).into(),
replica: value.replica,
})
}
}

View File

@@ -23,6 +23,8 @@ tokio.workspace = true
tokio-util.workspace = true
pageserver_client.workspace = true
pageserver_client_grpc.workspace = true
pageserver_data_api.workspace = true
pageserver_api.workspace = true
utils = { path = "../../libs/utils/" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -9,6 +9,9 @@ use anyhow::Context;
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
use pageserver_client::page_service::BasebackupRequest;
use pageserver_client_grpc;
use pageserver_data_api::model::{GetBaseBackupRequest, RequestCommon};
use rand::prelude::*;
use tokio::sync::Barrier;
use tokio::task::JoinSet;
@@ -22,6 +25,8 @@ use crate::util::{request_stats, tokio_thread_local_stats};
/// basebackup@LatestLSN
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "false")]
grpc: bool,
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
@@ -52,7 +57,7 @@ impl LiveStats {
struct Target {
timeline: TenantTimelineId,
lsn_range: Option<Range<Lsn>>,
lsn_range: Range<Lsn>,
}
#[derive(serde::Serialize)]
@@ -105,7 +110,7 @@ async fn main_impl(
anyhow::Ok(Target {
timeline,
// TODO: support lsn_range != latest LSN
lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)),
lsn_range: info.last_record_lsn..(info.last_record_lsn + 1),
})
}
});
@@ -149,14 +154,27 @@ async fn main_impl(
for tl in &timelines {
let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are
work_senders.insert(tl, sender);
tasks.push(tokio::spawn(client(
args,
*tl,
Arc::clone(&start_work_barrier),
receiver,
Arc::clone(&all_work_done_barrier),
Arc::clone(&live_stats),
)));
let client_task = if args.grpc {
tokio::spawn(client_grpc(
args,
*tl,
Arc::clone(&start_work_barrier),
receiver,
Arc::clone(&all_work_done_barrier),
Arc::clone(&live_stats),
))
} else {
tokio::spawn(client(
args,
*tl,
Arc::clone(&start_work_barrier),
receiver,
Arc::clone(&all_work_done_barrier),
Arc::clone(&live_stats),
))
};
tasks.push(client_task);
}
let work_sender = async move {
@@ -165,7 +183,7 @@ async fn main_impl(
let (timeline, work) = {
let mut rng = rand::thread_rng();
let target = all_targets.choose(&mut rng).unwrap();
let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r));
let lsn = rng.gen_range(target.lsn_range.clone());
(
target.timeline,
Work {
@@ -215,7 +233,7 @@ async fn main_impl(
#[derive(Copy, Clone)]
struct Work {
lsn: Option<Lsn>,
lsn: Lsn,
gzip: bool,
}
@@ -240,7 +258,7 @@ async fn client(
.basebackup(&BasebackupRequest {
tenant_id: timeline.tenant_id,
timeline_id: timeline.timeline_id,
lsn,
lsn: Some(lsn),
gzip,
})
.await
@@ -270,3 +288,71 @@ async fn client(
all_work_done_barrier.wait().await;
}
#[instrument(skip_all)]
async fn client_grpc(
args: &'static Args,
timeline: TenantTimelineId,
start_work_barrier: Arc<Barrier>,
mut work: tokio::sync::mpsc::Receiver<Work>,
all_work_done_barrier: Arc<Barrier>,
live_stats: Arc<LiveStats>,
) {
let shard_map = HashMap::from([(0, args.page_service_connstring.clone())]);
let client = pageserver_client_grpc::PageserverClient::new(
&timeline.tenant_id.to_string(),
&timeline.timeline_id.to_string(),
&None,
shard_map,
);
start_work_barrier.wait().await;
while let Some(Work { lsn, gzip }) = work.recv().await {
let start = Instant::now();
//tokio::time::sleep(std::time::Duration::from_secs(1)).await;
info!("starting get_base_backup");
let mut basebackup_stream = client
.get_base_backup(
&GetBaseBackupRequest {
common: RequestCommon {
request_lsn: lsn,
not_modified_since_lsn: lsn,
},
replica: false,
},
gzip,
)
.await
.with_context(|| format!("start basebackup for {timeline}"))
.unwrap()
.into_inner();
info!("starting receive");
use futures::StreamExt;
let mut size = 0;
let mut nchunks = 0;
while let Some(chunk) = basebackup_stream.next().await {
let chunk = chunk
.context("error during basebackup")
.unwrap();
size += chunk.chunk.len();
nchunks += 1;
}
info!(
"basebackup size is {} bytes, avg chunk size {} bytes",
size,
size as f32 / nchunks as f32
);
let elapsed = start.elapsed();
live_stats.inc();
STATS.with(|stats| {
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
});
}
all_work_done_barrier.wait().await;
}

View File

@@ -1,4 +1,4 @@
use std::collections::{HashSet, VecDeque};
use std::collections::{HashMap, HashSet, VecDeque};
use std::future::Future;
use std::num::NonZeroUsize;
use std::pin::Pin;
@@ -8,6 +8,8 @@ use std::time::{Duration, Instant};
use anyhow::Context;
use camino::Utf8PathBuf;
use futures::StreamExt;
use futures::stream::FuturesOrdered;
use pageserver_api::key::Key;
use pageserver_api::keyspace::KeySpaceAccum;
use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
@@ -25,6 +27,8 @@ use crate::util::{request_stats, tokio_thread_local_stats};
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "false")]
grpc: bool,
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
@@ -295,7 +299,29 @@ async fn main_impl(
.unwrap();
Box::pin(async move {
client_libpq(args, worker_id, ss, cancel, rps_period, ranges, weights).await
if args.grpc {
client_grpc(
args,
worker_id,
ss,
cancel,
rps_period,
ranges,
weights,
)
.await
} else {
client_libpq(
args,
worker_id,
ss,
cancel,
rps_period,
ranges,
weights,
)
.await
}
})
};
@@ -434,3 +460,100 @@ async fn client_libpq(
}
}
}
async fn client_grpc(
args: &Args,
worker_id: WorkerId,
shared_state: Arc<SharedState>,
cancel: CancellationToken,
rps_period: Option<Duration>,
ranges: Vec<KeyRange>,
weights: rand::distributions::weighted::WeightedIndex<i128>,
) {
let shard_map = HashMap::from([(0, args.page_service_connstring.clone())]);
let client = pageserver_client_grpc::PageserverClient::new(
&worker_id.timeline.tenant_id.to_string(),
&worker_id.timeline.timeline_id.to_string(),
&None,
shard_map,
);
let client = Arc::new(client);
shared_state.start_work_barrier.wait().await;
let client_start = Instant::now();
let mut ticks_processed = 0;
let mut inflight = FuturesOrdered::new();
while !cancel.is_cancelled() {
// Detect if a request took longer than the RPS rate
if let Some(period) = &rps_period {
let periods_passed_until_now =
usize::try_from(client_start.elapsed().as_micros() / period.as_micros()).unwrap();
if periods_passed_until_now > ticks_processed {
shared_state
.live_stats
.missed((periods_passed_until_now - ticks_processed) as u64);
}
ticks_processed = periods_passed_until_now;
}
while inflight.len() < args.queue_depth.get() {
let start = Instant::now();
let req = {
let mut rng = rand::thread_rng();
let r = &ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = Key::from_i128(key);
assert!(key.is_rel_block_key());
let (rel_tag, block_no) = key
.to_rel_block()
.expect("we filter non-rel-block keys out above");
pageserver_data_api::model::GetPageRequest {
common: pageserver_data_api::model::RequestCommon {
request_lsn: if rng.gen_bool(args.req_latest_probability) {
Lsn::MAX
} else {
r.timeline_lsn
},
not_modified_since_lsn: r.timeline_lsn,
},
rel: pageserver_data_api::model::RelTag {
spc_oid: rel_tag.spcnode,
db_oid: rel_tag.dbnode,
rel_number: rel_tag.relnode,
fork_number: rel_tag.forknum,
},
block_number: block_no,
}
};
let client_clone = client.clone();
let getpage_fut = async move {
let result = client_clone.get_page(&req).await;
(start, result)
};
inflight.push_back(getpage_fut);
}
let (start, result) = inflight.next().await.unwrap();
result.expect("getpage request should succeed");
let end = Instant::now();
shared_state.live_stats.request_done();
ticks_processed += 1;
STATS.with(|stats| {
stats
.borrow()
.lock()
.unwrap()
.observe(end.duration_since(start))
.unwrap();
});
if let Some(period) = &rps_period {
let next_at = client_start
+ Duration::from_micros(
(ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
);
tokio::time::sleep_until(next_at.into()).await;
}
}
}

View File

@@ -151,10 +151,14 @@ where
.map_err(|_| BasebackupError::Shutdown)?,
),
};
basebackup
let res = basebackup
.send_tarball()
.instrument(info_span!("send_tarball", backup_lsn=%backup_lsn))
.await
.await;
info!("basebackup done!");
res
}
/// This is short-living object only for the time of tarball creation,

View File

@@ -16,6 +16,7 @@ use http_utils::tls_certs::ReloadingCertificateResolver;
use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric};
use metrics::set_build_info_metric;
use nix::sys::socket::{setsockopt, sockopt};
use pageserver::compute_service;
use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields};
use pageserver::controller_upcall_client::StorageControllerUpcallClient;
use pageserver::deletion_queue::DeletionQueue;
@@ -27,7 +28,7 @@ use pageserver::task_mgr::{
use pageserver::tenant::{TenantSharedResources, mgr, secondary};
use pageserver::{
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http,
page_cache, page_service, task_mgr, virtual_file,
page_cache, task_mgr, virtual_file,
};
use postgres_backend::AuthType;
use remote_storage::GenericRemoteStorage;
@@ -745,7 +746,7 @@ fn start_pageserver(
// Spawn a task to listen for libpq connections. It will spawn further tasks
// for each connection. We created the listener earlier already.
let perf_trace_dispatch = otel_guard.as_ref().map(|g| g.dispatch.clone());
let page_service = page_service::spawn(
let compute_service = compute_service::spawn(
conf,
tenant_manager.clone(),
pg_auth,
@@ -782,7 +783,7 @@ fn start_pageserver(
pageserver::shutdown_pageserver(
http_endpoint_listener,
https_endpoint_listener,
page_service,
compute_service,
consumption_metrics_tasks,
disk_usage_eviction_task,
&tenant_manager,

View File

@@ -0,0 +1,286 @@
//!
//! The Compute Service listens for compute connections, and serves requests like
//! the GetPage@LSN requests.
//!
//! We support two protocols:
//!
//! 1. Legacy, connection-oriented libpq based protocol. That's
//! handled by the code in page_service.rs.
//!
//! 2. gRPC based protocol. See compute_service_grpc.rs.
//!
//! To make the transition smooth, without having to open up new firewall ports
//! etc, both protocols are served on the same port. When a new TCP connection
//! is accepted, we peek at the first few bytes incoming from the client to
//! determine which protocol it speaks.
//!
//! TODO: This gets easier once we drop the legacy protocol support. Or if we
//! open a separate port for the gRPC protocol.
use std::sync::Arc;
use anyhow::Context;
use futures::FutureExt;
use pageserver_api::config::PageServicePipeliningConfig;
use postgres_backend::AuthType;
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::auth::SwappableJwtAuth;
use utils::sync::gate::{Gate, GateGuard};
use crate::compute_service_grpc::launch_compute_service_grpc_server;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
use crate::page_service::libpq_page_service_conn_main;
use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME, TaskKind};
use crate::tenant::mgr::TenantManager;
///////////////////////////////////////////////////////////////////////////////
pub type ConnectionHandlerResult = anyhow::Result<()>;
pub struct Connections {
cancel: CancellationToken,
tasks: tokio::task::JoinSet<ConnectionHandlerResult>,
gate: Gate,
}
impl Connections {
pub(crate) async fn shutdown(self) {
let Self {
cancel,
mut tasks,
gate,
} = self;
cancel.cancel();
while let Some(res) = tasks.join_next().await {
Self::handle_connection_completion(res);
}
gate.close().await;
}
fn handle_connection_completion(res: Result<anyhow::Result<()>, tokio::task::JoinError>) {
match res {
Ok(Ok(())) => {}
Ok(Err(e)) => error!("error in page_service connection task: {:?}", e),
Err(e) => error!("page_service connection task panicked: {:?}", e),
}
}
}
pub struct Listener {
cancel: CancellationToken,
/// Cancel the listener task through `cancel` to shut down the listener
/// and get a handle on the existing connections by awaiting this task.
task: JoinHandle<Connections>,
}
pub fn spawn(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
pg_auth: Option<Arc<SwappableJwtAuth>>,
perf_trace_dispatch: Option<Dispatch>,
tcp_listener: tokio::net::TcpListener,
tls_config: Option<Arc<rustls::ServerConfig>>,
) -> Listener {
let cancel = CancellationToken::new();
let libpq_ctx = RequestContext::todo_child(
TaskKind::LibpqEndpointListener,
// listener task shouldn't need to download anything. (We will
// create a separate sub-contexts for each connection, with their
// own download behavior. This context is used only to listen and
// accept connections.)
DownloadBehavior::Error,
);
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"compute connection listener",
compute_connection_listener_main(
conf,
tenant_manager,
pg_auth,
perf_trace_dispatch,
tcp_listener,
conf.pg_auth_type,
tls_config,
conf.page_service_pipelining.clone(),
libpq_ctx,
cancel.clone(),
)
.map(anyhow::Ok),
));
Listener { cancel, task }
}
impl Listener {
pub async fn stop_accepting(self) -> Connections {
self.cancel.cancel();
self.task
.await
.expect("unreachable: we wrap the listener task in task_mgr::exit_on_panic_or_error")
}
}
/// Listener loop. Listens for connections, and launches a new handler
/// task for each.
///
/// Returns upon cancellation via `listener_cancel`, handing back the set of
/// open connections.
///
#[allow(clippy::too_many_arguments)]
pub async fn compute_connection_listener_main(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
perf_trace_dispatch: Option<Dispatch>,
listener: tokio::net::TcpListener,
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
listener_ctx: RequestContext,
listener_cancel: CancellationToken,
) -> Connections {
let connections_cancel = CancellationToken::new();
let connections_gate = Gate::default();
let mut connection_handler_tasks = tokio::task::JoinSet::default();
// The per-connection handler tasks pass gRPC protocol
// connections to this channel. The tonic gRPC server reads the
// channel and takes over the connections from there.
let (grpc_connections_tx, grpc_connections_rx) = tokio::sync::mpsc::channel(1000);
// Set up the gRPC service
launch_compute_service_grpc_server(
grpc_connections_rx,
conf,
tenant_manager.clone(),
auth.clone(),
auth_type,
connections_cancel.clone(),
&listener_ctx,
);
// Main listener loop
loop {
let gate_guard = match connections_gate.enter() {
Ok(guard) => guard,
Err(_) => break,
};
let accepted = tokio::select! {
biased;
_ = listener_cancel.cancelled() => break,
next = connection_handler_tasks.join_next(), if !connection_handler_tasks.is_empty() => {
let res = next.expect("we don't poll while empty");
Connections::handle_connection_completion(res);
continue;
}
accepted = listener.accept() => accepted,
};
match accepted {
Ok((socket, peer_addr)) => {
// Connection established. Spawn a new task to handle it.
debug!("accepted connection from {}", peer_addr);
let local_auth = auth.clone();
let connection_ctx = RequestContextBuilder::from(&listener_ctx)
.task_kind(TaskKind::PageRequestHandler)
.download_behavior(DownloadBehavior::Download)
.perf_span_dispatch(perf_trace_dispatch.clone())
.detached_child();
connection_handler_tasks.spawn(page_service_conn_main(
conf,
tenant_manager.clone(),
local_auth,
socket,
auth_type,
tls_config.clone(),
pipelining_config.clone(),
connection_ctx,
connections_cancel.child_token(),
gate_guard,
grpc_connections_tx.clone(),
));
}
Err(err) => {
// accept() failed. Log the error, and loop back to retry on next connection.
error!("accept() failed: {:?}", err);
}
}
}
debug!("page_service listener loop terminated");
Connections {
cancel: connections_cancel,
tasks: connection_handler_tasks,
gate: connections_gate,
}
}
/// Handle a new incoming connection.
///
/// This peeks at the first few incoming bytes and dispatches the connection
/// to the legacy libpq handler or the new gRPC handler accordingly.
#[instrument(skip_all, fields(peer_addr, application_name, compute_mode))]
#[allow(clippy::too_many_arguments)]
pub async fn page_service_conn_main(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
socket: tokio::net::TcpStream,
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
connection_ctx: RequestContext,
cancel: CancellationToken,
gate_guard: GateGuard,
grpc_connections_tx: tokio::sync::mpsc::Sender<tokio::io::Result<tokio::net::TcpStream>>,
) -> ConnectionHandlerResult {
let mut buf: [u8; 4] = [0; 4];
socket
.set_nodelay(true)
.context("could not set TCP_NODELAY")?;
// Peek
socket.peek(&mut buf).await?;
let mut grpc = false;
if buf[0] == 0x16 {
// looks like a TLS handshake. Assume gRPC.
// XXX: Starting with v17, PostgreSQL also supports "direct TLS mode". But
// the compute doesn't use it.
grpc = true;
}
if buf[0] == b'G' || buf[0] == b'P' {
// Looks like 'GET' or 'POST'
// or 'PRI', indicating gRPC over HTTP/2 with prior knowledge
grpc = true;
}
// Dispatch
if grpc {
grpc_connections_tx.send(Ok(socket)).await?;
info!("connection sent to channel");
Ok(())
} else {
libpq_page_service_conn_main(
conf,
tenant_manager,
auth,
socket,
auth_type,
tls_config,
pipelining_config,
connection_ctx,
cancel,
gate_guard,
)
.await
}
}

View File

@@ -0,0 +1,746 @@
//!
//! Compute <-> Pageserver API handler. This is for the new gRPC-based protocol
//!
//! TODO:
//!
//! - Many of the API endpoints are still missing
//!
//! - This is very much not optimized.
//!
//! - Much of the code was copy-pasted from page_service.rs. Like the code to get the
//! Timeline object, and the JWT auth. Could refactor and share.
//!
//!
use std::pin::Pin;
use std::str::FromStr;
use std::sync::Arc;
use std::task::Poll;
use std::time::Duration;
use std::time::Instant;
use crate::TenantManager;
use crate::auth::check_permission;
use crate::basebackup;
use crate::basebackup::BasebackupError;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
use crate::task_mgr::TaskKind;
use crate::tenant::Timeline;
use crate::tenant::mgr::ShardResolveResult;
use crate::tenant::mgr::ShardSelector;
use crate::tenant::storage_layer::IoConcurrency;
use crate::tenant::timeline::WaitLsnTimeout;
use tokio::io::{AsyncWriteExt, ReadHalf, SimplexStream};
use tokio::task::JoinHandle;
use tokio_util::codec::{Decoder, FramedRead};
use tokio_util::sync::CancellationToken;
use futures::stream::StreamExt;
use pageserver_data_api::model;
use pageserver_data_api::proto::page_service_server::PageService;
use pageserver_data_api::proto::page_service_server::PageServiceServer;
use anyhow::Context;
use bytes::BytesMut;
use jsonwebtoken::TokenData;
use tracing::Instrument;
use tracing::{debug, error};
use utils::auth::SwappableJwtAuth;
use utils::id::{TenantId, TenantTimelineId, TimelineId};
use utils::lsn::Lsn;
use utils::simple_rcu::RcuReadGuard;
use crate::tenant::PageReconstructError;
use postgres_ffi::BLCKSZ;
use tonic;
use tonic::codec::CompressionEncoding;
use tonic::service::interceptor::InterceptedService;
use pageserver_api::key::rel_block_to_key;
use crate::pgdatadir_mapping::Version;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_backend::AuthType;
pub use pageserver_data_api::proto;
pub(super) fn launch_compute_service_grpc_server(
tcp_connections_rx: tokio::sync::mpsc::Receiver<tokio::io::Result<tokio::net::TcpStream>>,
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
auth_type: AuthType,
connections_cancel: CancellationToken,
listener_ctx: &RequestContext,
) {
// Set up the gRPC service
let service_ctx = RequestContextBuilder::from(listener_ctx)
.task_kind(TaskKind::PageRequestHandler)
.download_behavior(DownloadBehavior::Download)
.attached_child();
let service = crate::compute_service_grpc::PageServiceService {
conf,
tenant_mgr: tenant_manager.clone(),
ctx: Arc::new(service_ctx),
};
let authenticator = PageServiceAuthenticator {
auth: auth.clone(),
auth_type,
};
let server = InterceptedService::new(
PageServiceServer::new(service).send_compressed(CompressionEncoding::Gzip),
authenticator,
);
let cc = connections_cancel.clone();
tokio::spawn(async move {
tonic::transport::Server::builder()
.add_service(server)
.serve_with_incoming_shutdown(
tokio_stream::wrappers::ReceiverStream::new(tcp_connections_rx),
cc.cancelled(),
)
.await
});
}
struct PageServiceService {
conf: &'static PageServerConf,
tenant_mgr: Arc<TenantManager>,
ctx: Arc<RequestContext>,
}
/// Map a PageReconstructError (an error during a get operation) to an appropriate gRPC status.
impl From<PageReconstructError> for tonic::Status {
fn from(e: PageReconstructError) -> Self {
match e {
PageReconstructError::Other(err) => tonic::Status::unknown(err.to_string()),
PageReconstructError::AncestorLsnTimeout(_) => {
tonic::Status::unavailable(e.to_string())
}
PageReconstructError::Cancelled => tonic::Status::aborted(e.to_string()),
PageReconstructError::WalRedo(_) => tonic::Status::internal(e.to_string()),
PageReconstructError::MissingKey(_) => tonic::Status::internal(e.to_string()),
}
}
}
fn convert_reltag(value: &model::RelTag) -> pageserver_api::reltag::RelTag {
pageserver_api::reltag::RelTag {
spcnode: value.spc_oid,
dbnode: value.db_oid,
relnode: value.rel_number,
forknum: value.fork_number,
}
}
#[tonic::async_trait]
impl PageService for PageServiceService {
type GetBaseBackupStream = GetBaseBackupStream;
async fn rel_exists(
&self,
request: tonic::Request<proto::RelExistsRequest>,
) -> std::result::Result<tonic::Response<proto::RelExistsResponse>, tonic::Status> {
let ttid = self.extract_ttid(request.metadata())?;
let req: model::RelExistsRequest = request.get_ref().try_into()?;
let rel = convert_reltag(&req.rel);
let span = tracing::info_span!("rel_exists", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, rel = %rel, req_lsn = %req.common.request_lsn);
async {
let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?;
let ctx = self.ctx.with_scope_timeline(&timeline);
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
&timeline,
req.common.request_lsn,
req.common.not_modified_since_lsn,
&latest_gc_cutoff_lsn,
&ctx,
)
.await?;
let exists = timeline
.get_rel_exists(rel, Version::Lsn(lsn), &ctx)
.await?;
Ok(tonic::Response::new(proto::RelExistsResponse { exists }))
}
.instrument(span)
.await
}
/// Returns size of a relation, as # of blocks
async fn rel_size(
&self,
request: tonic::Request<proto::RelSizeRequest>,
) -> std::result::Result<tonic::Response<proto::RelSizeResponse>, tonic::Status> {
let ttid = self.extract_ttid(request.metadata())?;
let req: model::RelSizeRequest = request.get_ref().try_into()?;
let rel = convert_reltag(&req.rel);
let span = tracing::info_span!("rel_size", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, rel = %rel, req_lsn = %req.common.request_lsn);
async {
let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?;
let ctx = self.ctx.with_scope_timeline(&timeline);
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
&timeline,
req.common.request_lsn,
req.common.not_modified_since_lsn,
&latest_gc_cutoff_lsn,
&ctx,
)
.await?;
let num_blocks = timeline.get_rel_size(rel, Version::Lsn(lsn), &ctx).await?;
Ok(tonic::Response::new(proto::RelSizeResponse { num_blocks }))
}
.instrument(span)
.await
}
async fn get_page(
&self,
request: tonic::Request<proto::GetPageRequest>,
) -> std::result::Result<tonic::Response<proto::GetPageResponse>, tonic::Status> {
let ttid = self.extract_ttid(request.metadata())?;
let req: model::GetPageRequest = request.get_ref().try_into()?;
// Calculate shard number.
//
// FIXME: this should probably be part of the data_api crate.
let rel = convert_reltag(&req.rel);
let key = rel_block_to_key(rel, req.block_number);
let timeline = self.get_timeline(ttid, ShardSelector::Page(key)).await?;
let ctx = self.ctx.with_scope_timeline(&timeline);
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
&timeline,
req.common.request_lsn,
req.common.not_modified_since_lsn,
&latest_gc_cutoff_lsn,
&ctx,
)
.await?;
let shard_id = timeline.tenant_shard_id.shard_number;
let span = tracing::info_span!("get_page", tenant_id = %ttid.tenant_id, shard_id = %shard_id, timeline_id = %ttid.timeline_id, rel = %rel, block_number = %req.block_number, req_lsn = %req.common.request_lsn);
async {
let gate_guard = match timeline.gate.enter() {
Ok(guard) => guard,
Err(_) => {
return Err(tonic::Status::unavailable("timeline is shutting down"));
}
};
let io_concurrency = IoConcurrency::spawn_from_conf(self.conf, gate_guard);
let page_image = timeline
.get_rel_page_at_lsn(
rel,
req.block_number,
Version::Lsn(lsn),
&ctx,
io_concurrency,
)
.await?;
Ok(tonic::Response::new(proto::GetPageResponse {
page_image,
}))
}
.instrument(span)
.await
}
async fn db_size(
&self,
request: tonic::Request<proto::DbSizeRequest>,
) -> Result<tonic::Response<proto::DbSizeResponse>, tonic::Status> {
let ttid = self.extract_ttid(request.metadata())?;
let req: model::DbSizeRequest = request.get_ref().try_into()?;
let span = tracing::info_span!("db_size", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, db_oid = %req.db_oid, req_lsn = %req.common.request_lsn);
async {
let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?;
let ctx = self.ctx.with_scope_timeline(&timeline);
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
&timeline,
req.common.request_lsn,
req.common.not_modified_since_lsn,
&latest_gc_cutoff_lsn,
&ctx,
)
.await?;
let total_blocks = timeline
.get_db_size(DEFAULTTABLESPACE_OID, req.db_oid, Version::Lsn(lsn), &ctx)
.await?;
Ok(tonic::Response::new(proto::DbSizeResponse {
num_bytes: total_blocks as u64 * BLCKSZ as u64,
}))
}
.instrument(span)
.await
}
async fn get_base_backup(
&self,
request: tonic::Request<proto::GetBaseBackupRequest>,
) -> Result<tonic::Response<Self::GetBaseBackupStream>, tonic::Status> {
let ttid = self.extract_ttid(request.metadata())?;
let req: model::GetBaseBackupRequest = request.get_ref().try_into()?;
let timeline = self.get_timeline(ttid, ShardSelector::Zero).await?;
let ctx = self.ctx.with_scope_timeline(&timeline);
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(
&timeline,
req.common.request_lsn,
req.common.not_modified_since_lsn,
&latest_gc_cutoff_lsn,
&ctx,
)
.await?;
let span = tracing::info_span!("get_base_backup", tenant_id = %ttid.tenant_id, timeline_id = %ttid.timeline_id, req_lsn = %req.common.request_lsn);
tracing::info!("starting basebackup");
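// TestMode selects between the real streaming basebackup and the dummy/materialized
// variants kept around for testing and benchmarking; Streaming, chosen below, is the
// variant actually in effect.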
#[allow(dead_code)]
enum TestMode {
/// Create real basebackup, in streaming fashion
Streaming,
/// Create real basebackup, but fully materialize it in the 'simplex' pipe buffer first
Materialize,
/// Create a dummy all-zeros basebackup, in streaming fashion
DummyStreaming,
/// Create a dummy all-zeros basebackup, but fully materialize it first
DummyMaterialize,
}
let mode = TestMode::Streaming;
let buf_size = match mode {
TestMode::Streaming | TestMode::DummyStreaming => 64 * 1024,
TestMode::Materialize | TestMode::DummyMaterialize => 64 * 1024 * 1024,
};
let (simplex_read, mut simplex_write) = tokio::io::simplex(buf_size);
let basebackup_task = match mode {
TestMode::DummyStreaming => {
tokio::spawn(
async move {
// hold onto the guard for as long as the basebackup runs
let _latest_gc_cutoff_lsn = latest_gc_cutoff_lsn;
let zerosbuf: [u8; 1024] = [0; 1024];
let nbytes = 16900000;
let mut bytes_written = 0;
while bytes_written < nbytes {
let s = std::cmp::min(1024, nbytes - bytes_written);
let _ = simplex_write.write_all(&zerosbuf[0..s]).await;
bytes_written += s;
}
simplex_write
.shutdown()
.await
.context("shutdown of basebackup pipe")?;
Ok(())
}
.instrument(span),
)
}
TestMode::DummyMaterialize => {
let zerosbuf: [u8; 1024] = [0; 1024];
let nbytes = 16900000;
let mut bytes_written = 0;
while bytes_written < nbytes {
let s = std::cmp::min(1024, nbytes - bytes_written);
let _ = simplex_write.write_all(&zerosbuf[0..s]).await;
bytes_written += s;
}
simplex_write
.shutdown()
.await
.expect("shutdown of basebackup pipe");
tracing::info!("basebackup (dummy) materialized");
let result = Ok(());
tokio::spawn(std::future::ready(result))
}
TestMode::Materialize => {
let result = basebackup::send_basebackup_tarball(
&mut simplex_write,
&timeline,
Some(lsn),
None,
false,
req.replica,
&ctx,
)
.await;
simplex_write
.shutdown()
.await
.expect("shutdown of basebackup pipe");
tracing::info!("basebackup materialized");
// Launch a task that writes the basebackup tarball to the simplex pipe
tokio::spawn(std::future::ready(result))
}
TestMode::Streaming => {
tokio::spawn(
async move {
// hold onto the guard for as long as the basebackup runs
let _latest_gc_cutoff_lsn = latest_gc_cutoff_lsn;
let result = basebackup::send_basebackup_tarball(
&mut simplex_write,
&timeline,
Some(lsn),
None,
false,
req.replica,
&ctx,
)
.await;
simplex_write
.shutdown()
.await
.context("shutdown of basebackup pipe")?;
result
}
.instrument(span),
)
}
};
let response = new_basebackup_response_stream(simplex_read, basebackup_task);
Ok(tonic::Response::new(response))
}
}
/// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`].
/// NB: and also different from page_service::ACTIVE_TENANT_TIMEOUT
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
impl PageServiceService {
async fn get_timeline(
&self,
ttid: TenantTimelineId,
shard_selector: ShardSelector,
) -> Result<Arc<Timeline>, tonic::Status> {
let timeout = ACTIVE_TENANT_TIMEOUT;
let wait_start = Instant::now();
let deadline = wait_start + timeout;
let tenant_shard = loop {
let resolved = self
.tenant_mgr
.resolve_attached_shard(&ttid.tenant_id, shard_selector);
match resolved {
ShardResolveResult::Found(tenant_shard) => break tenant_shard,
ShardResolveResult::NotFound => {
return Err(tonic::Status::not_found("tenant not found"));
}
ShardResolveResult::InProgress(barrier) => {
// We can't authoritatively answer right now: wait for InProgress state
// to end, then try again
tokio::select! {
_ = barrier.wait() => {
// The barrier completed: proceed around the loop to try looking up again
},
_ = tokio::time::sleep(deadline.duration_since(Instant::now())) => {
return Err(tonic::Status::unavailable("tenant is in InProgress state"));
}
}
}
}
};
tracing::debug!("Waiting for tenant to enter active state...");
tenant_shard
.wait_to_become_active(deadline.duration_since(Instant::now()))
.await
.map_err(|e| {
tonic::Status::unavailable(format!("tenant is not in active state: {e}"))
})?;
let timeline = tenant_shard
.get_timeline(ttid.timeline_id, true)
.map_err(|e| tonic::Status::unavailable(format!("could not get timeline: {e}")))?;
// FIXME: need to do something with the 'gate' here?
Ok(timeline)
}
/// Extract TenantTimelineId from the request metadata
///
/// Note: the interceptor has already authenticated the request
///
/// TODO: Could we use "binary" metadata for these, for efficiency? gRPC has such a concept
fn extract_ttid(
&self,
metadata: &tonic::metadata::MetadataMap,
) -> Result<TenantTimelineId, tonic::Status> {
let tenant_id = metadata
.get("neon-tenant-id")
.ok_or(tonic::Status::invalid_argument(
"neon-tenant-id metadata missing",
))?;
let tenant_id = tenant_id.to_str().map_err(|_| {
tonic::Status::invalid_argument("invalid UTF-8 characters in neon-tenant-id metadata")
})?;
let tenant_id = TenantId::from_str(tenant_id)
.map_err(|_| tonic::Status::invalid_argument("invalid neon-tenant-id metadata"))?;
let timeline_id =
metadata
.get("neon-timeline-id")
.ok_or(tonic::Status::invalid_argument(
"neon-timeline-id metadata missing",
))?;
let timeline_id = timeline_id.to_str().map_err(|_| {
tonic::Status::invalid_argument("invalid UTF-8 characters in neon-timeline-id metadata")
})?;
let timeline_id = TimelineId::from_str(timeline_id)
.map_err(|_| tonic::Status::invalid_argument("invalid neon-timeline-id metadata"))?;
Ok(TenantTimelineId::new(tenant_id, timeline_id))
}
// XXX: copied from PageServerHandler
async fn wait_or_get_last_lsn(
timeline: &Timeline,
request_lsn: Lsn,
not_modified_since: Lsn,
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
ctx: &RequestContext,
) -> Result<Lsn, tonic::Status> {
let last_record_lsn = timeline.get_last_record_lsn();
// Sanity check the request
if request_lsn < not_modified_since {
return Err(tonic::Status::invalid_argument(format!(
"invalid request with request LSN {} and not_modified_since {}",
request_lsn, not_modified_since,
)));
}
// Check explicitly for INVALID just to get a less scary error message if the request is obviously bogus
if request_lsn == Lsn::INVALID {
return Err(tonic::Status::invalid_argument("invalid LSN(0) in request"));
}
// Clients should only read from recent LSNs on their timeline, or from locations holding an LSN lease.
//
// We may have older data available, but we make a best effort to detect this case and return an error,
// to distinguish a misbehaving client (asking for old LSN) from a storage issue (data missing at a legitimate LSN).
if request_lsn < **latest_gc_cutoff_lsn && !timeline.is_gc_blocked_by_lsn_lease_deadline() {
let gc_info = &timeline.gc_info.read().unwrap();
if !gc_info.lsn_covered_by_lease(request_lsn) {
return Err(tonic::Status::not_found(format!(
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
request_lsn, **latest_gc_cutoff_lsn
)));
}
}
// Wait for WAL up to 'not_modified_since' to arrive, if necessary
if not_modified_since > last_record_lsn {
timeline
.wait_lsn(
not_modified_since,
crate::tenant::timeline::WaitLsnWaiter::PageService,
WaitLsnTimeout::Default,
ctx,
)
.await
.map_err(|_| {
tonic::Status::unavailable("not_modified_since LSN not arrived yet")
})?;
// Since we waited for 'not_modified_since' to arrive, that is now the last
// record LSN. (Or close enough for our purposes; the last-record LSN can
// advance immediately after we return anyway)
Ok(not_modified_since)
} else {
// It might be better to use max(not_modified_since, latest_gc_cutoff_lsn)
// here instead. That would give the same result, since we know that there
// haven't been any modifications since 'not_modified_since'. Using an older
// LSN might be faster, because that could allow skipping recent layers when
// finding the page. However, we have historically used 'last_record_lsn', so
// stick to that for now.
Ok(std::cmp::min(last_record_lsn, request_lsn))
}
}
}
#[derive(Clone)]
pub struct PageServiceAuthenticator {
pub auth: Option<Arc<SwappableJwtAuth>>,
pub auth_type: AuthType,
}
impl tonic::service::Interceptor for PageServiceAuthenticator {
fn call(
&mut self,
req: tonic::Request<()>,
) -> std::result::Result<tonic::Request<()>, tonic::Status> {
// Check the tenant_id in any case
let tenant_id =
req.metadata()
.get("neon-tenant-id")
.ok_or(tonic::Status::invalid_argument(
"neon-tenant-id metadata missing",
))?;
let tenant_id = tenant_id.to_str().map_err(|_| {
tonic::Status::invalid_argument("invalid UTF-8 characters in neon-tenant-id metadata")
})?;
let tenant_id = TenantId::from_str(tenant_id)
.map_err(|_| tonic::Status::invalid_argument("invalid neon-tenant-id metadata"))?;
// when accessing management api supply None as an argument
// when using to authorize tenant pass corresponding tenant id
let auth = if let Some(auth) = &self.auth {
auth
} else {
// auth is set to Trust, nothing to check so just return ok
return Ok(req);
};
let jwt = req
.metadata()
.get("neon-auth-token")
.ok_or(tonic::Status::unauthenticated("no neon-auth-token"))?;
let jwt = jwt.to_str().map_err(|_| {
tonic::Status::invalid_argument("invalid UTF-8 characters in neon-auth-token metadata")
})?;
let jwtdata: TokenData<utils::auth::Claims> = auth
.decode(jwt)
.map_err(|err| tonic::Status::unauthenticated(format!("invalid JWT token: {}", err)))?;
let claims = jwtdata.claims;
if matches!(claims.scope, utils::auth::Scope::Tenant) && claims.tenant_id.is_none() {
return Err(tonic::Status::unauthenticated(
"jwt token scope is Tenant, but tenant id is missing",
));
}
debug!(
"jwt scope check succeeded for scope: {:#?} by tenant id: {:?}",
claims.scope, claims.tenant_id,
);
// The token is valid. Check if it's allowed to access the tenant ID
// given in the request.
check_permission(&claims, Some(tenant_id))
.map_err(|err| tonic::Status::permission_denied(err.to_string()))?;
// All checks out
Ok(req)
}
}
/// Stream of GetBaseBackupResponseChunk messages.
///
/// The first part of the Chain chunks the tarball. The second part checks the return value
/// of the send_basebackup_tarball Future that created the tarball.
type GetBaseBackupStream = futures::stream::Chain<BasebackupChunkedStream, CheckResultStream>;
fn new_basebackup_response_stream(
simplex_read: ReadHalf<SimplexStream>,
basebackup_task: JoinHandle<Result<(), BasebackupError>>,
) -> GetBaseBackupStream {
let framed = FramedRead::new(simplex_read, GetBaseBackupResponseDecoder {});
framed.chain(CheckResultStream { basebackup_task })
}
/// Stream that uses GetBaseBackupResponseDecoder
type BasebackupChunkedStream =
tokio_util::codec::FramedRead<ReadHalf<SimplexStream>, GetBaseBackupResponseDecoder>;
struct GetBaseBackupResponseDecoder;
impl Decoder for GetBaseBackupResponseDecoder {
type Item = proto::GetBaseBackupResponseChunk;
type Error = tonic::Status;
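// Emit a chunk only once at least 64 KiB has accumulated in the buffer;
// decode_eof() below flushes whatever remains when the pipe is closed.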
fn decode(&mut self, src: &mut BytesMut) -> Result<Option<Self::Item>, Self::Error> {
if src.len() < 64 * 1024 {
return Ok(None);
}
let item = proto::GetBaseBackupResponseChunk {
chunk: bytes::Bytes::from(std::mem::take(src)),
};
Ok(Some(item))
}
fn decode_eof(&mut self, src: &mut BytesMut) -> Result<Option<Self::Item>, Self::Error> {
if src.is_empty() {
return Ok(None);
}
let item = proto::GetBaseBackupResponseChunk {
chunk: bytes::Bytes::from(std::mem::take(src)),
};
Ok(Some(item))
}
}
struct CheckResultStream {
basebackup_task: tokio::task::JoinHandle<Result<(), BasebackupError>>,
}
impl futures::Stream for CheckResultStream {
type Item = Result<proto::GetBaseBackupResponseChunk, tonic::Status>;
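// This stream yields no data chunks of its own: it resolves once the basebackup
// task finishes, ending the combined stream cleanly on success or surfacing the
// task's failure (or panic) as a single gRPC error status.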
fn poll_next(
mut self: Pin<&mut Self>,
ctx: &mut std::task::Context<'_>,
) -> Poll<Option<Self::Item>> {
let task = Pin::new(&mut self.basebackup_task);
match task.poll(ctx) {
Poll::Pending => Poll::Pending,
Poll::Ready(Ok(Ok(()))) => Poll::Ready(None),
Poll::Ready(Ok(Err(basebackup_err))) => {
error!(error=%basebackup_err, "error getting basebackup");
Poll::Ready(Some(Err(tonic::Status::internal(
"could not get basebackup",
))))
}
Poll::Ready(Err(join_err)) => {
error!(error=%join_err, "JoinError getting basebackup");
Poll::Ready(Some(Err(tonic::Status::internal(
"could not get basebackup",
))))
}
}
}
}

View File

@@ -21,6 +21,8 @@ pub use pageserver_api::keyspace;
use tokio_util::sync::CancellationToken;
mod assert_u64_eq_usize;
pub mod aux_file;
pub mod compute_service;
pub mod compute_service_grpc;
pub mod metrics;
pub mod page_cache;
pub mod page_service;
@@ -82,7 +84,7 @@ impl CancellableTask {
pub async fn shutdown_pageserver(
http_listener: HttpEndpointListener,
https_listener: Option<HttpsEndpointListener>,
page_service: page_service::Listener,
compute_service: compute_service::Listener,
consumption_metrics_worker: ConsumptionMetricsTasks,
disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
tenant_manager: &TenantManager,
@@ -167,11 +169,11 @@ pub async fn shutdown_pageserver(
}
});
// Shut down the libpq endpoint task. This prevents new connections from
// Shut down the compute service endpoint task. This prevents new connections from
// being accepted.
let remaining_connections = timed(
page_service.stop_accepting(),
"shutdown LibpqEndpointListener",
compute_service.stop_accepting(),
"shutdown compute service listener",
Duration::from_secs(1),
)
.await;

View File

@@ -13,7 +13,6 @@ use crate::PERF_TRACE_TARGET;
use anyhow::{Context, bail};
use async_compression::tokio::write::GzipEncoder;
use bytes::Buf;
use futures::FutureExt;
use itertools::Itertools;
use jsonwebtoken::TokenData;
use once_cell::sync::OnceCell;
@@ -40,7 +39,6 @@ use pq_proto::framed::ConnectionError;
use pq_proto::{BeMessage, FeMessage, FeStartupPacket, RowDescriptor};
use strum_macros::IntoStaticStr;
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufWriter};
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::auth::{Claims, Scope, SwappableJwtAuth};
@@ -49,15 +47,13 @@ use utils::id::{TenantId, TimelineId};
use utils::logging::log_slow;
use utils::lsn::Lsn;
use utils::simple_rcu::RcuReadGuard;
use utils::sync::gate::{Gate, GateGuard};
use utils::sync::gate::GateGuard;
use utils::sync::spsc_fold;
use crate::auth::check_permission;
use crate::basebackup::BasebackupError;
use crate::config::PageServerConf;
use crate::context::{
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
};
use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
use crate::metrics::{
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
SmgrOpTimer, TimelineMetrics,
@@ -67,7 +63,6 @@ use crate::span::{
debug_assert_current_span_has_tenant_and_timeline_id,
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
};
use crate::task_mgr::{self, COMPUTE_REQUEST_RUNTIME, TaskKind};
use crate::tenant::mgr::{
GetActiveTenantError, GetTenantError, ShardResolveResult, ShardSelector, TenantManager,
};
@@ -85,171 +80,6 @@ const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
/// Threshold at which to log slow GetPage requests.
const LOG_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30);
///////////////////////////////////////////////////////////////////////////////
pub struct Listener {
cancel: CancellationToken,
/// Cancel the listener task through `listen_cancel` to shut down the listener
/// and get a handle on the existing connections.
task: JoinHandle<Connections>,
}
pub struct Connections {
cancel: CancellationToken,
tasks: tokio::task::JoinSet<ConnectionHandlerResult>,
gate: Gate,
}
pub fn spawn(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
pg_auth: Option<Arc<SwappableJwtAuth>>,
perf_trace_dispatch: Option<Dispatch>,
tcp_listener: tokio::net::TcpListener,
tls_config: Option<Arc<rustls::ServerConfig>>,
) -> Listener {
let cancel = CancellationToken::new();
let libpq_ctx = RequestContext::todo_child(
TaskKind::LibpqEndpointListener,
// listener task shouldn't need to download anything. (We will
// create a separate sub-contexts for each connection, with their
// own download behavior. This context is used only to listen and
// accept connections.)
DownloadBehavior::Error,
);
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"libpq listener",
libpq_listener_main(
conf,
tenant_manager,
pg_auth,
perf_trace_dispatch,
tcp_listener,
conf.pg_auth_type,
tls_config,
conf.page_service_pipelining.clone(),
libpq_ctx,
cancel.clone(),
)
.map(anyhow::Ok),
));
Listener { cancel, task }
}
impl Listener {
pub async fn stop_accepting(self) -> Connections {
self.cancel.cancel();
self.task
.await
.expect("unreachable: we wrap the listener task in task_mgr::exit_on_panic_or_error")
}
}
impl Connections {
pub(crate) async fn shutdown(self) {
let Self {
cancel,
mut tasks,
gate,
} = self;
cancel.cancel();
while let Some(res) = tasks.join_next().await {
Self::handle_connection_completion(res);
}
gate.close().await;
}
fn handle_connection_completion(res: Result<anyhow::Result<()>, tokio::task::JoinError>) {
match res {
Ok(Ok(())) => {}
Ok(Err(e)) => error!("error in page_service connection task: {:?}", e),
Err(e) => error!("page_service connection task panicked: {:?}", e),
}
}
}
///
/// Main loop of the page service.
///
/// Listens for connections, and launches a new handler task for each.
///
/// Returns Ok(()) upon cancellation via `cancel`, returning the set of
/// open connections.
///
#[allow(clippy::too_many_arguments)]
pub async fn libpq_listener_main(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
perf_trace_dispatch: Option<Dispatch>,
listener: tokio::net::TcpListener,
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
listener_ctx: RequestContext,
listener_cancel: CancellationToken,
) -> Connections {
let connections_cancel = CancellationToken::new();
let connections_gate = Gate::default();
let mut connection_handler_tasks = tokio::task::JoinSet::default();
loop {
let gate_guard = match connections_gate.enter() {
Ok(guard) => guard,
Err(_) => break,
};
let accepted = tokio::select! {
biased;
_ = listener_cancel.cancelled() => break,
next = connection_handler_tasks.join_next(), if !connection_handler_tasks.is_empty() => {
let res = next.expect("we dont poll while empty");
Connections::handle_connection_completion(res);
continue;
}
accepted = listener.accept() => accepted,
};
match accepted {
Ok((socket, peer_addr)) => {
// Connection established. Spawn a new task to handle it.
debug!("accepted connection from {}", peer_addr);
let local_auth = auth.clone();
let connection_ctx = RequestContextBuilder::from(&listener_ctx)
.task_kind(TaskKind::PageRequestHandler)
.download_behavior(DownloadBehavior::Download)
.perf_span_dispatch(perf_trace_dispatch.clone())
.detached_child();
connection_handler_tasks.spawn(page_service_conn_main(
conf,
tenant_manager.clone(),
local_auth,
socket,
auth_type,
tls_config.clone(),
pipelining_config.clone(),
connection_ctx,
connections_cancel.child_token(),
gate_guard,
));
}
Err(err) => {
// accept() failed. Log the error, and loop back to retry on next connection.
error!("accept() failed: {:?}", err);
}
}
}
debug!("page_service listener loop terminated");
Connections {
cancel: connections_cancel,
tasks: connection_handler_tasks,
gate: connections_gate,
}
}
type ConnectionHandlerResult = anyhow::Result<()>;
/// Perf root spans start at the per-request level, after shard routing.
@@ -261,9 +91,10 @@ struct ConnectionPerfSpanFields {
compute_mode: Option<String>,
}
/// note: the caller has already set TCP_NODELAY on the socket
#[instrument(skip_all, fields(peer_addr, application_name, compute_mode))]
#[allow(clippy::too_many_arguments)]
async fn page_service_conn_main(
pub async fn libpq_page_service_conn_main(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
@@ -279,10 +110,6 @@ async fn page_service_conn_main(
.with_label_values(&["page_service"])
.guard();
socket
.set_nodelay(true)
.context("could not set TCP_NODELAY")?;
let socket_fd = socket.as_raw_fd();
let peer_addr = socket.peer_addr().context("get peer address")?;
@@ -393,7 +220,7 @@ struct PageServerHandler {
gate_guard: GateGuard,
}
struct TimelineHandles {
pub struct TimelineHandles {
wrapper: TenantManagerWrapper,
/// Note on size: the typical size of this map is 1. The largest size we expect
/// to see is the number of shards divided by the number of pageservers (typically < 2),

View File

@@ -1,10 +1,10 @@
# pgxs/neon/Makefile
MODULE_big = neon
OBJS = \
$(WIN32RES) \
communicator.o \
communicator_new.o \
extension_server.o \
file_cache.o \
hll.o \
@@ -22,7 +22,8 @@ OBJS = \
walproposer.o \
walproposer_pg.o \
control_plane_connector.o \
walsender_hooks.o
walsender_hooks.o \
$(LIBCOMMUNICATOR_PATH)/libcommunicator.a
PG_CPPFLAGS = -I$(libpq_srcdir)
SHLIB_LINK_INTERNAL = $(libpq)

372
pgxn/neon/communicator/Cargo.lock generated Normal file
View File

@@ -0,0 +1,372 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "addr2line"
version = "0.24.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1"
dependencies = [
"gimli",
]
[[package]]
name = "adler2"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
[[package]]
name = "backtrace"
version = "0.3.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a"
dependencies = [
"addr2line",
"cfg-if",
"libc",
"miniz_oxide",
"object",
"rustc-demangle",
"windows-targets",
]
[[package]]
name = "base64"
version = "0.22.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
[[package]]
name = "bytes"
version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "communicator"
version = "0.1.0"
dependencies = [
"tonic",
]
[[package]]
name = "fnv"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "futures-core"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
[[package]]
name = "gimli"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
[[package]]
name = "http"
version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565"
dependencies = [
"bytes",
"fnv",
"itoa",
]
[[package]]
name = "http-body"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
dependencies = [
"bytes",
"http",
]
[[package]]
name = "http-body-util"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
dependencies = [
"bytes",
"futures-core",
"http",
"http-body",
"pin-project-lite",
]
[[package]]
name = "itoa"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "libc"
version = "0.2.171"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "miniz_oxide"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff70ce3e48ae43fa075863cef62e8b43b71a4f2382229920e0df362592919430"
dependencies = [
"adler2",
]
[[package]]
name = "object"
version = "0.36.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87"
dependencies = [
"memchr",
]
[[package]]
name = "once_cell"
version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
[[package]]
name = "percent-encoding"
version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
[[package]]
name = "pin-project"
version = "1.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a"
dependencies = [
"pin-project-internal",
]
[[package]]
name = "pin-project-internal"
version = "1.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "pin-project-lite"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
[[package]]
name = "proc-macro2"
version = "1.0.94"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rustc-demangle"
version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
[[package]]
name = "syn"
version = "2.0.100"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "tokio"
version = "1.44.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6b88822cbe49de4185e3a4cbf8321dd487cf5fe0c5c65695fef6346371e9c48"
dependencies = [
"backtrace",
"pin-project-lite",
]
[[package]]
name = "tokio-stream"
version = "0.1.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047"
dependencies = [
"futures-core",
"pin-project-lite",
"tokio",
]
[[package]]
name = "tonic"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85839f0b32fd242bb3209262371d07feda6d780d16ee9d2bc88581b89da1549b"
dependencies = [
"base64",
"bytes",
"http",
"http-body",
"http-body-util",
"percent-encoding",
"pin-project",
"tokio-stream",
"tower-layer",
"tower-service",
"tracing",
]
[[package]]
name = "tower-layer"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
[[package]]
name = "tower-service"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
[[package]]
name = "tracing"
version = "0.1.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
dependencies = [
"pin-project-lite",
"tracing-attributes",
"tracing-core",
]
[[package]]
name = "tracing-attributes"
version = "0.1.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "tracing-core"
version = "0.1.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
dependencies = [
"once_cell",
]
[[package]]
name = "unicode-ident"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"

View File

@@ -0,0 +1,35 @@
[package]
name = "communicator"
version = "0.1.0"
edition = "2024"
[lib]
crate-type = ["staticlib"]
[dependencies]
bytes.workspace = true
http.workspace = true
libc.workspace = true
nix.workspace = true
atomic_enum = "0.3.0"
prost.workspace = true
tonic = { version = "0.12.0", default-features = false, features=["codegen", "prost", "transport"] }
tokio = { version = "1.43.1", features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] }
tokio-pipe = { version = "0.2.12" }
thiserror.workspace = true
tracing.workspace = true
tracing-subscriber.workspace = true
zerocopy = "0.8.0"
zerocopy-derive = "0.8.0"
tokio-epoll-uring.workspace = true
uring-common.workspace = true
pageserver_client_grpc.workspace = true
pageserver_data_api.workspace = true
neonart.workspace = true
utils.workspace = true
[build-dependencies]
cbindgen.workspace = true

View File

@@ -0,0 +1,123 @@
# Communicator
This package provides the so-called "compute-pageserver communicator",
or just "communicator" for short. It runs in a PostgreSQL server, as
part of the neon extension, and handles the communication with the
pageservers. On the PostgreSQL side, the glue code in pgxn/neon/ uses
the communicator to implement the PostgreSQL Storage Manager (SMGR)
interface.
## Design criteria
- Low latency
- Saturate a 10 Gbit/s network interface without becoming a bottleneck
## Source code view
- `pgxn/neon/communicator_new.c`: the glue that interacts with PostgreSQL code and the Rust communicator code.
- `pgxn/neon/communicator/src/backend_interface.rs`: the entry point for calls from each backend.
- `pgxn/neon/communicator/src/init.rs`: initialization at server startup.
- `pgxn/neon/communicator/src/worker_process/`: worker process main loop and glue code.
At compilation time, pgxn/neon/communicator/ produces a static
library, libcommunicator.a. It is linked to the neon.so extension
library.
The real networking code, which is independent of PostgreSQL, is in
the pageserver/client_grpc crate.
## Process view
The communicator runs in a dedicated background worker process, the
"communicator process". The communicator uses a multi-threaded Tokio
runtime to execute the IO requests. So the communicator process has
multiple threads running. That's unusual for Postgres processes and
care must be taken to make that work.
### Backend <-> worker communication
Each backend has a number of I/O request slots in shared memory. The
slots are statically allocated for each backend, and must not be
accessed by other backends. The worker process reads requests from the
shared memory slots, and writes responses back to the slots.
To submit an IO request, a backend first picks one of its free slots
and writes the details of the request into it. It then sets the slot's
'state' field to Submitted, which tells the worker process that it can
start processing the request. Once the state has been set to Submitted,
the backend *must not* access the slot anymore, until the worker
process sets its state to Completed. In other words, each slot is owned
by either the backend or the worker process at all times, and the
'state' field indicates who owns it at any given moment.
To inform the worker process that a request slot has a pending IO
request, there's a pipe shared by the worker process and all backend
processes. After you have changed the slot's state to Submitted, write
the index of the request slot to the pipe. This wakes up the worker
process.
(Note that the pipe is just used for wakeups, but the worker process
is free to pick up Submitted IO requests even without receiving the
wakeup. As of this writing, it doesn't do that, but it might be useful
in the future to reduce latency even further, for example.)
When the worker process has completed processing the request, it
writes the result back into the request slot. A GetPage request can
also contain a pointer to a buffer in the shared buffer cache; in that
case, the worker process writes the resulting page contents directly to
that buffer, and only a result code into the request slot. It then
updates the 'state' field to Completed, which passes ownership back to
the originating backend. Finally, it signals the process latch of the
originating backend, waking it up.
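A minimal, self-contained sketch of this ownership protocol (the real
implementation lives in `pgxn/neon/communicator/src/backend_comms.rs`; the
request/result payloads, the submission pipe and the process latch are only
hinted at in comments):

```rust
use std::sync::atomic::{AtomicU8, Ordering};

const IDLE: u8 = 0; // owned by the backend, free for a new request
const SUBMITTED: u8 = 1; // owned by the worker process
const COMPLETED: u8 = 2; // owned by the backend again, result is ready

struct Slot {
    state: AtomicU8,
    // request and result payloads omitted; in the real code only the
    // current owner of the slot may touch them
}

impl Slot {
    /// Backend side: fill in the request and hand the slot to the worker.
    fn submit(&self) {
        assert_eq!(self.state.load(Ordering::Relaxed), IDLE);
        // ... write the request payload while we still own the slot ...
        self.state.store(SUBMITTED, Ordering::Release);
        // ... write the slot index to the submission pipe to wake the worker ...
    }

    /// Backend side: called after each latch wakeup.
    fn poll_completion(&self) -> bool {
        if self.state.load(Ordering::Acquire) == COMPLETED {
            // ... read the result payload; we own the slot again ...
            self.state.store(IDLE, Ordering::Relaxed);
            true
        } else {
            false
        }
    }

    /// Worker side: process a submitted request and pass ownership back.
    fn complete(&self) {
        assert_eq!(self.state.load(Ordering::Acquire), SUBMITTED);
        // ... perform the IO and write the result payload ...
        self.state.store(COMPLETED, Ordering::Release);
        // ... set the originating backend's process latch ...
    }
}

fn main() {
    let slot = Slot { state: AtomicU8::new(IDLE) };
    slot.submit();
    slot.complete(); // normally runs in the communicator process
    assert!(slot.poll_completion());
}
```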
### Differences between PostgreSQL v16, v17 and v18
PostgreSQL v18 introduced the new AIO mechanism. For the communication
between AIO worker processes and backends, PostgreSQL's AIO uses a
scheme very similar to the one described in the previous section. With
our communicator, the AIO worker processes are not used, but we use the
same PgAioHandle request slots as upstream. For Neon-specific IO
requests like GetDbSize, a neon request slot is used, but for the
actual IO requests, the request slot merely contains a pointer to the
PgAioHandle slot. The worker process updates its status, calls the IO
callbacks upon completion, etc., just like the upstream AIO worker
processes do.
## Sequence diagram
(Sequence diagram, still incomplete: an smgr_read() call in the
PostgreSQL extension invokes rcommunicator_get_page_at_lsn() in
backend_interface.rs, which writes the request to a slot and calls
submit_request(); worker_process.rs picks the request up and forwards
it to the processor and on to tonic. TODO)
### Compute <-> pageserver protocol
The protocol between Compute and the pageserver is based on gRPC. See `protos/`.

View File

@@ -0,0 +1,24 @@
use cbindgen;
use std::env;
fn main() -> Result<(), Box<dyn std::error::Error>> {
let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
cbindgen::generate(crate_dir).map_or_else(
|error| match error {
cbindgen::Error::ParseSyntaxError { .. } => {
// This means there was a syntax error in the Rust sources. Don't panic, because
// we want the build to continue and the Rust compiler to hit the error. The
// Rust compiler produces a better error message than cbindgen.
eprintln!("Generating C bindings failed because of a Rust syntax error");
}
e => panic!("Unable to generate C bindings: {:?}", e),
},
|bindings| {
bindings.write_to_file("communicator_bindings.h");
},
);
Ok(())
}

View File

@@ -0,0 +1,4 @@
language = "C"
[enum]
prefix_with_name = true

View File

@@ -0,0 +1,204 @@
//! This module implements a request/response "slot" for submitting requests from backends
//! to the communicator process.
//!
//! NB: The "backend" side of this code runs in Postgres backend processes,
//! which means that it is not safe to use the 'tracing' crate for logging, nor
//! to launch threads or use tokio tasks.
use std::cell::UnsafeCell;
use std::sync::atomic::fence;
use std::sync::atomic::{AtomicI32, Ordering};
use crate::neon_request::{NeonIORequest, NeonIOResult};
use atomic_enum::atomic_enum;
/// One request/response slot. Each backend has its own set of slots that it uses.
///
/// This is the moral equivalent of PgAioHandle for Postgres AIO requests
/// Like PgAioHandle, try to keep this small.
///
/// There is an array of these in shared memory. Therefore, this must be Sized.
///
/// ## Lifecycle of a request
///
/// The slot is always owned by either the backend process or the communicator
/// process, depending on the 'state'. Only the owning process is allowed to
/// read or modify the slot, except for reading the 'state' itself to check who
/// owns it.
///
/// A slot begins in the Idle state, where it is owned by the backend process.
/// To submit a request, the backend process fills the slot with the request
/// data, and changes it to the Submitted state. After changing the state, the
/// slot is owned by the communicator process, and the backend is not allowed
/// to access it until the communicator process marks it as Completed.
///
/// When the communicator process sees that the slot is in Submitted state, it
/// starts to process the request. After processing the request, it stores the
/// result in the slot, and changes the state to Completed. It is now owned by
/// the backend process again, which may now read the result, and reuse the
/// slot for a new request.
///
/// For correctness of the above protocol, we really only need two states:
/// "owned by backend" and "owned by communicator process". But to help with
/// debugging, there are a few more states. When the backend starts to fill in
/// the request details in the slot, it first sets the state from Idle to
/// Filling, and when it's done with that, from Filling to Submitted. In the
/// Filling state, the slot is still owned by the backend. Similarly, when the
/// communicator process starts to process a request, it sets it to Processing
/// state first, but the slot is still owned by the communicator process.
///
/// This struct doesn't handle waking up the communicator process when a request
/// has been submitted or when a response is ready. We only store the 'owner_procno'
/// which can be used for waking up the backend on completion, but the wakeups are
/// performed elsewhere.
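///
/// A condensed, illustrative call sequence (the `process()` step stands in
/// for whatever the communicator does with the request):
///
/// ```ignore
/// // backend process
/// slot.fill_request(&request, my_proc_number);
/// // ... write the slot index to the submission pipe ...
///
/// // communicator process
/// let guard = slot.start_processing_request().unwrap();
/// let result = process(guard.get_request());
/// guard.completed(result);
/// // ... set the owner's process latch ...
///
/// // backend process, after its latch has been set
/// let result = slot.try_get_result();
/// ```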
pub struct NeonIOHandle {
/// similar to PgAioHandleState
state: AtomicNeonIOHandleState,
/// The owning process's ProcNumber. The worker process uses this to set the process's
/// latch on completion.
///
/// (This could be calculated from num_neon_request_slots_per_backend and the index of
/// this slot in the overall 'neon_request_slots' array)
owner_procno: AtomicI32,
/// SAFETY: This is modified by fill_request(), after it has established ownership
/// of the slot by setting state from Idle to Filling
request: UnsafeCell<NeonIORequest>,
/// valid when state is Completed
///
/// SAFETY: This is modified by RequestProcessingGuard::complete(). There can be
/// only one RequestProcessingGuard outstanding for a slot at a time, because
/// it is returned by start_processing_request() which checks the state, so
/// RequestProcessingGuard has exclusive access to the slot.
result: UnsafeCell<NeonIOResult>,
}
// The protocol described in the "Lifecycle of a request" section above ensures
// the safe access to the fields
unsafe impl Send for NeonIOHandle {}
unsafe impl Sync for NeonIOHandle {}
impl Default for NeonIOHandle {
fn default() -> NeonIOHandle {
NeonIOHandle {
owner_procno: AtomicI32::new(-1),
request: UnsafeCell::new(NeonIORequest::Empty),
result: UnsafeCell::new(NeonIOResult::Empty),
state: AtomicNeonIOHandleState::new(NeonIOHandleState::Idle),
}
}
}
#[atomic_enum]
#[derive(Eq, PartialEq)]
pub enum NeonIOHandleState {
Idle,
/// backend is filling in the request
Filling,
/// Backend has submitted the request to the communicator, but the
/// communicator process has not yet started processing it.
Submitted,
/// Communicator is processing the request
Processing,
/// Communicator has completed the request, and the 'result' field is now
/// valid, but the backend has not read the result yet.
Completed,
}
pub struct RequestProcessingGuard<'a>(&'a NeonIOHandle);
unsafe impl<'a> Send for RequestProcessingGuard<'a> {}
unsafe impl<'a> Sync for RequestProcessingGuard<'a> {}
impl<'a> RequestProcessingGuard<'a> {
pub fn get_request(&self) -> &NeonIORequest {
unsafe { &*self.0.request.get() }
}
pub fn get_owner_procno(&self) -> i32 {
self.0.owner_procno.load(Ordering::Relaxed)
}
pub fn completed(self, result: NeonIOResult) {
unsafe {
*self.0.result.get() = result;
};
// Ok, we have completed the IO. Mark the request as completed. After that,
// we no longer have ownership of the slot, and must not modify it.
let old_state = self
.0
.state
.swap(NeonIOHandleState::Completed, Ordering::Release);
assert!(old_state == NeonIOHandleState::Processing);
}
}
impl NeonIOHandle {
pub fn fill_request(&self, request: &NeonIORequest, proc_number: i32) {
// Verify that the slot is in Idle state previously, and start filling it.
//
// XXX: This step isn't strictly necessary. Assuming the caller didn't screw up
// and try to use a slot that's already in use, we could fill the slot and
// switch it directly from Idle to Submitted state.
if let Err(s) = self.state.compare_exchange(
NeonIOHandleState::Idle,
NeonIOHandleState::Filling,
Ordering::Relaxed,
Ordering::Relaxed,
) {
panic!("unexpected state in request slot: {s:?}");
}
// This fence synchronizes-with store/swap in `communicator_process_main_loop`.
fence(Ordering::Acquire);
self.owner_procno.store(proc_number, Ordering::Relaxed);
unsafe { *self.request.get() = *request }
self.state
.store(NeonIOHandleState::Submitted, Ordering::Release);
}
pub fn try_get_result(&self) -> Option<NeonIOResult> {
// FIXME: ordering?
let state = self.state.load(Ordering::Relaxed);
if state == NeonIOHandleState::Completed {
// This fence synchronizes-with store/swap in `communicator_process_main_loop`.
fence(Ordering::Acquire);
let result = unsafe { *self.result.get() };
self.state.store(NeonIOHandleState::Idle, Ordering::Relaxed);
Some(result)
} else {
None
}
}
pub fn start_processing_request<'a>(&'a self) -> Option<RequestProcessingGuard<'a>> {
// Read the IO request from the slot indicated in the wakeup
//
// XXX: using compare_exchange for this is not strictly necessary, as long as
// the communicator process has _some_ means of tracking which requests it's
// already processing. That could be a flag somewhere in communicator's private
// memory, for example.
if let Err(s) = self.state.compare_exchange(
NeonIOHandleState::Submitted,
NeonIOHandleState::Processing,
Ordering::Relaxed,
Ordering::Relaxed,
) {
// FIXME surprising state. This is unexpected at the moment, but if we
// started to process requests more aggressively, without waiting for the
// read from the pipe, then this could happen
panic!("unexpected state in request slot: {s:?}");
}
fence(Ordering::Acquire);
Some(RequestProcessingGuard(self))
}
}

View File

@@ -0,0 +1,196 @@
//! This code runs in each backend process. That means that launching Rust threads, panicking
//! etc. is forbidden!
use crate::backend_comms::NeonIOHandle;
use crate::init::CommunicatorInitStruct;
use crate::integrated_cache::{BackendCacheReadOp, IntegratedCacheReadAccess};
use crate::neon_request::CCachedGetPageVResult;
use crate::neon_request::{NeonIORequest, NeonIOResult};
pub struct CommunicatorBackendStruct<'t> {
my_proc_number: i32,
next_neon_request_idx: u32,
my_start_idx: u32, // First request slot that belongs to this backend
my_end_idx: u32, // end + 1 request slot that belongs to this backend
neon_request_slots: &'t [NeonIOHandle],
submission_pipe_write_fd: std::ffi::c_int,
pending_cache_read_op: Option<BackendCacheReadOp<'t>>,
integrated_cache: &'t IntegratedCacheReadAccess<'t>,
}
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_backend_init(
cis: Box<CommunicatorInitStruct>,
my_proc_number: i32,
) -> &'static mut CommunicatorBackendStruct<'static> {
let start_idx = my_proc_number as u32 * cis.num_neon_request_slots_per_backend;
let end_idx = start_idx + cis.num_neon_request_slots_per_backend;
let integrated_cache = Box::leak(Box::new(cis.integrated_cache_init_struct.backend_init()));
let bs: &'static mut CommunicatorBackendStruct =
Box::leak(Box::new(CommunicatorBackendStruct {
my_proc_number,
next_neon_request_idx: start_idx,
my_start_idx: start_idx,
my_end_idx: end_idx,
neon_request_slots: cis.neon_request_slots,
submission_pipe_write_fd: cis.submission_pipe_write_fd,
pending_cache_read_op: None,
integrated_cache,
}));
bs
}
/// Start a request. You can poll for its completion and get the result by
/// calling bcomm_poll_request_completion(). The communicator will wake
/// us up by setting our process latch, so to wait for the completion, wait on
/// the latch and call bcomm_poll_request_completion() every time the
/// latch is set.
///
/// Safety: The C caller must ensure that the references are valid.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_start_io_request<'t>(
bs: &'t mut CommunicatorBackendStruct,
request: &NeonIORequest,
immediate_result_ptr: &mut NeonIOResult,
) -> i32 {
assert!(bs.pending_cache_read_op.is_none());
// Check if the request can be satisfied from the cache first
if let NeonIORequest::RelSize(req) = request {
if let Some(nblocks) = bs.integrated_cache.get_rel_size(&req.reltag()) {
*immediate_result_ptr = NeonIOResult::RelSize(nblocks);
return -1;
}
}
// Create neon request and submit it
let request_idx = bs.start_neon_request(request);
// Tell the communicator about it
bs.submit_request(request_idx);
return request_idx;
}
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_start_get_page_v_request<'t>(
bs: &'t mut CommunicatorBackendStruct,
request: &NeonIORequest,
immediate_result_ptr: &mut CCachedGetPageVResult,
) -> i32 {
let NeonIORequest::GetPageV(get_pagev_request) = request else {
panic!("invalid request passed to bcomm_start_get_page_v_request()");
};
assert!(matches!(request, NeonIORequest::GetPageV(_)));
assert!(bs.pending_cache_read_op.is_none());
// Check if the request can be satisfied from the cache first
let mut all_cached = true;
let read_op = bs.integrated_cache.start_read_op();
for i in 0..get_pagev_request.nblocks {
if let Some(cache_block) = read_op.get_page(
&get_pagev_request.reltag(),
get_pagev_request.block_number + i as u32,
) {
(*immediate_result_ptr).cache_block_numbers[i as usize] = cache_block;
} else {
// not found in cache
all_cached = false;
break;
}
}
if all_cached {
bs.pending_cache_read_op = Some(read_op);
return -1;
}
// Create neon request and submit it
let request_idx = bs.start_neon_request(request);
// Tell the communicator about it
bs.submit_request(request_idx);
return request_idx;
}
/// Check if a request has completed. Returns:
///
/// -1 if the request is still being processed
/// 0 on success
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_poll_request_completion(
bs: &mut CommunicatorBackendStruct,
request_idx: u32,
result_p: &mut NeonIOResult,
) -> i32 {
match bs.neon_request_slots[request_idx as usize].try_get_result() {
None => -1, // still processing
Some(result) => {
*result_p = result;
0
}
}
}
// LFC functions
/// Finish a local file cache read.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_finish_cache_read(bs: &mut CommunicatorBackendStruct) -> bool {
if let Some(op) = bs.pending_cache_read_op.take() {
op.finish()
} else {
panic!("bcomm_finish_cache_read() called with no cached read pending");
}
}
impl<'t> CommunicatorBackendStruct<'t> {
/// Send a wakeup to the communicator process
fn submit_request(self: &CommunicatorBackendStruct<'t>, request_idx: i32) {
// wake up communicator by writing the idx to the submission pipe
//
// This can block, if the pipe is full. That should be very rare,
// because the communicator tries hard to drain the pipe to prevent
// that. Also, there's a natural upper bound on how many wakeups can be
// queued up: there is only a limited number of request slots for each
// backend.
//
// If it does block very briefly, that's not too serious.
let idxbuf = request_idx.to_ne_bytes();
let _res = nix::unistd::write(self.submission_pipe_write_fd, &idxbuf);
// FIXME: check result, return any errors
}
/// Note: there's no guarantee on when the communicator might pick it up. You should ring
/// the doorbell. But it might pick it up immediately.
pub(crate) fn start_neon_request(&mut self, request: &NeonIORequest) -> i32 {
let my_proc_number = self.my_proc_number;
// Grab next free slot
// FIXME: any guarantee that there will be any?
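// Slots are handed out round-robin within this backend's private range
// [my_start_idx, my_end_idx).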
let idx = self.next_neon_request_idx;
let next_idx = idx + 1;
self.next_neon_request_idx = if next_idx == self.my_end_idx {
self.my_start_idx
} else {
next_idx
};
self.neon_request_slots[idx as usize].fill_request(request, my_proc_number);
return idx as i32;
}
}

View File

@@ -0,0 +1,109 @@
//! Implement the "low-level" parts of the file cache.
//!
//! This module just deals with reading and writing the file, and keeping track
//! which blocks in the cache file are in use and which are free. The "high
//! level" parts of tracking which block in the cache file corresponds to which
//! relation block is handled in 'integrated_cache' instead.
//!
//! This module is only used to access the file from the communicator
//! process. The backend processes *also* read the file (and sometimes also
//! write it?), but the backends use direct C library calls for that.
use std::fs::File;
use std::path::Path;
use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};
use tokio_epoll_uring;
use crate::BLCKSZ;
pub type CacheBlock = u64;
pub struct FileCache {
uring_system: tokio_epoll_uring::SystemHandle,
file: Arc<File>,
// TODO: there's no reclamation mechanism, the cache grows
// indefinitely. This is the next free block, i.e. the current
// size of the file
next_free_block: AtomicU64,
}
impl FileCache {
pub fn new(
file_cache_path: &Path,
uring_system: tokio_epoll_uring::SystemHandle,
) -> Result<FileCache, std::io::Error> {
let file = std::fs::OpenOptions::new()
.read(true)
.write(true)
.truncate(true)
.create(true)
.open(file_cache_path)?;
tracing::info!("Created cache file {file_cache_path:?}");
Ok(FileCache {
file: Arc::new(file),
uring_system,
next_free_block: AtomicU64::new(0),
})
}
// File cache management
pub async fn read_block(
&self,
cache_block: CacheBlock,
dst: impl uring_common::buf::IoBufMut + Send + Sync,
) -> Result<(), std::io::Error> {
assert!(dst.bytes_total() == BLCKSZ);
let file = self.file.clone();
let ((_file, _buf), res) = self
.uring_system
.read(file, cache_block as u64 * BLCKSZ as u64, dst)
.await;
let res = res.map_err(map_io_uring_error)?;
if res != BLCKSZ {
panic!("unexpected read result");
}
Ok(())
}
pub async fn write_block(
&self,
cache_block: CacheBlock,
src: impl uring_common::buf::IoBuf + Send + Sync,
) -> Result<(), std::io::Error> {
assert!(src.bytes_init() == BLCKSZ);
let file = self.file.clone();
let ((_file, _buf), res) = self
.uring_system
.write(file, cache_block as u64 * BLCKSZ as u64, src)
.await;
let res = res.map_err(map_io_uring_error)?;
if res != BLCKSZ {
panic!("unexpected write result");
}
Ok(())
}
pub fn alloc_block(&self) -> CacheBlock {
self.next_free_block.fetch_add(1, Ordering::Relaxed)
}
}
fn map_io_uring_error(err: tokio_epoll_uring::Error<std::io::Error>) -> std::io::Error {
match err {
tokio_epoll_uring::Error::Op(err) => err,
tokio_epoll_uring::Error::System(err) => {
std::io::Error::new(std::io::ErrorKind::Other, err)
}
}
}

View File

@@ -0,0 +1,130 @@
//! Initialization functions. These are executed in the postmaster process,
//! at different stages of server startup.
//!
//!
//! Communicator initialization steps:
//!
//! 1. At postmaster startup, before shared memory is allocated,
//! rcommunicator_shmem_size() is called to get the amount of
//! shared memory that this module needs.
//!
//! 2. Later, after the shared memory has been allocated,
//! rcommunicator_shmem_init() is called to initialize the shmem
//! area.
//!
//! Per process initialization:
//!
//! When a backend process starts up, it calls rcommunicator_backend_init().
//! In the communicator worker process, other functions are called, see
//! `worker_process` module.
use std::ffi::c_int;
use std::mem;
use crate::backend_comms::NeonIOHandle;
use crate::integrated_cache::IntegratedCacheInitStruct;
const NUM_NEON_REQUEST_SLOTS_PER_BACKEND: u32 = 5;
/// This struct is created in the postmaster process, and inherited to
/// the communicator process and all backend processes through fork()
#[repr(C)]
pub struct CommunicatorInitStruct {
#[allow(dead_code)]
pub max_procs: u32,
pub submission_pipe_read_fd: std::ffi::c_int,
pub submission_pipe_write_fd: std::ffi::c_int,
// Shared memory data structures
pub num_neon_request_slots_per_backend: u32,
pub neon_request_slots: &'static [NeonIOHandle],
pub integrated_cache_init_struct: IntegratedCacheInitStruct<'static>,
}
impl std::fmt::Debug for CommunicatorInitStruct {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
fmt.debug_struct("CommunicatorInitStruct")
.field("max_procs", &self.max_procs)
.field("submission_pipe_read_fd", &self.submission_pipe_read_fd)
.field("submission_pipe_write_fd", &self.submission_pipe_write_fd)
.field(
"num_neon_request_slots_per_backend",
&self.num_neon_request_slots_per_backend,
)
.field("neon_request_slots length", &self.neon_request_slots.len())
.finish()
}
}
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_shmem_size(max_procs: u32) -> u64 {
let mut size = 0;
let num_neon_request_slots = max_procs * NUM_NEON_REQUEST_SLOTS_PER_BACKEND;
size += mem::size_of::<NeonIOHandle>() * num_neon_request_slots as usize;
// For integrated_cache's Allocator. TODO: make this adjustable
size += IntegratedCacheInitStruct::shmem_size(max_procs);
size as u64
}
/// Initialize the shared memory segment. Returns a backend-private
/// struct, which will be inherited by backend processes through fork
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_shmem_init(
submission_pipe_read_fd: c_int,
submission_pipe_write_fd: c_int,
max_procs: u32,
shmem_area_ptr: *mut u8,
shmem_area_len: u64,
) -> &'static mut CommunicatorInitStruct {
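// Shmem layout: alignment padding, then the array of NeonIOHandle request
// slots, then the remainder of the area is handed to the integrated cache.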
let mut ptr = shmem_area_ptr;
// Carve out the request slots from the shmem area and initialize them
let num_neon_request_slots_per_backend = NUM_NEON_REQUEST_SLOTS_PER_BACKEND;
let num_neon_request_slots = max_procs * num_neon_request_slots_per_backend;
let len_used;
let neon_request_slots: &mut [NeonIOHandle] = unsafe {
ptr = ptr.add(ptr.align_offset(std::mem::align_of::<NeonIOHandle>()));
let neon_request_slots_ptr: *mut NeonIOHandle = ptr.cast();
for _i in 0..num_neon_request_slots {
let slot: *mut NeonIOHandle = ptr.cast();
*slot = NeonIOHandle::default();
ptr = ptr.byte_add(mem::size_of::<NeonIOHandle>());
}
len_used = ptr.byte_offset_from(shmem_area_ptr) as usize;
assert!(len_used <= shmem_area_len as usize);
std::slice::from_raw_parts_mut(neon_request_slots_ptr, num_neon_request_slots as usize)
};
let remaining_area =
unsafe { std::slice::from_raw_parts_mut(ptr, shmem_area_len as usize - len_used) };
// Give the rest of the area to the integrated cache
let integrated_cache_init_struct =
IntegratedCacheInitStruct::shmem_init(max_procs, remaining_area);
eprintln!(
"PIPE READ {} WRITE {}",
submission_pipe_read_fd, submission_pipe_write_fd
);
let cis: &'static mut CommunicatorInitStruct = Box::leak(Box::new(CommunicatorInitStruct {
max_procs,
submission_pipe_read_fd,
submission_pipe_write_fd,
num_neon_request_slots_per_backend: NUM_NEON_REQUEST_SLOTS_PER_BACKEND,
neon_request_slots,
integrated_cache_init_struct,
}));
cis
}
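// Illustrative sketch (not part of this commit) of the postmaster-side call
// order described in the module comment above. A plain Vec<u8> stands in for
// the PostgreSQL shared memory segment and the pipe fds are made up; in the
// server, the segment and the pipe outlive this function.
#[allow(dead_code)]
fn postmaster_startup_sketch() {
    let max_procs = 8;
    let shmem_len = rcommunicator_shmem_size(max_procs);
    let mut shmem = vec![0u8; shmem_len as usize];
    let cis = rcommunicator_shmem_init(
        /* submission_pipe_read_fd */ 3,
        /* submission_pipe_write_fd */ 4,
        max_procs,
        shmem.as_mut_ptr(),
        shmem_len,
    );
    assert_eq!(cis.max_procs, max_procs);
}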

View File

@@ -0,0 +1,423 @@
//! Integrated communicator cache
//!
//! Tracks:
//! - Relation sizes and existence
//! - Last-written LSN
//! - TODO: Block cache (also known as LFC)
//!
//! TODO: limit the size
//! TODO: concurrency
//!
//! Note: This deals with "relations", which is really just one "relation fork" in Postgres
//! terms. RelFileLocator + ForkNumber is the key.
use utils::lsn::Lsn;
use crate::file_cache::{CacheBlock, FileCache};
use pageserver_data_api::model::RelTag;
use neonart;
use neonart::TreeInitStruct;
const CACHE_AREA_SIZE: usize = 10 * 1024 * 1024;
/// This struct is stored in the shared memory segment.
struct IntegratedCacheShmemData {
allocator: neonart::Allocator,
}
/// This struct is initialized at postmaster startup, and passed to all the processes via fork().
pub struct IntegratedCacheInitStruct<'t> {
shmem_data: &'t IntegratedCacheShmemData,
handle: TreeInitStruct<'t, TreeKey, TreeEntry>,
}
/// Represents write-access to the integrated cache. This is used by the communicator process.
pub struct IntegratedCacheWriteAccess<'t> {
cache_tree: neonart::TreeWriteAccess<'t, TreeKey, TreeEntry>,
global_lw_lsn: Lsn,
file_cache: Option<FileCache>,
}
/// Represents read-only access to the integrated cache. Backend processes have this.
pub struct IntegratedCacheReadAccess<'t> {
cache_tree: neonart::TreeReadAccess<'t, TreeKey, TreeEntry>,
}
impl<'t> IntegratedCacheInitStruct<'t> {
/// Return the desired size in bytes of the shared memory area to reserve for the integrated
/// cache.
pub fn shmem_size(_max_procs: u32) -> usize {
CACHE_AREA_SIZE
}
/// Initialize the shared memory segment. This runs once in postmaster. Returns a struct which
/// will be inherited by all processes through fork.
pub fn shmem_init(_max_procs: u32, shmem_area: &'t mut [u8]) -> IntegratedCacheInitStruct<'t> {
assert!(shmem_area.len() > std::mem::size_of::<IntegratedCacheShmemData>());
let mut ptr = shmem_area.as_mut_ptr();
let shmem_data_ptr;
let len_used;
unsafe {
ptr = ptr.byte_add(ptr.align_offset(align_of::<IntegratedCacheShmemData>()));
shmem_data_ptr = ptr.cast::<IntegratedCacheShmemData>();
ptr = ptr.byte_add(std::mem::size_of::<IntegratedCacheShmemData>());
len_used = ptr.byte_offset_from(shmem_area.as_mut_ptr()) as usize;
};
assert!(len_used < shmem_area.len());
let area_ptr = ptr;
let area_size = shmem_area.len() - len_used;
let cache_area: &mut [u8] = unsafe { std::slice::from_raw_parts_mut(area_ptr, area_size) };
let allocator = neonart::Allocator::new(cache_area);
// Initialize the shared memory area
let shmem_data = unsafe {
*shmem_data_ptr = IntegratedCacheShmemData { allocator };
&*shmem_data_ptr
};
let tree_handle = TreeInitStruct::new(&shmem_data.allocator);
IntegratedCacheInitStruct {
shmem_data,
handle: tree_handle,
}
}
pub fn worker_process_init(
self,
lsn: Lsn,
file_cache: Option<FileCache>,
) -> IntegratedCacheWriteAccess<'t> {
let IntegratedCacheInitStruct {
shmem_data: _shmem,
handle,
} = self;
let tree_writer = handle.attach_writer();
IntegratedCacheWriteAccess {
cache_tree: tree_writer,
global_lw_lsn: lsn,
file_cache,
}
}
pub fn backend_init(self) -> IntegratedCacheReadAccess<'t> {
let IntegratedCacheInitStruct {
shmem_data: _shmem,
handle,
} = self;
let tree_reader = handle.attach_reader();
IntegratedCacheReadAccess {
cache_tree: tree_reader,
}
}
}
#[derive(Clone)]
enum TreeEntry {
Rel(RelEntry),
Block(BlockEntry),
}
#[derive(Clone)]
struct BlockEntry {
lw_lsn: Lsn,
cache_block: Option<CacheBlock>,
}
#[derive(Clone, Default)]
struct RelEntry {
/// cached size of the relation
nblocks: Option<u32>,
}
#[derive(
Clone,
Debug,
PartialEq,
PartialOrd,
Eq,
Ord,
zerocopy_derive::IntoBytes,
zerocopy_derive::Immutable,
)]
#[repr(packed)]
struct TreeKey {
spc_oid: u32,
db_oid: u32,
rel_number: u32,
fork_number: u8,
block_number: u32,
}
impl From<&RelTag> for TreeKey {
fn from(val: &RelTag) -> TreeKey {
TreeKey {
spc_oid: val.spc_oid,
db_oid: val.db_oid,
rel_number: val.rel_number,
fork_number: val.fork_number,
block_number: u32::MAX,
}
}
}
impl From<(&RelTag, u32)> for TreeKey {
fn from(val: (&RelTag, u32)) -> TreeKey {
TreeKey {
spc_oid: val.0.spc_oid,
db_oid: val.0.db_oid,
rel_number: val.0.rel_number,
fork_number: val.0.fork_number,
block_number: val.1,
}
}
}
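// Sketch (illustration only, not part of this commit): a relation fork's
// metadata entry and its per-block entries live in the same tree. The
// metadata key reuses the block_number field with u32::MAX as a sentinel,
// which can never collide with a real block number.
#[allow(dead_code)]
fn key_layout_sketch(rel: &RelTag) {
    let rel_meta_key = TreeKey::from(rel); // block_number == u32::MAX
    let block_key = TreeKey::from((rel, 0u32)); // block 0 of the same fork
    assert_ne!(rel_meta_key, block_key);
}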
impl neonart::Key for TreeKey {
const KEY_LEN: usize = 4 + 4 + 4 + 1 + 4; // spc_oid + db_oid + rel_number + fork_number + block_number
fn as_bytes(&self) -> &[u8] {
zerocopy::IntoBytes::as_bytes(self)
}
}
impl neonart::Value for TreeEntry {}
/// Return type used in the cache's get_*() functions. 'Found' means that the page, or other
/// queried information, exists in the cache.
pub enum CacheResult<V> {
    /// The queried page or other information existed in the cache.
    Found(V),
    /// The cache doesn't contain the page (or other queried information, like relation size). The
    /// Lsn is the 'not_modified_since' LSN that should be used in the request to the pageserver to
    /// read the page.
    NotFound(Lsn),
}
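// Sketch of the intended calling pattern (hypothetical caller; the pageserver
// round trip is represented by a made-up stub below): on a cache miss, the
// returned LSN becomes the not_modified_since value of the pageserver request,
// and the answer is written back into the cache.
#[allow(dead_code)]
fn rel_size_sketch<'t>(cache: &'t IntegratedCacheWriteAccess<'t>, rel: &RelTag) -> u32 {
    match cache.get_rel_size(rel) {
        CacheResult::Found(nblocks) => nblocks,
        CacheResult::NotFound(not_modified_since) => {
            let nblocks = fetch_rel_size_stub(rel, not_modified_since);
            cache.remember_rel_size(rel, nblocks);
            nblocks
        }
    }
}

// Stand-in for a real pageserver request, for illustration only.
#[allow(dead_code)]
fn fetch_rel_size_stub(_rel: &RelTag, _not_modified_since: Lsn) -> u32 {
    0
}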
impl<'t> IntegratedCacheWriteAccess<'t> {
pub fn get_rel_size(&'t self, rel: &RelTag) -> CacheResult<u32> {
let r = self.cache_tree.start_read();
if let Some(nblocks) = get_rel_size(&r, rel) {
CacheResult::Found(nblocks)
} else {
CacheResult::NotFound(self.global_lw_lsn)
}
}
pub async fn get_page(
&'t self,
rel: &RelTag,
block_number: u32,
dst: impl uring_common::buf::IoBufMut + Send + Sync,
) -> Result<CacheResult<()>, std::io::Error> {
let r = self.cache_tree.start_read();
if let Some(block_tree_entry) = r.get(&TreeKey::from((rel, block_number))) {
let block_entry = if let TreeEntry::Block(e) = block_tree_entry {
e
} else {
panic!("unexpected tree entry type for block key");
};
if let Some(cache_block) = block_entry.cache_block {
self.file_cache
.as_ref()
.unwrap()
.read_block(cache_block, dst)
.await?;
Ok(CacheResult::Found(()))
} else {
Ok(CacheResult::NotFound(block_entry.lw_lsn))
}
} else {
Ok(CacheResult::NotFound(self.global_lw_lsn))
}
}
pub async fn page_is_cached(
&'t self,
rel: &RelTag,
block_number: u32,
) -> Result<CacheResult<()>, std::io::Error> {
let r = self.cache_tree.start_read();
if let Some(block_tree_entry) = r.get(&TreeKey::from((rel, block_number))) {
let block_entry = if let TreeEntry::Block(e) = block_tree_entry {
e
} else {
panic!("unexpected tree entry type for block key");
};
if let Some(_cache_block) = block_entry.cache_block {
Ok(CacheResult::Found(()))
} else {
Ok(CacheResult::NotFound(block_entry.lw_lsn))
}
} else {
Ok(CacheResult::NotFound(self.global_lw_lsn))
}
}
/// Does the relation exist? CacheResult::NotFound means that the cache doesn't contain that
/// information, i.e. we don't know if the relation exists or not.
pub fn get_rel_exists(&'t self, rel: &RelTag) -> CacheResult<bool> {
// we don't currently cache negative entries, so if the relation is in the cache, it exists
let r = self.cache_tree.start_read();
if let Some(_rel_entry) = r.get(&TreeKey::from(rel)) {
CacheResult::Found(true)
} else {
CacheResult::NotFound(self.global_lw_lsn)
}
}
pub fn get_db_size(&'t self, _db_oid: u32) -> CacheResult<u64> {
// FIXME: is this the right LSN?
CacheResult::NotFound(self.global_lw_lsn)
}
pub fn remember_rel_size(&'t self, rel: &RelTag, nblocks: u32) {
let mut w = self.cache_tree.start_write();
w.insert(
&TreeKey::from(rel),
TreeEntry::Rel(RelEntry {
nblocks: Some(nblocks),
}),
);
}
/// Remember the given page contents in the cache.
pub async fn remember_page(
&'t self,
rel: &RelTag,
block_number: u32,
src: impl uring_common::buf::IoBuf + Send + Sync,
lw_lsn: Lsn,
) {
if let Some(file_cache) = self.file_cache.as_ref() {
let mut w = self.cache_tree.start_write();
let key = TreeKey::from((rel, block_number));
let mut cache_block = None;
w.update_with_fn(&key, |existing| {
if let Some(existing) = existing {
let mut block_entry = if let TreeEntry::Block(e) = existing.clone() {
e
} else {
panic!("unexpected tree entry type for block key");
};
block_entry.lw_lsn = lw_lsn;
if block_entry.cache_block.is_none() {
block_entry.cache_block = Some(file_cache.alloc_block());
}
cache_block = block_entry.cache_block;
Some(TreeEntry::Block(block_entry))
} else {
cache_block = Some(file_cache.alloc_block());
Some(TreeEntry::Block(BlockEntry {
lw_lsn,
cache_block,
}))
}
});
let cache_block = cache_block.unwrap();
file_cache
.write_block(cache_block, src)
.await
.expect("error writing to cache");
}
}
/// Forget information about given relation in the cache. (For DROP TABLE and such)
pub fn forget_rel(&'t self, rel: &RelTag) {
// FIXME: not implemented properly. smgrexists() would still return true for this
let mut w = self.cache_tree.start_write();
w.insert(
&TreeKey::from(rel),
TreeEntry::Rel(RelEntry { nblocks: None }),
);
}
}
/// Read relation size from the cache.
///
/// This is in a separate function so that it can be shared by
/// IntegratedCacheReadAccess::get_rel_size() and IntegratedCacheWriteAccess::get_rel_size()
fn get_rel_size<'t>(r: &neonart::TreeReadGuard<TreeKey, TreeEntry>, rel: &RelTag) -> Option<u32> {
if let Some(existing) = r.get(&TreeKey::from(rel)) {
let rel_entry = if let TreeEntry::Rel(e) = existing {
e
} else {
panic!("unexpected tree entry type for rel key");
};
if let Some(nblocks) = rel_entry.nblocks {
Some(nblocks)
} else {
None
}
} else {
None
}
}
/// Accessor for other backends
///
/// This allows backends to read pages from the cache directly, on their own, without making a
/// request to the communicator process.
impl<'t> IntegratedCacheReadAccess<'t> {
pub fn get_rel_size(&'t self, rel: &RelTag) -> Option<u32> {
get_rel_size(&self.cache_tree.start_read(), rel)
}
pub fn start_read_op(&'t self) -> BackendCacheReadOp<'t> {
let r = self.cache_tree.start_read();
BackendCacheReadOp { read_guard: r }
}
}
pub struct BackendCacheReadOp<'t> {
read_guard: neonart::TreeReadGuard<'t, TreeKey, TreeEntry>,
}
impl<'e> BackendCacheReadOp<'e> {
/// Initiate a read of the page from the cache.
///
/// This returns the "cache block number", i.e. the block number within the cache file where
/// the page's contents are stored. To get the page contents, the caller needs to read that
/// block from the cache file. The BackendCacheReadOp itself acts as a guard that you must
/// hold while you perform the read. It's possible that the cache block is invalidated while
/// you are performing the read. After you have completed the read, call
/// BackendCacheReadOp::finish() to check whether the read was in fact valid. If it was
/// concurrently invalidated, you need to retry.
pub fn get_page(&self, rel: &RelTag, block_number: u32) -> Option<u64> {
if let Some(block_tree_entry) = self.read_guard.get(&TreeKey::from((rel, block_number))) {
let block_entry = if let TreeEntry::Block(e) = block_tree_entry {
e
} else {
panic!("unexpected tree entry type for block key");
};
block_entry.cache_block
} else {
None
}
}
pub fn finish(self) -> bool {
// TODO: currently, we use a spinlock to protect the in-memory tree, so concurrent
// invalidations are not possible. But the plan is to switch to optimistic locking,
// and once we do that, this would return 'false' if the optimistic locking failed and
// you need to retry.
true
}
}
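// Sketch of the retry protocol described in the get_page() comment above
// (hypothetical backend-side caller; the actual read of the cache file is done
// by the C code with the backend's own file descriptor, represented here by a
// made-up stub).
#[allow(dead_code)]
fn backend_read_sketch<'t>(
    access: &'t IntegratedCacheReadAccess<'t>,
    rel: &RelTag,
    block_number: u32,
    dst: &mut [u8; crate::BLCKSZ],
) -> bool {
    loop {
        let op = access.start_read_op();
        let Some(cache_block) = op.get_page(rel, block_number) else {
            // Not in the cache at all; the caller must go through the communicator.
            return false;
        };
        read_cache_file_block_stub(cache_block, dst);
        if op.finish() {
            return true; // the read was not concurrently invalidated
        }
        // The entry changed while we were reading; retry with a fresh read op.
    }
}

// Stand-in for reading the given block from the cache file, illustration only.
#[allow(dead_code)]
fn read_cache_file_block_stub(_cache_block: u64, _dst: &mut [u8; crate::BLCKSZ]) {}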

View File

@@ -0,0 +1,25 @@
//!
//! Three main parts:
//! - the async tokio communicator core, which receives requests and processes them
//! - the main loop and request queues, which route requests from backends to the core
//! - the per-backend glue code, which submits requests
//!
mod backend_comms;
// mark this 'pub', because these functions are called from C code. Otherwise, the compiler
// complains about a bunch of structs and enum variants being unused, because it thinks
// the functions that use them are never called. There are some C-callable functions in
// other modules too, but marking this as pub is currently enough to silence the warnings
//
// TODO: perhaps collect *all* the extern "C" functions to one module?
pub mod backend_interface;
mod file_cache;
mod init;
mod integrated_cache;
mod neon_request;
mod worker_process;
// FIXME get this from postgres headers somehow
pub const BLCKSZ: usize = 8192;

View File

@@ -0,0 +1,346 @@
type CLsn = u64;
type COid = u32;
// This conveniently matches PG_IOV_MAX
pub const MAX_GETPAGEV_PAGES: usize = 32;
use pageserver_data_api::model;
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub enum NeonIORequest {
Empty,
// Read requests. These are C-friendly variants of the corresponding structs in
// pageserver_data_api::model.
RelExists(CRelExistsRequest),
RelSize(CRelSizeRequest),
GetPageV(CGetPageVRequest),
PrefetchV(CPrefetchVRequest),
DbSize(CDbSizeRequest),
// Write requests. These are needed to keep the relation size cache and LFC up-to-date.
// They are not sent to the pageserver.
WritePage(CWritePageRequest),
RelExtend(CRelExtendRequest),
RelZeroExtend(CRelZeroExtendRequest),
RelCreate(CRelCreateRequest),
RelTruncate(CRelTruncateRequest),
RelUnlink(CRelUnlinkRequest),
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub enum NeonIOResult {
Empty,
RelExists(bool),
RelSize(u32),
/// the result pages are written to the shared memory addresses given in the request
GetPageV,
/// A prefetch request returns as soon as the request has been received by the communicator.
/// It is processed in the background.
PrefetchVLaunched,
DbSize(u64),
// FIXME design compact error codes. Can't easily pass a string or other dynamic data.
// currently, this is 'errno'
Error(i32),
Aborted,
/// used for all write requests
WriteOK,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CCachedGetPageVResult {
pub cache_block_numbers: [u64; MAX_GETPAGEV_PAGES],
}
/// ShmemBuf represents a buffer in shared memory.
///
/// SAFETY: The pointer must point to an area in shared memory. The functions allow you to liberally
/// get a mutable pointer to the contents; it is the caller's responsibility to ensure that you
/// don't access a buffer that you're not allowed to. Inappropriate access to the buffer doesn't
/// violate Rust's safety semantics, but it will mess up and crash Postgres.
///
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct ShmemBuf {
// These fields define where the result is written. Must point into a buffer in shared memory!
pub ptr: *mut u8,
}
unsafe impl Send for ShmemBuf {}
unsafe impl Sync for ShmemBuf {}
unsafe impl uring_common::buf::IoBuf for ShmemBuf {
fn stable_ptr(&self) -> *const u8 {
self.ptr
}
fn bytes_init(&self) -> usize {
crate::BLCKSZ
}
fn bytes_total(&self) -> usize {
crate::BLCKSZ
}
}
unsafe impl uring_common::buf::IoBufMut for ShmemBuf {
fn stable_mut_ptr(&mut self) -> *mut u8 {
self.ptr
}
unsafe fn set_init(&mut self, pos: usize) {
if pos > crate::BLCKSZ as usize {
panic!(
"set_init called past end of buffer, pos {}, buffer size {}",
pos,
crate::BLCKSZ
);
}
}
}
impl ShmemBuf {
pub fn as_mut_ptr(&self) -> *mut u8 {
self.ptr
}
}
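// Sketch (not part of this commit) of why ShmemBuf implements the io_uring
// buffer traits: a page buffer that lives in Postgres shared memory can be
// handed directly to the io_uring-based file cache without an intermediate
// copy. This mirrors how the worker process calls FileCache::read_block()
// elsewhere in this commit; the exact return type of read_block() is assumed.
#[allow(dead_code)]
async fn read_into_shared_buffer_sketch(
    file_cache: &crate::file_cache::FileCache,
    cache_block: crate::file_cache::CacheBlock,
    page_ptr: *mut u8, // must point to a BLCKSZ-sized buffer in shared memory
) -> Result<(), std::io::Error> {
    let dst = ShmemBuf { ptr: page_ptr };
    file_cache.read_block(cache_block, dst).await
}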
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelExistsRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelSizeRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CGetPageVRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u8,
// These fields define where the result is written. Must point into a buffer in shared memory!
pub dest: [ShmemBuf; MAX_GETPAGEV_PAGES],
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CPrefetchVRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u8,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CDbSizeRequest {
pub db_oid: COid,
pub request_lsn: CLsn,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CWritePageRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub lsn: CLsn,
// These fields define where the result is written. Must point into a buffer in shared memory!
pub src: ShmemBuf,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelExtendRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub lsn: CLsn,
// These fields define page contents. Must point into a buffer in shared memory!
pub src_ptr: usize,
pub src_size: u32,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelZeroExtendRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u32,
pub lsn: CLsn,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelCreateRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelTruncateRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub nblocks: u32,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelUnlinkRequest {
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u32,
}
impl CRelExistsRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
impl CRelSizeRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
impl CGetPageVRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
impl CPrefetchVRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
impl CWritePageRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
impl CRelExtendRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
impl CRelZeroExtendRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
impl CRelCreateRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
impl CRelTruncateRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
impl CRelUnlinkRequest {
pub fn reltag(&self) -> model::RelTag {
model::RelTag {
spc_oid: self.spc_oid,
db_oid: self.db_oid,
rel_number: self.rel_number,
fork_number: self.fork_number,
}
}
}
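// Small sketch (illustration only): the request structs flatten the relation
// identity into plain C-compatible fields, and reltag() reassembles the
// pageserver_data_api model type. The OID values below are made up.
#[allow(dead_code)]
fn reltag_roundtrip_sketch() {
    let req = CRelSizeRequest {
        spc_oid: 1663,
        db_oid: 5,
        rel_number: 16384,
        fork_number: 0,
    };
    let rel: model::RelTag = req.reltag();
    assert_eq!(rel.rel_number, req.rel_number);
}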

View File

@@ -0,0 +1,28 @@
//! C callbacks to PostgreSQL facilities that the neon extension needs
//! to provide. These are implemented in `neon/pgxn/communicator_new.c`.
//! The function signatures had better match!
//!
//! These are called from the communicator threads! Careful what you do, most
//! Postgres functions are not safe to call in that context.
use utils::lsn::Lsn;
unsafe extern "C" {
pub fn notify_proc_unsafe(procno: std::ffi::c_int);
pub fn callback_set_my_latch_unsafe();
pub fn callback_get_request_lsn_unsafe() -> u64;
}
// safe wrappers
pub(super) fn notify_proc(procno: std::ffi::c_int) {
unsafe { notify_proc_unsafe(procno) };
}
pub(super) fn callback_set_my_latch() {
unsafe { callback_set_my_latch_unsafe() };
}
pub(super) fn get_request_lsn() -> Lsn {
Lsn(unsafe { callback_get_request_lsn_unsafe() })
}

View File

@@ -0,0 +1,229 @@
//! Glue code to hook up Rust logging, with the `tracing` crate, to the PostgreSQL log
//!
//! In the Rust threads, the log messages are written to a mpsc Channel, and the Postgres
//! process latch is raised. That wakes up the loop in the main thread. It reads the
//! message from the channel and ereport()s it. This ensures that only one thread, the main
//! thread, calls the PostgreSQL logging routines at any time.
use std::sync::mpsc::sync_channel;
use std::sync::mpsc::{Receiver, SyncSender};
use std::sync::mpsc::{TryRecvError, TrySendError};
use tracing::info;
use tracing::{Event, Level, Metadata, Subscriber};
use tracing_subscriber::filter::LevelFilter;
use tracing_subscriber::fmt::FmtContext;
use tracing_subscriber::fmt::FormatEvent;
use tracing_subscriber::fmt::FormatFields;
use tracing_subscriber::fmt::FormattedFields;
use tracing_subscriber::fmt::MakeWriter;
use tracing_subscriber::fmt::format::Writer;
use tracing_subscriber::registry::LookupSpan;
use crate::worker_process::callbacks::callback_set_my_latch;
pub struct LoggingState {
receiver: Receiver<FormattedEventWithMeta>,
}
/// Called once, at worker process startup. The returned LoggingState is passed back
/// in the subsequent calls to `pump_logging`. It is opaque to the C code.
#[unsafe(no_mangle)]
pub extern "C" fn configure_logging() -> Box<LoggingState> {
let (sender, receiver) = sync_channel(1000);
let maker = Maker { channel: sender };
use tracing_subscriber::prelude::*;
let r = tracing_subscriber::registry();
let r = r.with(
tracing_subscriber::fmt::layer()
.event_format(SimpleFormatter::new())
.with_writer(maker)
// TODO: derive this from log_min_messages?
.with_filter(LevelFilter::from_level(Level::INFO)),
);
r.init();
info!("communicator process logging started");
let state = LoggingState { receiver };
Box::new(state)
}
/// Read one message from the logging queue. This is essentially a wrapper to Receiver,
/// with a C-friendly signature.
///
/// The message is copied into *errbuf, which is a caller-supplied buffer of size `errbuf_len`.
/// If the message doesn't fit in the buffer, it is truncated. It is always NULL-terminated.
///
/// The error level is returned *elevel_p. It's one of the PostgreSQL error levels, see elog.h
#[unsafe(no_mangle)]
pub extern "C" fn pump_logging(
state: &mut LoggingState,
errbuf: *mut u8,
errbuf_len: u32,
elevel_p: &mut i32,
) -> i32 {
let msg = match state.receiver.try_recv() {
Err(TryRecvError::Empty) => return 0,
Err(TryRecvError::Disconnected) => return -1,
Ok(msg) => msg,
};
let src: &[u8] = &msg.message;
let dst = errbuf;
let len = std::cmp::min(src.len(), errbuf_len as usize - 1);
unsafe {
std::ptr::copy_nonoverlapping(src.as_ptr(), dst, len);
*(errbuf.add(len)) = b'\0'; // NULL terminator
}
// XXX: these levels are copied from PostgreSQL's elog.h. Introduce another enum
// to hide these?
*elevel_p = match msg.level {
Level::TRACE => 10, // DEBUG5
Level::DEBUG => 14, // DEBUG1
Level::INFO => 17, // INFO
Level::WARN => 19, // WARNING
Level::ERROR => 21, // ERROR
};
1
}
//---- The following functions can be called from any thread ----
#[derive(Clone)]
struct FormattedEventWithMeta {
message: Vec<u8>,
level: tracing::Level,
}
impl Default for FormattedEventWithMeta {
fn default() -> Self {
FormattedEventWithMeta {
message: Vec::new(),
level: tracing::Level::DEBUG,
}
}
}
struct EventBuilder<'a> {
event: FormattedEventWithMeta,
maker: &'a Maker,
}
impl<'a> std::io::Write for EventBuilder<'a> {
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
self.event.message.write(buf)
}
fn flush(&mut self) -> std::io::Result<()> {
self.maker.send_event(self.event.clone());
Ok(())
}
}
impl<'a> Drop for EventBuilder<'a> {
fn drop(&mut self) {
let maker = self.maker;
let event = std::mem::take(&mut self.event);
maker.send_event(event);
}
}
struct Maker {
channel: SyncSender<FormattedEventWithMeta>,
}
impl<'a> MakeWriter<'a> for Maker {
type Writer = EventBuilder<'a>;
fn make_writer(&'a self) -> Self::Writer {
panic!("not expected to be called when make_writer_for is implemented");
}
fn make_writer_for(&'a self, meta: &Metadata<'_>) -> Self::Writer {
EventBuilder {
event: FormattedEventWithMeta {
message: Vec::new(),
level: *meta.level(),
},
maker: self,
}
}
}
impl Maker {
fn send_event(&self, e: FormattedEventWithMeta) {
match self.channel.try_send(e) {
Ok(()) => {
// notify the main thread
callback_set_my_latch();
}
Err(TrySendError::Disconnected(_)) => {}
Err(TrySendError::Full(_)) => {
// TODO: record that some messages were lost
}
}
}
}
/// Simple formatter implementation for tracing_subscriber, which prints the log
/// spans and the message part like the default formatter, but without a timestamp or
/// error level. The error level is captured separately in `FormattedEventWithMeta`,
/// and when the message is printed by the main thread with PostgreSQL ereport(),
/// it gets a timestamp at that point. (The printed timestamp will therefore lag
/// behind the timestamp of the event here, if the main thread doesn't process
/// the log message promptly.)
struct SimpleFormatter;
impl<S, N> FormatEvent<S, N> for SimpleFormatter
where
S: Subscriber + for<'a> LookupSpan<'a>,
N: for<'a> FormatFields<'a> + 'static,
{
fn format_event(
&self,
ctx: &FmtContext<'_, S, N>,
mut writer: Writer<'_>,
event: &Event<'_>,
) -> std::fmt::Result {
// Format all the spans in the event's span context.
if let Some(scope) = ctx.event_scope() {
for span in scope.from_root() {
write!(writer, "{}", span.name())?;
// `FormattedFields` is a formatted representation of the span's
// fields, which is stored in its extensions by the `fmt` layer's
// `new_span` method. The fields will have been formatted
// by the same field formatter that's provided to the event
// formatter in the `FmtContext`.
let ext = span.extensions();
let fields = &ext
.get::<FormattedFields<N>>()
.expect("will never be `None`");
// Skip formatting the fields if the span had no fields.
if !fields.is_empty() {
write!(writer, "{{{}}}", fields)?;
}
write!(writer, ": ")?;
}
}
// Write fields on the event
ctx.field_format().format_fields(writer.by_ref(), event)?;
writeln!(writer)
}
}
impl SimpleFormatter {
fn new() -> Self {
SimpleFormatter {}
}
}

View File

@@ -0,0 +1,384 @@
use std::collections::HashMap;
use std::path::PathBuf;
use crate::backend_comms::NeonIOHandle;
use crate::file_cache::FileCache;
use crate::init::CommunicatorInitStruct;
use crate::integrated_cache::{CacheResult, IntegratedCacheWriteAccess};
use crate::neon_request::{CGetPageVRequest, CPrefetchVRequest};
use crate::neon_request::{NeonIORequest, NeonIOResult};
use pageserver_client_grpc::PageserverClient;
use pageserver_data_api::model;
use tokio::io::AsyncReadExt;
use tokio_epoll_uring::IoBuf;
use tokio_pipe::PipeRead;
use super::callbacks::{get_request_lsn, notify_proc};
use tracing::{error, info, trace};
use utils::lsn::Lsn;
pub struct CommunicatorWorkerProcessStruct<'a> {
neon_request_slots: &'a [NeonIOHandle],
pageserver_client: PageserverClient,
cache: IntegratedCacheWriteAccess<'a>,
submission_pipe_read_raw_fd: i32,
}
pub(super) async fn init(
cis: Box<CommunicatorInitStruct>,
tenant_id: String,
timeline_id: String,
auth_token: Option<String>,
shard_map: HashMap<u16, String>,
_file_cache_size: u64,
file_cache_path: Option<PathBuf>,
) -> CommunicatorWorkerProcessStruct<'static> {
let last_lsn = get_request_lsn();
let uring_system = tokio_epoll_uring::System::launch().await.unwrap();
let file_cache = if let Some(path) = file_cache_path {
Some(FileCache::new(&path, uring_system).expect("could not create cache file"))
} else {
// FIXME: temporarily for testing, use LFC even if disabled
Some(
FileCache::new(&PathBuf::from("new_filecache"), uring_system)
.expect("could not create cache file"),
)
};
// Initialize subsystems
let cache = cis
.integrated_cache_init_struct
.worker_process_init(last_lsn, file_cache);
let pageserver_client = PageserverClient::new(&tenant_id, &timeline_id, &auth_token, shard_map);
let this = CommunicatorWorkerProcessStruct {
neon_request_slots: cis.neon_request_slots,
pageserver_client,
cache,
submission_pipe_read_raw_fd: cis.submission_pipe_read_fd,
};
this
}
impl<'t> CommunicatorWorkerProcessStruct<'t> {
/// Main loop of the worker process. Receive requests from the backends and process them.
pub(super) async fn run(self: &'static Self) {
let mut idxbuf: [u8; 4] = [0; 4];
let mut submission_pipe_read =
PipeRead::from_raw_fd_checked(self.submission_pipe_read_raw_fd)
.expect("invalid pipe fd");
loop {
// Wait for a backend to ring the doorbell
match submission_pipe_read.read(&mut idxbuf).await {
Ok(4) => {}
Ok(nbytes) => panic!("short read ({nbytes} bytes) on communicator pipe"),
Err(e) => panic!("error reading from communicator pipe: {e}"),
}
let request_idx = u32::from_ne_bytes(idxbuf);
// Read the IO request from the slot indicated in the wakeup
let Some(slot) =
self.neon_request_slots[request_idx as usize].start_processing_request()
else {
// This currently should not happen. But if we have multiple threads picking up
// requests, and without waiting for the notifications, it could.
panic!("no request in slot");
};
// Ok, we have ownership of this request now. We must process
// it now, there's no going back.
//trace!("processing request {request_idx}: {request:?}");
// Spawn a separate task for every request. That's a little excessive for requests that
// can be quickly satisfied from the cache, but we expect that to be rare, because the
// requesting backend would have already checked the cache.
tokio::spawn(async {
let result = self.handle_request(slot.get_request()).await;
let owner_procno = slot.get_owner_procno();
// Ok, we have completed the IO. Mark the request as completed. After that,
// we no longer have ownership of the slot, and must not modify it.
slot.completed(result);
// Notify the backend about the completion. (Note that the backend might see
// the completed status even before this; this is just a wakeup)
notify_proc(owner_procno);
});
}
}
fn request_common(&self, not_modified_since_lsn: Lsn) -> model::RequestCommon {
model::RequestCommon {
request_lsn: get_request_lsn(),
not_modified_since_lsn,
}
}
async fn handle_request<'x>(self: &'static Self, req: &'x NeonIORequest) -> NeonIOResult {
match req {
NeonIORequest::Empty => {
error!("unexpected Empty IO request");
NeonIOResult::Error(-1)
}
NeonIORequest::RelExists(req) => {
let rel = req.reltag();
let not_modified_since = match self.cache.get_rel_exists(&rel) {
CacheResult::Found(exists) => return NeonIOResult::RelExists(exists),
CacheResult::NotFound(lsn) => lsn,
};
match self
.pageserver_client
.process_rel_exists_request(&model::RelExistsRequest {
common: self.request_common(not_modified_since),
rel,
})
.await
{
Ok(exists) => NeonIOResult::RelExists(exists),
Err(err) => {
info!("tonic error: {err:?}");
NeonIOResult::Error(-1)
}
}
}
NeonIORequest::RelSize(req) => {
let rel = req.reltag();
// Check the cache first
let not_modified_since = match self.cache.get_rel_size(&rel) {
CacheResult::Found(nblocks) => {
tracing::trace!("found relsize for {:?} in cache: {}", rel, nblocks);
return NeonIOResult::RelSize(nblocks);
}
CacheResult::NotFound(lsn) => lsn,
};
let common = self.request_common(not_modified_since);
match self
.pageserver_client
.process_rel_size_request(&model::RelSizeRequest {
common: common.clone(),
rel: rel.clone(),
})
.await
{
Ok(nblocks) => {
// update the cache
tracing::info!("updated relsize for {:?} in cache: {}", rel, nblocks);
self.cache.remember_rel_size(&rel, nblocks);
NeonIOResult::RelSize(nblocks)
}
Err(err) => {
info!("tonic error: {err:?}");
NeonIOResult::Error(-1)
}
}
}
NeonIORequest::GetPageV(req) => match self.handle_get_pagev_request(req).await {
Ok(()) => NeonIOResult::GetPageV,
Err(errno) => NeonIOResult::Error(errno),
},
NeonIORequest::PrefetchV(req) => {
let req = req.clone();
tokio::spawn(async move { self.handle_prefetchv_request(&req).await });
NeonIOResult::PrefetchVLaunched
}
NeonIORequest::DbSize(req) => {
// Check the cache first
let not_modified_since = match self.cache.get_db_size(req.db_oid) {
CacheResult::Found(db_size) => {
// the cached database size can be returned directly
return NeonIOResult::DbSize(db_size);
}
CacheResult::NotFound(lsn) => lsn,
};
match self
.pageserver_client
.process_dbsize_request(&model::DbSizeRequest {
common: self.request_common(not_modified_since),
db_oid: req.db_oid,
})
.await
{
Ok(db_size) => NeonIOResult::DbSize(db_size),
Err(err) => {
info!("tonic error: {err:?}");
NeonIOResult::Error(-1)
}
}
}
// Write requests
NeonIORequest::WritePage(req) => {
// Also store it in the LFC while we still have it
let rel = req.reltag();
self.cache
.remember_page(&rel, req.block_number, req.src, Lsn(req.lsn))
.await;
NeonIOResult::WriteOK
}
NeonIORequest::RelExtend(req) => {
self.cache
.remember_rel_size(&req.reltag(), req.block_number + 1);
NeonIOResult::WriteOK
}
NeonIORequest::RelZeroExtend(req) => {
self.cache
.remember_rel_size(&req.reltag(), req.block_number + req.nblocks);
NeonIOResult::WriteOK
}
NeonIORequest::RelCreate(req) => {
self.cache.remember_rel_size(&req.reltag(), 0);
NeonIOResult::WriteOK
}
NeonIORequest::RelTruncate(req) => {
self.cache.remember_rel_size(&req.reltag(), req.nblocks);
NeonIOResult::WriteOK
}
NeonIORequest::RelUnlink(req) => {
self.cache.forget_rel(&req.reltag());
NeonIOResult::WriteOK
}
}
}
async fn handle_get_pagev_request(&'t self, req: &CGetPageVRequest) -> Result<(), i32> {
let rel = req.reltag();
// Check the cache first
let mut cache_misses = Vec::new();
for i in 0..req.nblocks {
let blkno = req.block_number + i as u32;
let dest = req.dest[i as usize];
let not_modified_since = match self.cache.get_page(&rel, blkno, dest).await {
Ok(CacheResult::Found(_)) => {
// get_page already copied the block content to the destination
trace!("found blk {} in rel {:?} in LFC ", blkno, rel);
continue;
}
Ok(CacheResult::NotFound(lsn)) => lsn,
Err(_io_error) => return Err(-1), // FIXME errno?
};
cache_misses.push((blkno, not_modified_since, dest));
}
if cache_misses.is_empty() {
return Ok(());
}
let not_modified_since = cache_misses
.iter()
.map(|(_blkno, lsn, _dest)| *lsn)
.max()
.unwrap();
// TODO: Use batched protocol
for (blkno, _lsn, dest) in cache_misses.iter() {
match self
.pageserver_client
.get_page(&model::GetPageRequest {
common: self.request_common(not_modified_since),
rel: rel.clone(),
block_number: *blkno,
})
.await
{
Ok(page_image) => {
// Write the received page image directly to the shared memory location
// that the backend requested.
let src: &[u8] = page_image.as_ref();
let len = std::cmp::min(src.len(), dest.bytes_total() as usize);
unsafe {
std::ptr::copy_nonoverlapping(src.as_ptr(), dest.as_mut_ptr(), len);
};
trace!("remembering blk {} in rel {:?} in LFC", blkno, rel);
// Also store it in the LFC while we have it
self.cache
.remember_page(&rel, *blkno, page_image, not_modified_since)
.await;
}
Err(err) => {
info!("tonic error: {err:?}");
return Err(-1);
}
}
}
Ok(())
}
async fn handle_prefetchv_request(
self: &'static Self,
req: &CPrefetchVRequest,
) -> Result<(), i32> {
let rel = req.reltag();
// Check the cache first
let mut cache_misses = Vec::new();
for i in 0..req.nblocks {
let blkno = req.block_number + i as u32;
let not_modified_since = match self.cache.page_is_cached(&rel, blkno).await {
Ok(CacheResult::Found(_)) => {
trace!("found blk {} in rel {:?} in LFC", blkno, rel);
continue;
}
Ok(CacheResult::NotFound(lsn)) => lsn,
Err(_io_error) => return Err(-1), // FIXME errno?
};
cache_misses.push((blkno, not_modified_since));
}
if cache_misses.is_empty() {
return Ok(());
}
let not_modified_since = cache_misses.iter().map(|(_blkno, lsn)| *lsn).max().unwrap();
// TODO: spawn separate tasks for these. Use the integrated cache to keep track of the
// in-flight requests
// TODO: Use batched protocol
for (blkno, _lsn) in cache_misses.iter() {
match self
.pageserver_client
.get_page(&model::GetPageRequest {
common: self.request_common(not_modified_since),
rel: rel.clone(),
block_number: *blkno,
})
.await
{
Ok(page_image) => {
trace!(
"prefetch completed, remembering blk {} in rel {:?} in LFC",
*blkno, rel
);
self.cache
.remember_page(&rel, *blkno, page_image, not_modified_since)
.await;
}
Err(err) => {
info!("tonic error: {err:?}");
return Err(-1);
}
}
}
Ok(())
}
}
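// Backend-side counterpart (sketch only; the real submission path lives in
// backend_interface.rs and the C glue, not shown here): after filling a
// request slot, a backend "rings the doorbell" by writing the slot index as
// four native-endian bytes to the submission pipe, which pairs with the
// `idxbuf` read in run() above.
#[allow(dead_code)]
fn ring_doorbell_sketch(submission_pipe_write_fd: std::os::fd::RawFd, slot_idx: u32) -> std::io::Result<()> {
    use std::io::Write;
    use std::os::fd::FromRawFd;
    // Wrap the raw fd just for this write; mem::forget keeps the shared fd
    // open (a sketch shortcut instead of proper BorrowedFd handling).
    let mut pipe = unsafe { std::fs::File::from_raw_fd(submission_pipe_write_fd) };
    let res = pipe.write_all(&slot_idx.to_ne_bytes());
    std::mem::forget(pipe);
    res
}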

View File

@@ -0,0 +1,11 @@
//! This code runs in the communicator worker process. This provides
//! the glue code to:
//!
//! - launch the 'processor',
//! - receive IO requests from backends and pass them to the processor,
//! - write results back to backends.
mod callbacks;
mod logging;
mod main_loop;
mod worker_interface;

View File

@@ -0,0 +1,93 @@
//! Functions called from the C code in the worker process
use std::collections::HashMap;
use std::ffi::{CStr, c_char};
use std::path::PathBuf;
use tracing::error;
use crate::init::CommunicatorInitStruct;
use crate::worker_process::main_loop;
/// Launch the communicator's tokio tasks, which do most of the work.
///
/// The caller has initialized the process as a regular PostgreSQL
/// background worker process. The shared memory segment used to
/// communicate with the backends has been allocated and initialized
/// earlier, at postmaster startup, in rcommunicator_shmem_init().
#[unsafe(no_mangle)]
pub extern "C" fn communicator_worker_process_launch(
cis: Box<CommunicatorInitStruct>,
tenant_id: *const c_char,
timeline_id: *const c_char,
auth_token: *const c_char,
shard_map: *mut *mut c_char,
nshards: u32,
file_cache_path: *const c_char,
file_cache_size: u64,
) {
// Convert the arguments into more convenient Rust types
let tenant_id = unsafe { CStr::from_ptr(tenant_id) }.to_str().unwrap();
let timeline_id = unsafe { CStr::from_ptr(timeline_id) }.to_str().unwrap();
let auth_token = {
if auth_token.is_null() {
None
} else {
let c_str = unsafe { CStr::from_ptr(auth_token) };
Some(c_str.to_str().unwrap().to_string())
}
};
let file_cache_path = {
if file_cache_path.is_null() {
None
} else {
let c_str = unsafe { CStr::from_ptr(file_cache_path) };
Some(PathBuf::from(c_str.to_str().unwrap()))
}
};
let shard_map = parse_shard_map(nshards, shard_map);
// start main loop
let runtime = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.thread_name("communicator thread")
.build()
.unwrap();
let worker_struct = runtime.block_on(main_loop::init(
cis,
tenant_id.to_string(),
timeline_id.to_string(),
auth_token,
shard_map,
file_cache_size,
file_cache_path,
));
let worker_struct = Box::leak(Box::new(worker_struct));
let main_loop_handle = runtime.spawn(worker_struct.run());
runtime.spawn(async {
let err = main_loop_handle.await.unwrap_err();
error!("error: {err:?}");
});
// keep the runtime running after we exit this function
Box::leak(Box::new(runtime));
}
/// Convert the "shard map" from an array of C strings, indexed by shard number, to a Rust HashMap
fn parse_shard_map(nshards: u32, shard_map: *mut *mut c_char) -> HashMap<u16, String> {
let mut result: HashMap<u16, String> = HashMap::new();
let mut p = shard_map;
for i in 0..nshards {
let c_str = unsafe { CStr::from_ptr(*p) };
p = unsafe { p.add(1) };
let s = c_str.to_str().unwrap();
result.insert(i as u16, s.into());
}
result
}
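// Test-style sketch (not part of this commit) of the expected shard map
// layout: element i of the C string array is the connection string for
// shard i. The connection strings below are made up.
#[allow(dead_code)]
fn parse_shard_map_sketch() {
    use std::ffi::CString;
    let shard0 = CString::new("host=ps-0 port=6400").unwrap();
    let shard1 = CString::new("host=ps-1 port=6400").unwrap();
    let mut ptrs = [
        shard0.as_ptr() as *mut c_char,
        shard1.as_ptr() as *mut c_char,
    ];
    let map = parse_shard_map(2, ptrs.as_mut_ptr());
    assert_eq!(map.get(&0u16).map(String::as_str), Some("host=ps-0 port=6400"));
    assert_eq!(map.get(&1u16).map(String::as_str), Some("host=ps-1 port=6400"));
}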

View File

@@ -0,0 +1,953 @@
/*-------------------------------------------------------------------------
*
* communicator_new.c
* Functions for communicating with remote pageservers.
*
* This is the "new" communicator. It consists of functions that
* are called from the smgr implementation, in pagestore_smgr.c.
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/xlog.h"
#include "access/xlogdefs.h"
#if PG_VERSION_NUM >= 150000
#include "access/xlogrecovery.h"
#endif
#include "access/xlog_internal.h"
#include "access/xlogutils.h"
#include "executor/instrument.h"
#include "miscadmin.h"
#include "postmaster/bgworker.h"
#include "postmaster/interrupt.h"
#include "replication/walsender.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/latch.h"
#include "storage/procarray.h"
#if PG_VERSION_NUM >= 170000
#include "storage/procnumber.h"
#endif
#include "storage/spin.h"
#include "tcop/tcopprot.h"
#include "communicator_new.h"
#include "neon.h"
#include "neon_perf_counters.h"
#include "pagestore_client.h"
/*
* FIXME: these are in file_cache.h, but I don't want to #include that
 * here. This code shouldn't be using the C file cache for anything other than
* the GUCs.
*/
extern int lfc_size_limit;
extern char *lfc_path;
/* the rust bindings, generated by cbindgen */
#include "communicator/communicator_bindings.h"
#define MaxProcs (MaxBackends + NUM_AUXILIARY_PROCS)
static CommunicatorInitStruct *cis;
static CommunicatorBackendStruct *my_bs;
static File cache_file = 0;
typedef struct CommunicatorShmemPerBackendData
{
/*
* Latch used to notify backend of IO completion. We cannot use the
* standard process latch (MyProc->latch) because we cannot clear that
* latch as part of the IO handling, or we might cause the caller to miss
* some other events.
*/
Latch io_completion_latch;
/*
* Normally, when reading or writing pages from shared buffer cache, the
* worker process can operate directly on the shared buffer. But when
* working with a local buffer, we use this "bounce buffer" to pass the
* data to the worker process.
*
* TODO: That's slow, because it incurs an extra memory copy, and there's
* currently only one of these per backend, which means you can have only
* one such IO in progress at a time.
*/
PGIOAlignedBlock bounce_buffer;
} CommunicatorShmemPerBackendData;
typedef struct CommunicatorShmemData
{
int dummy;
CommunicatorShmemPerBackendData backends[]; /* MaxProcs */
/* rust-managed shmem area follows at next MAXALIGN boundary */
} CommunicatorShmemData;
static CommunicatorShmemData *communicator_shmem_ptr;
#define MyIOCompletionLatch (&communicator_shmem_ptr->backends[MyProcNumber].io_completion_latch)
static slock_t in_elog;
#define MAX_INFLIGHT_ASYNC_REQUESTS 5
/* request indexes of (prefetch) requests that have been started */
static int inflight_requests[MAX_INFLIGHT_ASYNC_REQUESTS];
static int num_inflight_requests = 0;
static int start_request(NeonIORequest *request, struct NeonIOResult *immediate_result_p);
static void wait_request_completion(int request_idx, struct NeonIOResult *result_p);
static void perform_request(NeonIORequest *request, struct NeonIOResult *result_p);
static void process_inflight_requests(void);
static bool bounce_needed(void *buffer);
static void *bounce_buf(void);
static void *bounce_write_if_needed(void *buffer);
PGDLLEXPORT void communicator_new_bgworker_main(Datum main_arg);
static void communicator_new_backend_exit(int code, Datum arg);
/**** Initialization functions. These run in postmaster ****/
void
pg_init_communicator_new(void)
{
BackgroundWorker bgw;
/* Initialize the background worker process */
memset(&bgw, 0, sizeof(bgw));
bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
bgw.bgw_start_time = BgWorkerStart_PostmasterStart;
snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
snprintf(bgw.bgw_function_name, BGW_MAXLEN, "communicator_new_bgworker_main");
snprintf(bgw.bgw_name, BGW_MAXLEN, "Storage communicator process");
snprintf(bgw.bgw_type, BGW_MAXLEN, "Storage communicator process");
bgw.bgw_restart_time = 5;
bgw.bgw_notify_pid = 0;
bgw.bgw_main_arg = (Datum) 0;
RegisterBackgroundWorker(&bgw);
SpinLockInit(&in_elog);
}
static size_t
communicator_new_shmem_size(void)
{
size_t size = 0;
size += MAXALIGN(
offsetof(CommunicatorShmemData, backends) +
MaxProcs * sizeof(CommunicatorShmemPerBackendData)
);
/* space needed by the rust code */
size += rcommunicator_shmem_size(MaxProcs);
return size;
}
void
communicator_new_shmem_request(void)
{
RequestAddinShmemSpace(communicator_new_shmem_size());
}
void
communicator_new_shmem_startup(void)
{
bool found;
int pipefd[2];
int rc;
size_t communicator_size;
size_t shmem_size;
void *shmem_ptr;
rc = pipe(pipefd);
if (rc != 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg_internal("could not create pipe between neon communicator and backends: %m")));
if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
elog(FATAL, "fcntl(F_SETFL) failed on read-end of communicator pipe: %m");
if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
elog(FATAL, "fcntl(F_SETFL) failed on write-end of communicator pipe: %m");
shmem_size = communicator_new_shmem_size();
shmem_ptr = ShmemInitStruct("Communicator shmem state",
shmem_size,
&found);
Assert(!found);
/* Initialize the C-managed parts */
communicator_shmem_ptr = (CommunicatorShmemData *) shmem_ptr;
communicator_size = MAXALIGN(offsetof(CommunicatorShmemData, backends) + MaxProcs * sizeof(CommunicatorShmemPerBackendData));
shmem_ptr = (char *) shmem_ptr + communicator_size;
shmem_size -= communicator_size;
for (int i = 0; i < MaxProcs; i++)
InitSharedLatch(&communicator_shmem_ptr->backends[i].io_completion_latch);
/* Initialize the rust-managed parts */
cis = rcommunicator_shmem_init(pipefd[0], pipefd[1], MaxProcs, shmem_ptr, shmem_size);
}
/**** Worker process functions. These run in the communicator worker process ****/
/* Entry point for the communicator bgworker process */
void
communicator_new_bgworker_main(Datum main_arg)
{
char **connstrs;
shardno_t num_shards;
struct LoggingState *logging;
char errbuf[1000];
int elevel;
/* Establish signal handlers. */
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
pqsignal(SIGHUP, SignalHandlerForConfigReload);
pqsignal(SIGTERM, die);
BackgroundWorkerUnblockSignals();
get_shard_map(&connstrs, &num_shards);
logging = configure_logging();
communicator_worker_process_launch(
cis,
neon_tenant,
neon_timeline,
neon_auth_token,
connstrs,
num_shards,
lfc_path,
lfc_size_limit);
cis = NULL;
elog(LOG, "communicator threads started");
for (;;)
{
int32 rc;
CHECK_FOR_INTERRUPTS();
for (;;)
{
rc = pump_logging(logging, (uint8 *) errbuf, sizeof(errbuf), &elevel);
if (rc == 0)
{
/* nothing to do */
break;
}
else if (rc == 1)
{
/* Because we don't want to exit on error */
if (elevel == ERROR)
elevel = LOG;
if (elevel == INFO)
elevel = LOG;
elog(elevel, "[COMMUNICATOR] %s", errbuf);
}
else if (rc == -1)
{
elog(ERROR, "logging channel was closed unexpectedly");
}
}
(void) WaitLatch(MyLatch,
WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
0,
PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
}
}
/*
* Callbacks from the rust code, in the communicator process.
*
* NOTE: These must be thread safe! It's very limited which PostgreSQL functions you can use!!!
*
* NOTE: the signatures of these better match the Rust definitions!
*/
void
notify_proc_unsafe(int procno)
{
SetLatch(&communicator_shmem_ptr->backends[procno].io_completion_latch);
}
void
callback_set_my_latch_unsafe(void)
{
SetLatch(MyLatch);
}
/*
* FIXME: The logic from neon_get_request_lsns() needs to go here, except for
* the last-written LSN cache stuff, which is managed by the rust code now.
*/
uint64
callback_get_request_lsn_unsafe(void)
{
/*
* NB: be very careful with what you do here! This is called from tokio
 * threads, so anything that tries to take LWLocks is unsafe, for example.
*
* RecoveryInProgress() is OK
*/
if (RecoveryInProgress())
{
XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL);
return replay_lsn;
}
else
{
XLogRecPtr flushlsn;
#if PG_VERSION_NUM >= 150000
flushlsn = GetFlushRecPtr(NULL);
#else
flushlsn = GetFlushRecPtr();
#endif
return flushlsn;
}
}
/**** Backend functions. These run in each backend ****/
/* Initialize per-backend private state */
void
communicator_new_init(void)
{
Assert(cis != NULL);
Assert(my_bs == NULL);
if (MyBgworkerEntry && strcmp(MyBgworkerEntry->bgw_function_name, "communicator_new_bgworker_main") == 0)
return;
OwnLatch(MyIOCompletionLatch);
my_bs = rcommunicator_backend_init(cis, MyProcNumber);
cis = NULL;
/*
* Arrange to clean up at backend exit.
*/
on_shmem_exit(communicator_new_backend_exit, 0);
}
static void
communicator_new_backend_exit(int code, Datum arg)
{
DisownLatch(MyIOCompletionLatch);
}
/*
* prefetch_register_bufferv() - register and prefetch buffers
*
* Register that we may want the contents of BufferTag in the near future.
* This is used when issuing a speculative prefetch request, but also when
* performing a synchronous request and need the buffer right now.
*
* When performing a prefetch rather than a synchronous request,
* is_prefetch==true. Currently, it only affects how the request is accounted
* in the perf counters.
*
* NOTE: this function may indirectly update MyPState->pfs_hash; which
* invalidates any active pointers into the hash table.
*/
void
communicator_new_prefetch_register_bufferv(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blockno, BlockNumber nblocks)
{
int request_idx;
NeonIORequest request = {
.tag = NeonIORequest_PrefetchV,
.prefetch_v = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
.block_number = blockno,
.nblocks = nblocks,
}
};
struct NeonIOResult result;
elog(LOG, "prefetch called for rel %u/%u/%u.%u block %u (%u blocks)",
RelFileInfoFmt(rinfo), forkNum, blockno, nblocks);
if (num_inflight_requests >= MAX_INFLIGHT_ASYNC_REQUESTS)
process_inflight_requests();
request_idx = bcomm_start_io_request(my_bs, &request, &result);
if (request_idx == -1)
{
/* -1 means the request was satisfied immediately. */
/* FIXME: check and log errors */
return;
}
inflight_requests[num_inflight_requests] = request_idx;
num_inflight_requests++;
elog(LOG, "sent prefetch request with idx %d", request_idx);
}
static void
process_inflight_requests(void)
{
struct NeonIOResult result;
/* FIXME: log errors */
for (int i = 0; i < num_inflight_requests; i++)
wait_request_completion(inflight_requests[i], &result);
num_inflight_requests = 0;
}
/*
* Perform an IO request in a synchronous fashion.
*
 * The result is returned in *result_p.
*/
static void
perform_request(NeonIORequest * request, struct NeonIOResult *result_p)
{
int request_idx;
process_inflight_requests();
request_idx = start_request(request, result_p);
if (request_idx == -1)
{
/* it was completed immediately */
return;
}
wait_request_completion(request_idx, result_p);
}
static int
start_request(NeonIORequest * request, struct NeonIOResult *immediate_result_p)
{
int request_idx;
request_idx = bcomm_start_io_request(my_bs, request, immediate_result_p);
if (request_idx == -1)
{
/* -1 means the request was satisfied immediately. */
return -1;
}
elog(DEBUG5, "sent request with idx %d: tag %d", request_idx, request->tag);
return request_idx;
}
static void
wait_request_completion(int request_idx, struct NeonIOResult *result_p)
{
int32_t poll_res;
/* fixme: check 'request_idx' ? */
for (;;)
{
ResetLatch(MyIOCompletionLatch);
poll_res = bcomm_poll_request_completion(my_bs, request_idx, result_p);
if (poll_res == -1)
{
CHECK_FOR_INTERRUPTS();
/*
* TODO: wake up periodically for CHECK_FOR_INTERRUPTS(). Because
* we wait on MyIOCompletionLatch rather than MyLatch, we won't be
* woken up for the standard interrupts.
*/
(void) WaitLatch(MyIOCompletionLatch,
WL_EXIT_ON_PM_DEATH | WL_LATCH_SET,
0,
WAIT_EVENT_NEON_PS_STARTING);
continue; /* still busy */
}
else if (poll_res == 0)
{
return;
}
else
{
elog(ERROR, "unexpected return code from bcomm_poll_request_completion()");
}
}
}
/*
* Does the physical file exist?
*/
bool
communicator_new_rel_exists(NRelFileInfo rinfo, ForkNumber forkNum)
{
NeonIORequest request = {
.tag = NeonIORequest_RelExists,
.rel_exists = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
}
};
NeonIOResult result;
perform_request(&request, &result);
switch (result.tag)
{
case NeonIOResult_RelExists:
return result.rel_exists;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not check existence of rel %u/%u/%u.%u: %s",
RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for RelExists operation: %d", result.tag);
break;
}
}
/*
* Read N consecutive pages from a relation
*/
void
communicator_new_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
void **buffers, BlockNumber nblocks)
{
NeonIOResult result;
CCachedGetPageVResult cached_result;
void *bounce_buf_used = NULL;
int request_idx;
NeonIORequest request = {
.tag = NeonIORequest_GetPageV,
.get_page_v = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
.block_number = blockno,
.nblocks = nblocks,
}
};
elog(LOG, "getpagev called for rel %u/%u/%u.%u block %u (%u blocks)",
RelFileInfoFmt(rinfo), forkNum, blockno, nblocks);
/* Fill in the destination buffers in the request */
if (nblocks == 1)
{
if (bounce_needed(buffers[0]))
{
bounce_buf_used = bounce_buf();
request.get_page_v.dest[0].ptr = bounce_buf_used;
}
else
request.get_page_v.dest[0].ptr = buffers[0];
}
else
{
for (int i = 0; i < nblocks; i++)
{
if (bounce_needed(buffers[i]))
{
/* Split the vector-request into single page requests */
for (int j = 0; j < nblocks; j++)
{
communicator_new_read_at_lsnv(rinfo, forkNum, blockno + j,
&buffers[j], 1);
}
return;
}
request.get_page_v.dest[i].ptr = buffers[i];
}
}
process_inflight_requests();
retry:
request_idx = bcomm_start_get_page_v_request(my_bs, &request, &cached_result);
if (request_idx == -1)
{
bool completed;
/*
* LFC hit, but we are responsible for completing the I/O on the local
* file
*/
if (cache_file == 0)
cache_file = PathNameOpenFile(lfc_path, O_RDONLY | PG_BINARY);
for (int i = 0; i < nblocks; i++)
{
uint64_t cached_block = cached_result.cache_block_numbers[i];
ssize_t bytes_total = 0;
while (bytes_total < BLCKSZ)
{
ssize_t nbytes;
nbytes = FileRead(cache_file, ((char *) buffers[i]) + bytes_total, BLCKSZ - bytes_total, cached_block * BLCKSZ + bytes_total, WAIT_EVENT_NEON_LFC_READ);
if (nbytes == -1)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read block %lu in local cache file: %m",
cached_block)));
bytes_total += nbytes;
}
}
completed = bcomm_finish_cache_read(my_bs);
if (!completed)
{
elog(DEBUG1, "read from local cache file was superseded by concurrent update");
goto retry;
}
return;
}
wait_request_completion(request_idx, &result);
switch (result.tag)
{
case NeonIOResult_GetPageV:
if (bounce_buf_used)
memcpy(buffers[0], bounce_buf_used, BLCKSZ);
return;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read block %u in rel %u/%u/%u.%u: %s",
blockno, RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for GetPage operation: %d", result.tag);
break;
}
}
/*
* neon_nblocks() -- Get the number of blocks stored in a relation.
*/
BlockNumber
communicator_new_rel_nblocks(NRelFileInfo rinfo, ForkNumber forkNum)
{
NeonIORequest request = {
.tag = NeonIORequest_RelSize,
.rel_size = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
}
};
NeonIOResult result;
perform_request(&request, &result);
switch (result.tag)
{
case NeonIOResult_RelSize:
return result.rel_size;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read size of rel %u/%u/%u.%u: %s",
RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for RelSize operation: %d", result.tag);
break;
}
}
/*
* neon_db_size() -- Get the size of the database in bytes.
*/
int64
communicator_new_dbsize(Oid dbNode)
{
NeonIORequest request = {
.tag = NeonIORequest_DbSize,
.db_size = {
.db_oid = dbNode,
}
};
NeonIOResult result;
perform_request(&request, &result);
switch (result.tag)
{
case NeonIOResult_DbSize:
return (int64) result.db_size;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read database size of database %u: %s",
dbNode, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for DbSize operation: %d", result.tag);
break;
}
}
int
communicator_new_read_slru_segment(SlruKind kind, int64 segno, void *buffer)
{
/* TODO */
elog(ERROR, "not implemented");
}
/* Write requests */
void
communicator_new_write_page(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
const void *buffer, XLogRecPtr lsn)
{
void *src = bounce_write_if_needed((void *) buffer);
NeonIORequest request = {
.tag = NeonIORequest_WritePage,
.write_page = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
.block_number = blockno,
.lsn = lsn,
.src.ptr = src,
}
};
NeonIOResult result;
perform_request(&request, &result);
switch (result.tag)
{
case NeonIOResult_WriteOK:
return;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not write block %u in rel %u/%u/%u.%u: %s",
blockno, RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for WritePage operation: %d", result.tag);
break;
}
}
void
communicator_new_rel_extend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
const void *buffer, XLogRecPtr lsn)
{
void *src = bounce_write_if_needed((void *) buffer);
NeonIORequest request = {
.tag = NeonIORequest_RelExtend,
.rel_extend = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
.block_number = blockno,
.lsn = lsn,
.src_ptr = (uintptr_t) src,
.src_size = BLCKSZ,
}
};
NeonIOResult result;
perform_request(&request, &result);
switch (result.tag)
{
case NeonIOResult_WriteOK:
return;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not extend to block %u in rel %u/%u/%u.%u: %s",
blockno, RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for Extend operation: %d", result.tag);
break;
}
}
void
communicator_new_rel_zeroextend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
BlockNumber nblocks, XLogRecPtr lsn)
{
NeonIORequest request = {
.tag = NeonIORequest_RelZeroExtend,
.rel_zero_extend = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
.block_number = blockno,
.nblocks = nblocks,
.lsn = lsn,
}
};
NeonIOResult result;
perform_request(&request, &result);
switch (result.tag)
{
case NeonIOResult_WriteOK:
return;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not zeroextend to block %u in rel %u/%u/%u.%u: %s",
blockno, RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for ZeroExtend operation: %d", result.tag);
break;
}
}
void
communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum)
{
NeonIORequest request = {
.tag = NeonIORequest_RelCreate,
.rel_create = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
}
};
NeonIOResult result;
perform_request(&request, &result);
switch (result.tag)
{
case NeonIOResult_WriteOK:
return;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not create rel %u/%u/%u.%u: %s",
RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for Create operation: %d", result.tag);
break;
}
}
void
communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
{
NeonIORequest request = {
.tag = NeonIORequest_RelTruncate,
.rel_truncate = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
.nblocks = nblocks,
}
};
NeonIOResult result;
perform_request(&request, &result);
switch (result.tag)
{
case NeonIOResult_WriteOK:
return;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not truncate rel %u/%u/%u.%u to %u blocks: %s",
RelFileInfoFmt(rinfo), forkNum, nblocks, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for Truncate operation: %d", result.tag);
break;
}
}
void
communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum)
{
NeonIORequest request = {
.tag = NeonIORequest_RelUnlink,
.rel_unlink = {
.spc_oid = NInfoGetSpcOid(rinfo),
.db_oid = NInfoGetDbOid(rinfo),
.rel_number = NInfoGetRelNumber(rinfo),
.fork_number = forkNum,
}
};
NeonIOResult result;
perform_request(&request, &result);
switch (result.tag)
{
case NeonIOResult_WriteOK:
return;
case NeonIOResult_Error:
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not unlink rel %u/%u/%u.%u: %s",
RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
break;
default:
elog(ERROR, "unexpected result for Unlink operation: %d", result.tag);
break;
}
}
/*
* The worker process can read / write shared buffers directly. But if smgrread() or
* smgrwrite() is called with a private temporary buffer, we need to copy it to the
 * "bounce buffer", to make it available for the worker process.
*/
static bool
bounce_needed(void *buffer)
{
if ((uintptr_t) buffer >= (uintptr_t) BufferBlocks &&
(uintptr_t) buffer < (uintptr_t) BufferBlocks + NBuffers * BLCKSZ)
{
return false;
}
return true;
}
static void *
bounce_buf(void)
{
return &communicator_shmem_ptr->backends[MyProcNumber].bounce_buffer;
}
static void *
bounce_write_if_needed(void *buffer)
{
void *p;
if (!bounce_needed(buffer))
return buffer;
p = bounce_buf();
memcpy(p, buffer, BLCKSZ);
return p;
}
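
/*
 * Illustration only (hypothetical caller, not part of this file): a page
 * that lives in shared buffers can be handed to the worker process as-is,
 * whereas a backend-local page, such as a stack-allocated PGIOAlignedBlock,
 * must go through the per-backend bounce buffer:
 *
 *     PGIOAlignedBlock local_page;              // backend-local memory
 *     Block shared_page = BufferGetBlock(buf);  // points into BufferBlocks
 *
 *     Assert(bounce_needed(local_page.data));   // copied via the bounce buffer
 *     Assert(!bounce_needed(shared_page));      // worker reads it directly
 */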

View File

@@ -0,0 +1,54 @@
/*-------------------------------------------------------------------------
*
* communicator_new.h
 *	  New communicator implementation
*
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#ifndef COMMUNICATOR_NEW_H
#define COMMUNICATOR_NEW_H
#include "neon_pgversioncompat.h"
#include "storage/buf_internals.h"
#include "pagestore_client.h"
/* initialization at postmaster startup */
extern void pg_init_communicator_new(void);
extern void communicator_new_shmem_request(void);
extern void communicator_new_shmem_startup(void);
/* initialization at backend startup */
extern void communicator_new_init(void);
/* Read requests */
extern bool communicator_new_rel_exists(NRelFileInfo rinfo, ForkNumber forkNum);
extern BlockNumber communicator_new_rel_nblocks(NRelFileInfo rinfo, ForkNumber forknum);
extern int64 communicator_new_dbsize(Oid dbNode);
extern void communicator_new_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber base_blockno,
void **buffers, BlockNumber nblocks);
extern void communicator_new_prefetch_register_bufferv(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blockno,
BlockNumber nblocks);
extern int communicator_new_read_slru_segment(SlruKind kind, int64 segno,
void *buffer);
/* Write requests, to keep the caches up-to-date */
extern void communicator_new_write_page(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
const void *buffer, XLogRecPtr lsn);
extern void communicator_new_rel_extend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
const void *buffer, XLogRecPtr lsn);
extern void communicator_new_rel_zeroextend(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blockno, BlockNumber nblocks,
XLogRecPtr lsn);
extern void communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum);
extern void communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks);
extern void communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum);
#endif /* COMMUNICATOR_NEW_H */

View File

@@ -164,10 +164,10 @@ static HTAB *lfc_hash;
static int lfc_desc = -1;
static LWLockId lfc_lock;
static int lfc_max_size;
static int lfc_size_limit;
int lfc_size_limit;
static int lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG;
static int lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK;
static char *lfc_path;
char *lfc_path;
static uint64 lfc_generation;
static FileCacheControl *lfc_ctl;
static shmem_startup_hook_type prev_shmem_startup_hook;

View File

@@ -15,6 +15,8 @@
/* GUCs */
extern bool lfc_store_prefetch_result;
extern int lfc_size_limit;
extern char *lfc_path;
/* functions for local file cache */
extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,

View File

@@ -279,6 +279,55 @@ AssignPageserverConnstring(const char *newval, void *extra)
}
}
/* Return a copy of the whole shard map from shared memory */
void
get_shard_map(char ***connstrs_p, shardno_t *num_shards_p)
{
uint64 begin_update_counter;
uint64 end_update_counter;
ShardMap *shard_map = &pagestore_shared->shard_map;
shardno_t num_shards;
char *buf;
char **connstrs;
buf = palloc(MAX_SHARDS*MAX_PAGESERVER_CONNSTRING_SIZE);
connstrs = palloc(sizeof(char *) * MAX_SHARDS);
/*
 * The postmaster can update the shared memory values concurrently, in
 * which case we would copy a garbled mix of the old and new values. We
 * will detect that because the counters won't match, and retry. But it's
 * important that we don't do anything within the retry loop that would
 * depend on the strings having valid contents.
*/
do
{
char *p;
begin_update_counter = pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter);
num_shards = shard_map->num_shards;
p = buf;
for (int i = 0; i < Min(num_shards, MAX_SHARDS); i++)
{
strlcpy(p, shard_map->connstring[i], MAX_PAGESERVER_CONNSTRING_SIZE);
connstrs[i] = p;
elog(LOG, "XX: connstrs[%d]: %p", i, p);
p += MAX_PAGESERVER_CONNSTRING_SIZE;
}
pg_memory_barrier();
}
while (begin_update_counter != end_update_counter
|| begin_update_counter != pg_atomic_read_u64(&pagestore_shared->begin_update_counter)
|| end_update_counter != pg_atomic_read_u64(&pagestore_shared->end_update_counter));
*connstrs_p = connstrs;
*num_shards_p = num_shards;
}
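
/*
 * For reference, a sketch of the update protocol that the retry loop above
 * relies on (hypothetical; the actual updater runs in the postmaster and is
 * not shown in this hunk):
 *
 *     pg_atomic_fetch_add_u64(&pagestore_shared->begin_update_counter, 1);
 *     pg_memory_barrier();
 *     ... overwrite shard_map->num_shards and shard_map->connstring[] ...
 *     pg_memory_barrier();
 *     pg_atomic_fetch_add_u64(&pagestore_shared->end_update_counter, 1);
 *
 * A reader that observes begin != end, or sees either counter change while
 * copying, discards its copy and retries.
 */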
/*
* Get the current number of shards, and/or the connection string for a
* particular shard from the shard map in shared memory.

View File

@@ -20,6 +20,7 @@
#include "replication/logicallauncher.h"
#include "replication/slot.h"
#include "replication/walsender.h"
#include "storage/ipc.h"
#include "storage/proc.h"
#include "funcapi.h"
#include "access/htup_details.h"
@@ -29,6 +30,7 @@
#include "utils/guc_tables.h"
#include "communicator.h"
#include "communicator_new.h"
#include "extension_server.h"
#include "file_cache.h"
#include "neon.h"
@@ -45,13 +47,17 @@ PG_MODULE_MAGIC;
void _PG_init(void);
bool neon_enable_new_communicator;
static int running_xacts_overflow_policy;
#if PG_MAJORVERSION_NUM >= 16
static shmem_startup_hook_type prev_shmem_startup_hook;
static void neon_shmem_startup_hook(void);
#if PG_VERSION_NUM>=150000
static shmem_request_hook_type prev_shmem_request_hook;
#endif
static void neon_shmem_request(void);
static void neon_shmem_startup_hook(void);
#if PG_MAJORVERSION_NUM >= 17
uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE;
uint32 WAIT_EVENT_NEON_LFC_READ;
@@ -430,17 +436,36 @@ _PG_init(void)
*/
#if PG_VERSION_NUM >= 160000
load_file("$libdir/neon_rmgr", false);
#endif
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = neon_shmem_startup_hook;
#if PG_VERSION_NUM>=150000
prev_shmem_request_hook = shmem_request_hook;
shmem_request_hook = neon_shmem_request;
#else
neon_shmem_request();
#endif
DefineCustomBoolVariable(
"neon.enable_new_communicator",
"Enables new communicator implementation",
NULL,
&neon_enable_new_communicator,
true,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
pg_init_libpagestore();
lfc_init();
pg_init_walproposer();
init_lwlsncache();
pg_init_communicator();
if (neon_enable_new_communicator)
pg_init_communicator_new();
Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
InitUnstableExtensionsSupport();
@@ -559,7 +584,17 @@ backpressure_throttling_time(PG_FUNCTION_ARGS)
PG_RETURN_UINT64(BackpressureThrottlingTime());
}
#if PG_MAJORVERSION_NUM >= 16
static void
neon_shmem_request(void)
{
#if PG_VERSION_NUM>=150000
if (prev_shmem_request_hook)
prev_shmem_request_hook();
#endif
communicator_new_shmem_request();
}
static void
neon_shmem_startup_hook(void)
{
@@ -579,5 +614,6 @@ neon_shmem_startup_hook(void)
WAIT_EVENT_NEON_PS_READ = WaitEventExtensionNew("Neon/PS_ReadIO");
WAIT_EVENT_NEON_WAL_DL = WaitEventExtensionNew("Neon/WAL_Download");
#endif
communicator_new_shmem_startup();
}
#endif

View File

@@ -13,6 +13,7 @@
#include "utils/wait_event.h"
/* GUCs */
extern bool neon_enable_new_communicator;
extern char *neon_auth_token;
extern char *neon_timeline;
extern char *neon_tenant;

View File

@@ -9,6 +9,10 @@
#include "fmgr.h"
#include "storage/buf_internals.h"
#if PG_MAJORVERSION_NUM < 16
typedef PGAlignedBlock PGIOAlignedBlock;
#endif
#if PG_MAJORVERSION_NUM < 17
#define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId)
#else
@@ -154,6 +158,10 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
#define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess())
#endif
#if PG_MAJORVERSION_NUM < 17
#define MyProcNumber (MyProc - &ProcGlobal->allProcs[0])
#endif
#if PG_MAJORVERSION_NUM < 15
extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags);
#endif

View File

@@ -228,6 +228,7 @@ extern char *neon_tenant;
extern int32 max_cluster_size;
extern int neon_protocol_version;
extern void get_shard_map(char ***connstrs_p, shardno_t *num_shards_p);
extern shardno_t get_shard_number(BufferTag* tag);
extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo);

View File

@@ -62,6 +62,7 @@
#include "bitmap.h"
#include "communicator.h"
#include "communicator_new.h"
#include "file_cache.h"
#include "neon.h"
#include "neon_lwlsncache.h"
@@ -72,10 +73,6 @@
#include "access/xlogrecovery.h"
#endif
#if PG_VERSION_NUM < 160000
typedef PGAlignedBlock PGIOAlignedBlock;
#endif
/*
* If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API
* calls to md.c, and *also* do the calls to the Page Server. On every
@@ -97,7 +94,7 @@ static char *hexdump_page(char *page);
NInfoGetRelNumber(InfoFromSMgrRel(reln)) >= FirstNormalObjectId \
)
const int SmgrTrace = DEBUG5;
const int SmgrTrace = DEBUG1;
/* unlogged relation build states */
typedef enum
@@ -779,10 +776,15 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
return false;
}
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
if (neon_enable_new_communicator)
return communicator_new_rel_exists(InfoFromSMgrRel(reln), forkNum);
else
{
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns);
return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns);
}
}
/*
@@ -820,33 +822,40 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum);
/*
* Newly created relation is empty, remember that in the relsize cache.
*
* Note that in REDO, this is called to make sure the relation fork
* exists, but it does not truncate the relation. So, we can only update
* the relsize if it didn't exist before.
*
* Also, in redo, we must make sure to update the cached size of the
* relation, as that is the primary source of truth for REDO's file length
* considerations, and as file extension isn't (perfectly) logged, we need
* to take care of that before we hit file size checks.
*
* FIXME: This is currently not just an optimization, but required for
* correctness. Postgres can call smgrnblocks() on the newly-created
* relation. Currently, we don't call SetLastWrittenLSN() when a new
 * relation is created, so if we didn't remember the size in the relsize
* cache, we might call smgrnblocks() on the newly-created relation before
 * the creation WAL record has been received by the page server.
*/
if (isRedo)
if (neon_enable_new_communicator)
{
update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
get_cached_relsize(InfoFromSMgrRel(reln), forkNum,
&reln->smgr_cached_nblocks[forkNum]);
communicator_new_rel_create(InfoFromSMgrRel(reln), forkNum);
}
else
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
{
/*
* Newly created relation is empty, remember that in the relsize cache.
*
* Note that in REDO, this is called to make sure the relation fork
* exists, but it does not truncate the relation. So, we can only update
* the relsize if it didn't exist before.
*
* Also, in redo, we must make sure to update the cached size of the
* relation, as that is the primary source of truth for REDO's file length
* considerations, and as file extension isn't (perfectly) logged, we need
* to take care of that before we hit file size checks.
*
* FIXME: This is currently not just an optimization, but required for
* correctness. Postgres can call smgrnblocks() on the newly-created
* relation. Currently, we don't call SetLastWrittenLSN() when a new
		 * relation is created, so if we didn't remember the size in the relsize
* cache, we might call smgrnblocks() on the newly-created relation before
		 * the creation WAL record has been received by the page server.
*/
if (isRedo)
{
update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
get_cached_relsize(InfoFromSMgrRel(reln), forkNum,
&reln->smgr_cached_nblocks[forkNum]);
}
else
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
}
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -968,34 +977,43 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
forkNum, blkno,
(uint32) (lsn >> 32), (uint32) lsn);
lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
if (neon_enable_new_communicator)
{
// FIXME: this can pass lsn == invalid. Is that ok?
communicator_new_rel_extend(InfoFromSMgrRel(reln), forkNum, blkno, (const void *) buffer, lsn);
}
else
{
lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
mdextend(reln, forkNum, blkno, buffer, skipFsync);
if (IS_LOCAL_REL(reln))
mdextend(reln, forkNum, blkno, buffer, skipFsync);
#endif
/*
* smgr_extend is often called with an all-zeroes page, so
* lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
* later, after it has been initialized with the real page contents, and
* it is eventually evicted from the buffer cache. But we need a valid LSN
	 * for the relation metadata update now.
*/
if (lsn == InvalidXLogRecPtr)
{
lsn = GetXLogInsertRecPtr();
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno);
/*
* smgr_extend is often called with an all-zeroes page, so
* lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
* later, after it has been initialized with the real page contents, and
* it is eventually evicted from the buffer cache. But we need a valid LSN
		 * for the relation metadata update now.
*/
if (lsn == InvalidXLogRecPtr)
{
lsn = GetXLogInsertRecPtr();
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno);
}
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
}
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
}
#if PG_MAJORVERSION_NUM >= 16
static void
neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber start_block,
int nblocks, bool skipFsync)
{
const PGIOAlignedBlock buffer = {0};
BlockNumber blocknum = start_block;
int remblocks = nblocks;
XLogRecPtr lsn = 0;
@@ -1092,8 +1110,15 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
Assert(lsn != 0);
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
if (neon_enable_new_communicator)
{
communicator_new_rel_zeroextend(InfoFromSMgrRel(reln), forkNum, start_block, nblocks, lsn);
}
else
{
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
}
}
#endif
@@ -1153,11 +1178,17 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (neon_enable_new_communicator)
{
communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, nblocks);
return false;
}
tag.spcOid = reln->smgr_rlocator.locator.spcOid;
tag.dbOid = reln->smgr_rlocator.locator.dbOid;
tag.relNumber = reln->smgr_rlocator.locator.relNumber;
tag.forkNum = forknum;
while (nblocks > 0)
{
int iterblocks = Min(nblocks, PG_IOV_MAX);
@@ -1179,7 +1210,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
blocknum += iterblocks;
}
communicator_prefetch_pump_state(false);
if (!neon_enable_new_communicator)
communicator_prefetch_pump_state(false);
return false;
}
@@ -1216,9 +1248,13 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
if (neon_enable_new_communicator)
communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, 1);
else
communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
communicator_prefetch_pump_state(false);
if (!neon_enable_new_communicator)
communicator_prefetch_pump_state(false);
return false;
}
@@ -1262,7 +1298,8 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
*/
neon_log(SmgrTrace, "writeback noop");
communicator_prefetch_pump_state(false);
if (!neon_enable_new_communicator)
communicator_prefetch_pump_state(false);
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -1278,7 +1315,14 @@ void
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
neon_request_lsns request_lsns, void *buffer)
{
communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
if (neon_enable_new_communicator)
{
// FIXME: request_lsns is ignored. That affects the neon_test_utils callers.
// Add the capability to specify the LSNs explicitly, for the sake of neon_test_utils ?
communicator_new_read_at_lsnv(rinfo, forkNum, blkno, &buffer, 1);
}
else
communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
}
#if PG_MAJORVERSION_NUM < 17
@@ -1296,6 +1340,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
neon_request_lsns request_lsns;
bits8 present;
void *bufferp;
bool prefetch_hit;
switch (reln->smgr_relpersistence)
{
@@ -1314,33 +1359,62 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
/* Try to read PS results if they are available */
communicator_prefetch_pump_state(false);
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
present = 0;
bufferp = buffer;
if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
if (neon_enable_new_communicator)
{
/* Prefetch hit */
return;
communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forkNum, blkno,
(void *) &buffer, 1);
}
/* Try to read from local file cache */
if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
else
{
MyNeonCounters->file_cache_hits_total++;
return;
prefetch_hit = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present);
if (prefetch_hit)
{
/* Prefetch hit */
return;
}
/* Try to read from local file cache */
if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
{
MyNeonCounters->file_cache_hits_total++;
return;
}
/*
* Try to receive prefetch results once again just to make sure we
* don't leave the smgr code while the OS might still have buffered
* bytes.
*/
communicator_prefetch_pump_state(false);
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
prefetch_hit = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present);
if (prefetch_hit)
{
/* Prefetch hit */
return;
}
/* Try to read from local file cache */
if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
{
MyNeonCounters->file_cache_hits_total++;
return;
}
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
/*
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state(false);
}
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
/*
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state(false);
#ifdef DEBUG_COMPARE_LOCAL
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
{
@@ -1449,38 +1523,47 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
nblocks, PG_IOV_MAX);
/* Try to read PS results if they are available */
communicator_prefetch_pump_state(false);
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
request_lsns, nblocks);
if (!neon_enable_new_communicator)
communicator_prefetch_pump_state(false);
memset(read_pages, 0, sizeof(read_pages));
prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
blocknum, request_lsns, nblocks,
buffers, read_pages);
if (neon_enable_new_communicator)
{
communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum,
buffers, nblocks);
}
else
{
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
request_lsns, nblocks);
prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
blocknum, request_lsns, nblocks,
buffers, read_pages);
if (prefetch_result == nblocks)
return;
if (prefetch_result == nblocks)
return;
/* Try to read from local file cache */
lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
nblocks, read_pages);
/* Try to read from local file cache */
lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
nblocks, read_pages);
if (lfc_result > 0)
MyNeonCounters->file_cache_hits_total += lfc_result;
if (lfc_result > 0)
MyNeonCounters->file_cache_hits_total += lfc_result;
/* Read all blocks from LFC, so we're done */
if (prefetch_result + lfc_result == nblocks)
return;
/* Read all blocks from LFC, so we're done */
if (prefetch_result + lfc_result == nblocks)
return;
communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
buffers, nblocks, read_pages);
communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
buffers, nblocks, read_pages);
/*
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state(false);
/*
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state(false);
}
#ifdef DEBUG_COMPARE_LOCAL
if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -1663,9 +1746,16 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
forknum, blocknum,
(uint32) (lsn >> 32), (uint32) lsn);
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
if (neon_enable_new_communicator)
{
communicator_new_write_page(InfoFromSMgrRel(reln), forknum, blocknum, buffer, lsn);
}
else
{
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
communicator_prefetch_pump_state(false);
communicator_prefetch_pump_state(false);
}
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -1725,9 +1815,21 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false);
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
if (neon_enable_new_communicator)
{
for (int i = 0; i < nblocks; i++)
{
XLogRecPtr lsn = PageGetLSN((Page) buffers[i]);
communicator_new_write_page(InfoFromSMgrRel(reln), forknum, blkno + i, buffers[i], lsn);
}
}
else
{
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
communicator_prefetch_pump_state(false);
}
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -1763,19 +1865,26 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
if (neon_enable_new_communicator)
{
neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, n_blocks);
return n_blocks;
n_blocks = communicator_new_rel_nblocks(InfoFromSMgrRel(reln), forknum);
}
else
{
if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
{
neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, n_blocks);
return n_blocks;
}
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns);
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns);
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
}
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
@@ -1796,10 +1905,17 @@ neon_dbsize(Oid dbNode)
neon_request_lsns request_lsns;
NRelFileInfo dummy_node = {0};
neon_get_request_lsns(dummy_node, MAIN_FORKNUM,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
if (neon_enable_new_communicator)
{
db_size = communicator_new_dbsize(dbNode);
}
else
{
neon_get_request_lsns(dummy_node, MAIN_FORKNUM,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
db_size = communicator_dbsize(dbNode, &request_lsns);
db_size = communicator_dbsize(dbNode, &request_lsns);
}
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
@@ -1813,8 +1929,6 @@ neon_dbsize(Oid dbNode)
static void
neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks)
{
XLogRecPtr lsn;
switch (reln->smgr_relpersistence)
{
case 0:
@@ -1833,34 +1947,43 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);
if (neon_enable_new_communicator)
{
communicator_new_rel_truncate(InfoFromSMgrRel(reln), forknum, nblocks);
}
else
{
XLogRecPtr lsn;
/*
* Truncating a relation drops all its buffers from the buffer cache
* without calling smgrwrite() on them. But we must account for that in
* our tracking of last-written-LSN all the same: any future smgrnblocks()
* request must return the new size after the truncation. We don't know
* what the LSN of the truncation record was, so be conservative and use
* the most recently inserted WAL record's LSN.
*/
lsn = GetXLogInsertRecPtr();
lsn = nm_adjust_lsn(lsn);
set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);
/*
* Flush it, too. We don't actually care about it here, but let's uphold
* the invariant that last-written LSN <= flush LSN.
*/
XLogFlush(lsn);
/*
* Truncating a relation drops all its buffers from the buffer cache
* without calling smgrwrite() on them. But we must account for that in
* our tracking of last-written-LSN all the same: any future smgrnblocks()
* request must return the new size after the truncation. We don't know
* what the LSN of the truncation record was, so be conservative and use
* the most recently inserted WAL record's LSN.
*/
lsn = GetXLogInsertRecPtr();
lsn = nm_adjust_lsn(lsn);
/*
	 * Truncate may affect several chunks of the relation. So we should
	 * either update the last-written LSN for all of them, or update the LSN
	 * for the "dummy" metadata block. The second approach seems more
	 * efficient. If the relation
* is extended again later, the extension will update the last-written LSN
* for the extended pages, so there's no harm in leaving behind obsolete
* entries for the truncated chunks.
*/
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum);
/*
* Flush it, too. We don't actually care about it here, but let's uphold
* the invariant that last-written LSN <= flush LSN.
*/
XLogFlush(lsn);
/*
		 * Truncate may affect several chunks of the relation. So we should
		 * either update the last-written LSN for all of them, or update the
		 * LSN for the "dummy" metadata block. The second approach seems more
		 * efficient. If the relation
* is extended again later, the extension will update the last-written LSN
* for the extended pages, so there's no harm in leaving behind obsolete
* entries for the truncated chunks.
*/
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum);
}
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -1902,7 +2025,8 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
communicator_prefetch_pump_state(false);
if (!neon_enable_new_communicator)
communicator_prefetch_pump_state(false);
#ifdef DEBUG_COMPARE_LOCAL
if (IS_LOCAL_REL(reln))
@@ -2173,7 +2297,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
request_lsns.not_modified_since = not_modified_since;
request_lsns.effective_request_lsn = request_lsn;
n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer);
if (neon_enable_new_communicator)
n_blocks = communicator_new_read_slru_segment(kind, segno, buffer);
else
n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer);
return n_blocks;
}
@@ -2210,7 +2337,8 @@ AtEOXact_neon(XactEvent event, void *arg)
}
break;
}
communicator_reconfigure_timeout_if_needed();
if (!neon_enable_new_communicator)
communicator_reconfigure_timeout_if_needed();
}
static const struct f_smgr neon_smgr =
@@ -2268,7 +2396,10 @@ smgr_init_neon(void)
smgr_init_standard();
neon_init();
communicator_init();
if (neon_enable_new_communicator)
communicator_new_init();
else
communicator_init();
}
@@ -2280,6 +2411,12 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
/* This is only used in WAL replay */
Assert(RecoveryInProgress());
if (neon_enable_new_communicator)
{
// FIXME: broken, but this is only used in replica
elog(ERROR, "not implemented yet");
}
/* Extend the relation if we know its size */
if (get_cached_relsize(rinfo, forknum, &relsize))
{