Compare commits


38 Commits

Author SHA1 Message Date
David Freifeld
cee8c10582 Add skeleton of parallel xxHash implementation 2025-07-03 13:12:41 -07:00
David Freifeld
362fa1af8f Merge branch 'quantumish/lfc-resizable-map' into quantumish/lfc-soa-map 2025-06-24 14:36:43 -07:00
David Freifeld
24e6c68772 Remove prev entry tracking, refactor HashMapInit into proper builder 2025-06-24 13:34:22 -07:00
David Freifeld
93a45708ff Change finish_shrink to remap entries in shrunk space 2025-06-23 16:15:43 -07:00
David Freifeld
6a76bc63f9 Change to a SoA structure for map buckets 2025-06-23 15:38:49 -07:00
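(To illustrate the layout change this commit describes - a minimal, hypothetical sketch; the actual LFC map fields differ:)

```rust
// Array-of-structs: each probe loads a whole entry, including fields
// that are irrelevant until the hashes match.
struct BucketAos {
    hash: u64,
    key: u128,
    value: u32,
}
struct MapAos {
    buckets: Vec<BucketAos>,
}

// Struct-of-arrays: parallel arrays indexed by the same slot. A probe
// can scan the dense `hashes` array and touch `keys`/`values` only on
// a hash match, which is friendlier to the CPU cache.
struct MapSoa {
    hashes: Vec<u64>,
    keys: Vec<u128>,
    values: Vec<u32>,
}
```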
David Freifeld
610ea22c46 Generalize map to allow arbitrary hash fns, add clear() helper method 2025-06-20 11:46:02 -07:00
David Freifeld
477648b8cd Clean up hashmap implementation, add bucket tests 2025-06-17 11:23:10 -07:00
David Freifeld
bb1e359872 Add testing utilities for hash map, freelist bugfixes 2025-06-16 16:02:39 -07:00
David Freifeld
ac87544e79 Implement shrinking, add basic tests for core operations 2025-06-16 13:13:38 -07:00
David Freifeld
b6b122e07b nw: add shrinking and deletion skeletons 2025-06-16 10:20:30 -07:00
Heikki Linnakangas
16d6898e44 git add missing file 2025-06-12 02:37:59 +03:00
Heikki Linnakangas
10b936bf03 Use a custom Rust implementation to replace the LFC hash table
The new implementation lives in a separately allocated shared memory
area, which can be resized. Resizing it isn't actually implemented
yet, though; it would require some cooperation from the LFC code.
2025-06-05 18:31:29 +03:00
Heikki Linnakangas
6145cfd1c2 Move neon-shmem facility to separate module within the crate 2025-06-05 18:13:03 +03:00
Heikki Linnakangas
96b4de1de6 Make LFC chunk size a compile-time constant
A runtime setting is nicer, but the next commit will replace the hash
table with a different implementation that requires the value size to
be a compile-time constant.
2025-06-05 18:08:40 +03:00
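(A hedged sketch of why a compile-time constant helps here: with a const generic, the entry size, and therefore the shared-memory layout of the bucket array, is fixed at compile time. Names and values are illustrative, not the actual code:)

```rust
// Illustrative only: an entry whose value size is a const generic
// parameter, so the byte layout of the whole table is known at
// compile time.
#[repr(C)]
struct Entry<const VALUE_SIZE: usize> {
    key: u128,
    value: [u8; VALUE_SIZE],
}

// The LFC chunk size baked in as a constant:
const CHUNK_SIZE: usize = 128; // hypothetical value
type LfcEntry = Entry<CHUNK_SIZE>;
```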
Heikki Linnakangas
9fdf5fbb7e Use a separate freelist to track LFC "holes"
When the LFC is shrunk, we punch holes in the underlying file to
release the disk space to the OS. We tracked them in the same hash table
as the in-use entries because that was convenient. However, I'm
working on being able to shrink the hash table too, and once we do
that, we'll need some other place to track the holes. Implement a
simple scheme of an in-memory array and a chain of on-disk blocks for
that.
2025-06-05 18:08:35 +03:00
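(A minimal sketch of the scheme described above - a bounded in-memory array that spills into a chain of on-disk blocks. All names are hypothetical and the actual I/O is elided:)

```rust
/// Hypothetical freelist: free block numbers are kept in a small
/// in-memory array; when it fills up, its contents are written into
/// one of the free blocks itself, which becomes the head of an
/// on-disk chain.
struct HoleList {
    in_memory: Vec<u32>,       // bounded array of free block numbers
    on_disk_head: Option<u32>, // head of the chain of spilled blocks
    capacity: usize,
}

impl HoleList {
    fn push(&mut self, block: u32) {
        if self.in_memory.len() == self.capacity {
            // Spill: write `in_memory` and the current chain head into
            // `block`, then make `block` the new chain head (I/O elided).
            self.in_memory.clear();
            self.on_disk_head = Some(block);
        } else {
            self.in_memory.push(block);
        }
    }

    fn pop(&mut self) -> Option<u32> {
        if let Some(block) = self.in_memory.pop() {
            return Some(block);
        }
        // Refill from the on-disk chain head, if any (I/O elided);
        // the head block itself becomes free again.
        self.on_disk_head.take()
    }
}
```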
a-masterov
f64eb0cbaf Remove the Flaky Test computed-columns from postgis v16 (#12132)
## Problem
The `computed_columns` test assumes that computed columns are always
faster than evaluating the expression directly. However, this is not
always the case on Neon, which can lead to flaky results.
## Summary of changes
The `computed_columns` test is excluded from the PostGIS test for
PostgreSQL v16, accompanied by related patch refactoring.
2025-06-05 15:02:38 +00:00
Alexey Kondratov
6ae4b89000 feat(compute_ctl): Implement graceful compute monitor exit (#11911)
## Problem

After introducing a naive downtime calculation for the Postgres process
inside compute in https://github.com/neondatabase/neon/pull/11346, I
noticed that some computes regularly report short downtimes. After
checking some particular cases, it looks like all of them report
downtime close to the end of the compute's life, i.e., when the
control plane calls `/terminate` and we are waiting for Postgres to
exit.

The compute monitor also produces a lot of error logs because Postgres
stops accepting connections, but that's expected during the termination
process.

## Summary of changes

Regularly check the compute status inside the main compute monitor loop
and exit gracefully when the compute is in some terminal or
soon-to-be-terminal state.

---------

Co-authored-by: Tristan Partin <tristan@neon.tech>
2025-06-05 12:17:28 +00:00
Dmitrii Kovalkov
f7ec7668a2 pageserver, tests: prepare test_basebackup_cache for --timelines-onto-safekeepers (#12143)
## Problem
- `test_basebackup_cache` fails in
https://github.com/neondatabase/neon/pull/11712 because once the
timelines on safekeepers are managed by the storage controller, they
contain a proper start_lsn, and the compute_ctl tool sends the first
basebackup request with this LSN.
- `Failed to prepare basebackup` log messages appear during timeline
initialization, because the timeline is not yet in the global timeline
map.

- Relates to https://github.com/neondatabase/cloud/issues/29353

## Summary of changes
- Account for storcon's `timeline_onto_safekeepers` option in the test.
- Do not trigger basebackup prepare during the timeline initialization.
2025-06-05 12:04:37 +00:00
a-masterov
038e967daf Configure the dynamic loader for the extension-tests image (#12142)
## Problem
The same problem as fixed in
https://github.com/neondatabase/neon/issues/11857, but for the
`neon-test-extensions` image.
## Summary of changes
A config file was added so that the dynamic loader uses our library
directory.
2025-06-05 12:03:51 +00:00
Erik Grinaker
6a43f23eca pagebench: add batch support (#12133)
## Problem

The new gRPC page service protocol supports client-side batches. The
current libpq protocol only does best-effort server-side batching.

To compare these approaches, Pagebench should support submitting
contiguous page batches, similar to how Postgres will submit them (e.g.
with prefetches or vectored reads).

## Summary of changes

Add a `--batch-size` parameter specifying the size of contiguous page
batches. One batch counts as 1 RPS and 1 queue depth.

For the libpq protocol, a batch is submitted as individual requests and
we rely on the server to batch them for us. This will give a realistic
comparison of how these would be processed in the wild (e.g. when
Postgres sends 100 prefetch requests).

This patch also adds some basic validation of responses.
2025-06-05 11:52:52 +00:00
Vlad Lazar
868f194a3b pageserver: remove handling of vanilla protocol (#12126)
## Problem

We support two ingest protocols on the pageserver: vanilla and
interpreted.
Interpreted has been the only protocol in use for a long time.

## Summary of changes

* Remove the ingest handling of the vanilla protocol
* Remove tenant and pageserver configuration for it
* Update all tests that tweaked the ingest protocol

## Compatibility

Backward compatibility:
* The new pageserver version can read the existing pageserver
configuration and it will ignore the unknown field.
* When the tenant config is read from the storcon db or from the
pageserver disk, the extra field will be ignored.

Forward compatibility:
* Both the pageserver config and the tenant config map missing fields to
their default value.

I'm not aware of any tenant level override that was made for this knob.
2025-06-05 11:43:04 +00:00
Konstantin Knizhnik
9c6c780201 Replica promote (#12090)
## Problem

This PR is part of larger computes support activity:

https://www.notion.so/neondatabase/Larger-computes-114f189e00478080ba01e8651ab7da90

Epic: https://github.com/neondatabase/cloud/issues/19010

In case of planned node restart, we are going to 
1. create new read-only replica
2. capture LFC state at primary
3. use this state to prewarm replica
4. stop old primary
5. promote replica to primary

Steps 1-3 are currently implemented and supported on the compute side.
This PR provides the compute-level implementation of replica promotion.

## Summary of changes

Right now replica promotion is done in three steps:
1. Set the safekeepers list (it is currently empty for a replica)
2. Call `pg_promote()` to promote the replica
3. Update the endpoint settings so that it is no longer treated as a replica

Maybe all three of these steps should be done by some function in
compute_ctl, but right now this logic is only implemented in the test.

Postgres submodules PRs:
https://github.com/neondatabase/postgres/pull/648
https://github.com/neondatabase/postgres/pull/649
https://github.com/neondatabase/postgres/pull/650
https://github.com/neondatabase/postgres/pull/651

---------

Co-authored-by: Matthias van de Meent <matthias@neon.tech>
Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2025-06-05 11:27:14 +00:00
Konstantin Knizhnik
6123fe2d5e Add query execution time histogram (#10050)
## Problem


It will be useful to understand what kinds of queries our clients
execute. One of the most important characteristics of a query is its
execution time - at the very least, it allows distinguishing OLAP from
OLTP queries. Monitoring query execution time can also help to detect
performance problems (assuming that the workload is more or less stable).

## Summary of changes

Add query execution time histogram.

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2025-06-05 11:23:39 +00:00
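(A minimal sketch of such a histogram using the `prometheus` crate, which already appears in the workspace's dependency list; the metric name and buckets here are made up:)

```rust
use std::time::Instant;

use prometheus::{register_histogram, Histogram};

fn main() {
    // Buckets chosen so that OLTP queries (milliseconds) and OLAP
    // queries (seconds and up) land in clearly different buckets.
    let query_seconds: Histogram = register_histogram!(
        "query_execution_seconds", // hypothetical metric name
        "Query execution time in seconds",
        vec![0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
    )
    .unwrap();

    let started = Instant::now();
    // ... execute the query here ...
    query_seconds.observe(started.elapsed().as_secs_f64());
}
```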
Folke Behrens
1577665c20 proxy: Move PGLB-related modules into pglb root module. (#12144)
Split out the modules responsible for passing data and connecting to
compute from the auth and waking logic, for PGLB.
This PR just moves files; the waking is going to be removed from pglb
after this.
2025-06-05 11:00:23 +00:00
Alex Chi Z.
d8ebd1d771 feat(pageserver): report tenant properties to posthog (#12113)
## Problem

Part of https://github.com/neondatabase/neon/issues/11813

In the PostHog UI, we need to create the properties before using them as a
filter. We report all variants automatically when we start the
pageserver. In the future, we can report all real tenants instead of
fake tenants (we do that now to save money + we don't need real tenants
in the UI).

## Summary of changes

* Collect `region`, `availability_zone`, `pageserver_id` properties and
use them in the feature evaluation.
* Report 10 fake tenants on each pageserver startup.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-06-05 07:48:36 +00:00
Conrad Ludgate
c8a96cf722 update proxy protocol parsing to not use a rw wrapper (#12035)
## Problem

I believe in all environments we now specify proxy-protocol V2 as
either required or rejected; we no longer rely on the supported
flow. This means we no longer need to keep read bytes around in case
they're not in a header.

While I designed ChainRW to be fast (the hot path with an empty buffer
is very easy to branch predict), it's still unnecessary.

## Summary of changes

* Remove the ChainRW wrapper
* Refactor how we read the proxy-protocol header using read_exact.
Slightly worse perf, but it's hardly significant.
* Don't try to parse the header if it's rejected.
2025-06-05 07:12:00 +00:00
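(A rough sketch of the read_exact approach: the proxy-protocol v2 header starts with a fixed 16-byte prefix whose last two bytes give the length of the remainder, so no buffered read-wrapper is needed. Validation beyond the signature is elided:)

```rust
use std::io::{self, Read};

const SIGNATURE: [u8; 12] = *b"\r\n\r\n\0\r\nQUIT\n";

/// Sketch: read exactly the 16-byte fixed prefix, then exactly the
/// declared remainder. Nothing past the header is ever consumed, so
/// the stream can be handed off as-is afterwards.
fn read_proxy_v2_header<R: Read>(stream: &mut R) -> io::Result<Vec<u8>> {
    let mut prefix = [0u8; 16];
    stream.read_exact(&mut prefix)?;
    if prefix[..12] != SIGNATURE {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "not a proxy-protocol v2 header",
        ));
    }
    // Bytes 14..16 hold the big-endian length of the rest of the header.
    let len = u16::from_be_bytes([prefix[14], prefix[15]]) as usize;
    let mut rest = vec![0u8; len];
    stream.read_exact(&mut rest)?;
    Ok(rest)
}
```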
Konstantin Knizhnik
56d505bce6 Update online_advisor (#12045)
## Problem

Investigate crash of online_advisor in image check

## Summary of changes

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2025-06-05 05:48:25 +00:00
Arpad Müller
dae203ef69 pgxn: support generations in safekeepers_cmp (#12129)
`safekeepers_cmp` was added by #8840 to make changes of the safekeeper
set order independent: a `sk1,sk2,sk3` specifier changed to
`sk3,sk1,sk2` should not cause a walproposer restart. However, this
check didn't support generations, in the sense that it would see the
`g#123:` as part of the first safekeeper in the list, and if the first
safekeeper changes, it would also restart the walproposer.

Therefore, parse the generation properly so that it is not treated as
part of the first safekeeper in the list.

This PR doesn't add a specific test, but I have confirmed locally that
`test_safekeepers_reconfigure_reorder` is fixed with the changes of PR
#11712 applied thanks to this PR.

Part of https://github.com/neondatabase/neon/issues/11670
2025-06-04 23:02:31 +00:00
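(A hedged sketch of the order-independent comparison with the generation split off first. Only the `g#123:` prefix format is taken from the message above; the rest is illustrative:)

```rust
use std::collections::BTreeSet;

/// Split an optional `g#<n>:` generation prefix off a safekeeper list
/// and return it alongside the members as a set, so that the member
/// order no longer matters.
fn parse_safekeepers(spec: &str) -> (Option<u64>, BTreeSet<&str>) {
    match spec.strip_prefix("g#").and_then(|tail| tail.split_once(':')) {
        Some((gen, rest)) => (gen.parse().ok(), rest.split(',').collect()),
        None => (None, spec.split(',').collect()),
    }
}

fn main() {
    // Reordering the members must not look like a change...
    assert_eq!(
        parse_safekeepers("g#123:sk1,sk2,sk3"),
        parse_safekeepers("g#123:sk3,sk1,sk2"),
    );
    // ...but a different generation must.
    assert_ne!(
        parse_safekeepers("g#123:sk1,sk2"),
        parse_safekeepers("g#124:sk1,sk2"),
    );
}
```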
Conrad Ludgate
1fb1315aed compute-ctl: add spec for enable_tls, separate from compute-ctl config (#12109)
## Problem

In between adding the TLS config for compute-ctl, and adding the TLS
config in controlplane, we switched from using a provision flag to a
bind flag. This happened to work in all of my testing in preview regions
as they have no VM pool, so each bind was also a provision. However, in
staging I found that the TLS config is still only processed during
provision, even though it's only sent on bind.

## Summary of changes

* Add a new feature flag value, `tls_experimental`, which tells
postgres/pgbouncer/local_proxy to use the TLS certificates on bind.
* compute_ctl on provision will be told where the certificates are,
instead of being told on bind.
2025-06-04 20:07:47 +00:00
Suhas Thalanki
838622c594 compute: Add manifest.yml for default Postgres configuration settings (#11820)
Adds a `manifest.yml` file that contains the default settings for
compute. Currently, it comes from cplane code
[here](0cda3d4b01/goapp/controlplane/internal/pkg/compute/computespec/pg_settings.go (L110)).

Related RFC:
https://github.com/neondatabase/neon/blob/main/docs/rfcs/038-independent-compute-release.md

Related Issue: https://github.com/neondatabase/cloud/issues/11698
2025-06-04 18:03:59 +00:00
Tristan Partin
3fd5a94a85 Use Url::join() when creating the final remote extension URL (#12121)
Url::to_string() adds a trailing slash on the base URL, so when we did
the format!(), we were adding a double forward slash.

Signed-off-by: Tristan Partin <tristan@neon.tech>
2025-06-04 15:56:12 +00:00
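(To illustrate the two behaviors this refers to, assuming the `url` crate; the example paths are made up:)

```rust
use url::Url;

fn main() {
    // Url::to_string() keeps a trailing slash on a bare host, so naive
    // formatting yields a double slash:
    let base = Url::parse("http://example.com").unwrap();
    assert_eq!(
        format!("{}/{}", base, "v15/ext.tar.zst"),
        "http://example.com//v15/ext.tar.zst"
    );

    // Url::join() resolves relative to the last slash instead. Note
    // that without a trailing slash on the base, the final path
    // segment would be replaced rather than appended.
    let base = Url::parse("http://example.com/pg-ext-s3-gateway/").unwrap();
    assert_eq!(
        base.join("v15/ext.tar.zst").unwrap().as_str(),
        "http://example.com/pg-ext-s3-gateway/v15/ext.tar.zst"
    );
}
```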
Erik Grinaker
e7d6f525b3 pageserver: support get_vectored_concurrent_io with gRPC (#12131)
## Problem

The gRPC page service doesn't respect `get_vectored_concurrent_io` and
always uses sequential IO.

## Summary of changes

Spawn a sidecar task for concurrent IO when enabled.

Cancellation will be addressed separately.
2025-06-04 15:14:17 +00:00
a-masterov
e4ca3ac745 Fix codestyle for compute.sh for docker-compose (#12128)
## Problem
The script `compute.sh` had an inconsistent coding style and didn't
follow best practices for modern bash scripts.
## Summary of changes
The coding style was fixed to follow best practices.
2025-06-04 15:07:48 +00:00
Vlad Lazar
b69d103b90 pageserver: make import job max byte range size configurable (#12117)
## Problem

We want to reproduce an OOM situation, but that requires large partial reads.

## Summary of Changes

Make the max partial read size configurable for import jobs.
2025-06-04 10:44:23 +00:00
a-masterov
208cbd52d4 Add postgis to the test image (#11672)
## Problem
We don't currently run tests for PostGIS in our test environment.

## Summary of Changes
- Added PostGIS test support for PostgreSQL v16 and v17
- Configured different PostGIS versions based on PostgreSQL version:
  - PostgreSQL v17: PostGIS 3.5.0
  - PostgreSQL v14/v15/v16: PostGIS 3.3.3
- Added necessary test scripts and configurations

This ensures our PostgreSQL implementation remains compatible with this
widely-used extension.

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
2025-06-04 09:57:31 +00:00
Alex Chi Z.
c567ed0de0 feat(pageserver): feature flag counter metrics (#12112)
## Problem

Part of https://github.com/neondatabase/neon/issues/11813

## Summary of changes

Add a counter for feature evaluation outcomes; we will set up
alerts for too many failed evaluations in the future.

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-06-04 06:41:42 +00:00
Mikhail
c698cee19a ComputeSpec: prewarm_lfc_on_startup -> autoprewarm (#12120)
https://github.com/neondatabase/cloud/pull/29472
https://github.com/neondatabase/cloud/issues/26346
2025-06-04 05:38:03 +00:00
Tristan Partin
4a3f32bf4a Clean up compute_tools::http::JsonResponse::invalid_status() (#12110)
JsonResponse::error() properly logs an error message which can be read
in the compute logs. invalid_status() was not going through that helper
function, thus not logging anything.

Signed-off-by: Tristan Partin <tristan@neon.tech>
2025-06-03 16:00:56 +00:00
108 changed files with 4548 additions and 1446 deletions

Cargo.lock (generated)

@@ -1086,6 +1086,25 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "cbindgen"
version = "0.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eadd868a2ce9ca38de7eeafdcec9c7065ef89b42b32f0839278d55f35c54d1ff"
dependencies = [
"clap",
"heck 0.4.1",
"indexmap 2.9.0",
"log",
"proc-macro2",
"quote",
"serde",
"serde_json",
"syn 2.0.100",
"tempfile",
"toml",
]
[[package]]
name = "cc"
version = "1.2.16"
@@ -1212,7 +1231,7 @@ version = "4.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
dependencies = [
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.100",
@@ -1270,6 +1289,14 @@ dependencies = [
"unicode-width",
]
[[package]]
name = "communicator"
version = "0.1.0"
dependencies = [
"cbindgen",
"neon-shmem",
]
[[package]]
name = "compute_api"
version = "0.1.0"
@@ -1936,7 +1963,7 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc"
dependencies = [
"darling",
"either",
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.100",
@@ -2500,6 +2527,18 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "getrandom"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
dependencies = [
"cfg-if",
"libc",
"r-efi",
"wasi 0.14.2+wasi-0.2.4",
]
[[package]]
name = "gettid"
version = "0.1.3"
@@ -2712,6 +2751,12 @@ dependencies = [
"http 1.1.0",
]
[[package]]
name = "heck"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "heck"
version = "0.5.0"
@@ -3648,7 +3693,7 @@ version = "0.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
dependencies = [
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.100",
@@ -3710,7 +3755,7 @@ dependencies = [
"procfs",
"prometheus",
"rand 0.8.5",
"rand_distr",
"rand_distr 0.4.3",
"twox-hash",
]
@@ -3798,7 +3843,11 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
name = "neon-shmem"
version = "0.1.0"
dependencies = [
"criterion",
"nix 0.30.1",
"rand 0.9.1",
"rand_distr 0.5.1",
"rustc-hash 1.1.0",
"tempfile",
"thiserror 1.0.69",
"workspace_hack",
@@ -4237,6 +4286,7 @@ version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"bytes",
"camino",
"clap",
"futures",
@@ -5091,7 +5141,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
dependencies = [
"bytes",
"heck",
"heck 0.5.0",
"itertools 0.12.1",
"log",
"multimap",
@@ -5112,7 +5162,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
dependencies = [
"bytes",
"heck",
"heck 0.5.0",
"itertools 0.12.1",
"log",
"multimap",
@@ -5237,7 +5287,7 @@ dependencies = [
"postgres_backend",
"pq_proto",
"rand 0.8.5",
"rand_distr",
"rand_distr 0.4.3",
"rcgen",
"redis",
"regex",
@@ -5341,6 +5391,12 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "r-efi"
version = "5.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
[[package]]
name = "rand"
version = "0.7.3"
@@ -5365,6 +5421,16 @@ dependencies = [
"rand_core 0.6.4",
]
[[package]]
name = "rand"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
dependencies = [
"rand_chacha 0.9.0",
"rand_core 0.9.3",
]
[[package]]
name = "rand_chacha"
version = "0.2.2"
@@ -5385,6 +5451,16 @@ dependencies = [
"rand_core 0.6.4",
]
[[package]]
name = "rand_chacha"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
dependencies = [
"ppv-lite86",
"rand_core 0.9.3",
]
[[package]]
name = "rand_core"
version = "0.5.1"
@@ -5403,6 +5479,15 @@ dependencies = [
"getrandom 0.2.11",
]
[[package]]
name = "rand_core"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
dependencies = [
"getrandom 0.3.3",
]
[[package]]
name = "rand_distr"
version = "0.4.3"
@@ -5413,6 +5498,16 @@ dependencies = [
"rand 0.8.5",
]
[[package]]
name = "rand_distr"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
dependencies = [
"num-traits",
"rand 0.9.1",
]
[[package]]
name = "rand_hc"
version = "0.2.0"
@@ -6899,7 +6994,7 @@ version = "0.26.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
dependencies = [
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"rustversion",
@@ -8198,6 +8293,15 @@ version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasi"
version = "0.14.2+wasi-0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
dependencies = [
"wit-bindgen-rt",
]
[[package]]
name = "wasite"
version = "0.1.0"
@@ -8555,6 +8659,15 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "wit-bindgen-rt"
version = "0.39.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
dependencies = [
"bitflags 2.8.0",
]
[[package]]
name = "workspace_hack"
version = "0.1.0"


@@ -44,6 +44,7 @@ members = [
"libs/proxy/postgres-types2",
"libs/proxy/tokio-postgres2",
"endpoint_storage",
"pgxn/neon/communicator",
]
[workspace.package]
@@ -251,6 +252,7 @@ desim = { version = "0.1", path = "./libs/desim" }
endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" }
http-utils = { version = "0.1", path = "./libs/http-utils/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
pageserver = { path = "./pageserver" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
pageserver_client = { path = "./pageserver/client" }
@@ -278,6 +280,7 @@ walproposer = { version = "0.1", path = "./libs/walproposer/" }
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
## Build dependencies
cbindgen = "0.28.0"
criterion = "0.5.1"
rcgen = "0.13"
rstest = "0.18"


@@ -18,10 +18,12 @@ ifeq ($(BUILD_TYPE),release)
PG_LDFLAGS = $(LDFLAGS)
# Unfortunately, `--profile=...` is a nightly feature
CARGO_BUILD_FLAGS += --release
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release
else ifeq ($(BUILD_TYPE),debug)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
PG_CFLAGS += -O0 -g3 $(CFLAGS)
PG_LDFLAGS = $(LDFLAGS)
NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug
else
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
endif
@@ -180,11 +182,16 @@ postgres-check-%: postgres-%
.PHONY: neon-pg-ext-%
neon-pg-ext-%: postgres-%
+@echo "Compiling communicator $*"
$(CARGO_CMD_PREFIX) cargo build -p communicator $(CARGO_BUILD_FLAGS)
+@echo "Compiling neon $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
LIBCOMMUNICATOR_PATH=$(NEON_CARGO_ARTIFACT_TARGET_DIR) \
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
+@echo "Compiling neon_walredo $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \


@@ -297,6 +297,7 @@ RUN ./autogen.sh && \
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
make staged-install && \
cd extensions/postgis && \
make clean && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -602,7 +603,7 @@ RUN case "${PG_VERSION:?}" in \
;; \
esac && \
wget https://github.com/knizhnik/online_advisor/archive/refs/tags/1.0.tar.gz -O online_advisor.tar.gz && \
echo "059b7d9e5a90013a58bdd22e9505b88406ce05790675eb2d8434e5b215652d54 online_advisor.tar.gz" | sha256sum --check && \
echo "37dcadf8f7cc8d6cc1f8831276ee245b44f1b0274f09e511e47a67738ba9ed0f online_advisor.tar.gz" | sha256sum --check && \
mkdir online_advisor-src && cd online_advisor-src && tar xzf ../online_advisor.tar.gz --strip-components=1 -C .
FROM pg-build AS online_advisor-build
@@ -1842,10 +1843,25 @@ RUN make PG_VERSION="${PG_VERSION:?}" -C compute
FROM pg-build AS extension-tests
ARG PG_VERSION
# This is required for the PostGIS test
RUN apt-get update && case $DEBIAN_VERSION in \
bullseye) \
apt-get install -y libproj19 libgdal28 time; \
;; \
bookworm) \
apt-get install -y libgdal32 libproj25 time; \
;; \
*) \
echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \
;; \
esac
COPY docker-compose/ext-src/ /ext-src/
COPY --from=pg-build /postgres /postgres
#COPY --from=postgis-src /ext-src/ /ext-src/
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=postgis-build /ext-src/postgis-src /ext-src/postgis-src
COPY --from=postgis-build /sfcgal/* /usr
COPY --from=plv8-src /ext-src/ /ext-src/
COPY --from=h3-pg-src /ext-src/h3-pg-src /ext-src/h3-pg-src
COPY --from=postgresql-unit-src /ext-src/ /ext-src/
@@ -1886,6 +1902,7 @@ COPY compute/patches/pg_repack.patch /ext-src
RUN cd /ext-src/pg_repack-src && patch -p1 </ext-src/pg_repack.patch && rm -f /ext-src/pg_repack.patch
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
RUN echo /usr/local/pgsql/lib > /etc/ld.so.conf.d/00-neon.conf && /sbin/ldconfig
RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq \
&& apt clean && rm -rf /ext-src/*.tar.gz /ext-src/*.patch /var/lib/apt/lists/*
ENV PATH=/usr/local/pgsql/bin:$PATH

compute/manifest.yaml (new file)

@@ -0,0 +1,121 @@
pg_settings:
# Common settings for primaries and replicas of all versions.
common:
# Check for client disconnection every 1 minute. By default, Postgres will detect the
# loss of the connection only at the next interaction with the socket, when it waits
# for, receives or sends data, so it will likely waste resources till the end of the
# query execution. There should be no drawbacks in setting this for everyone, so enable
# it by default. If anyone complains, we can allow editing it.
# https://www.postgresql.org/docs/16/runtime-config-connection.html#GUC-CLIENT-CONNECTION-CHECK-INTERVAL
client_connection_check_interval: "60000" # 1 minute
# ---- IO ----
effective_io_concurrency: "20"
maintenance_io_concurrency: "100"
fsync: "off"
hot_standby: "off"
# We allow users to change this if needed, but by default we
# just don't want to see long-lasting idle transactions, as they
# prevent the activity monitor from suspending projects.
idle_in_transaction_session_timeout: "300000" # 5 minutes
listen_addresses: "*"
# --- LOGGING ---- helps investigations
log_connections: "on"
log_disconnections: "on"
# 1GB, unit is KB
log_temp_files: "1048576"
# Disable dumping customer data to logs, both to increase data privacy
# and to reduce the volume of logs.
log_error_verbosity: "terse"
log_min_error_statement: "panic"
max_connections: "100"
# --- WAL ---
# - flush lag is the max amount of WAL that has been generated but not yet stored
# to disk in the page server. A smaller value means less delay after a pageserver
# restart, but if you set it too small you might again need to slow down writes if the
# pageserver cannot flush incoming WAL to disk fast enough. This must be larger
# than the pageserver's checkpoint interval, currently 1 GB! Otherwise you get a
# a deadlock where the compute node refuses to generate more WAL before the
# old WAL has been uploaded to S3, but the pageserver is waiting for more WAL
# to be generated before it is uploaded to S3.
max_replication_flush_lag: "10GB"
max_replication_slots: "10"
# Backpressure configuration:
# - write lag is the max amount of WAL that has been generated by Postgres but not yet
# processed by the page server. Making this smaller reduces the worst case latency
# of a GetPage request, if you request a page that was recently modified. On the other
# hand, if this is too small, the compute node might need to wait on a write if there is a
# hiccup in the network or page server so that the page server has temporarily fallen
# behind.
#
# Previously it was set to 500 MB, but that caused the compute to become unresponsive
# under load: https://github.com/neondatabase/neon/issues/2028
max_replication_write_lag: "500MB"
max_wal_senders: "10"
# A Postgres checkpoint is cheap in storage, as it doesn't involve any significant amount
# of real I/O. Only the SLRU buffers and some other small files are flushed to disk.
# However, as long as we have full_page_writes=on, page updates after a checkpoint
# include full-page images, which bloats the WAL. So we may want to bump max_wal_size to
# reduce the WAL bloat, but at the same time it will increase the pg_wal directory size on
# compute and can lead to out-of-disk errors on k8s nodes.
max_wal_size: "1024"
wal_keep_size: "0"
wal_level: "replica"
# Reduce amount of WAL generated by default.
wal_log_hints: "off"
# - without wal_sender_timeout set we don't get feedback messages,
# required for backpressure.
wal_sender_timeout: "10000"
# We have some experimental extensions, which we don't want users to install unknowingly.
# To install them, users would need to set the `neon.allow_unstable_extensions` setting.
# There are two of them currently:
# - `pgrag` - https://github.com/neondatabase-labs/pgrag - extension is actually called just `rag`,
# and two dependencies:
# - `rag_bge_small_en_v15`
# - `rag_jina_reranker_v1_tiny_en`
# - `pg_mooncake` - https://github.com/Mooncake-Labs/pg_mooncake/
neon.unstable_extensions: "rag,rag_bge_small_en_v15,rag_jina_reranker_v1_tiny_en,pg_mooncake,anon"
neon.protocol_version: "3"
password_encryption: "scram-sha-256"
# This is important to prevent Postgres from trying to perform
# a local WAL redo after a backend crash. It should exit and let
# systemd or k8s do a fresh startup with compute_ctl.
restart_after_crash: "off"
# By default 3. We have the following persistent connections in the VM:
# * compute_activity_monitor (from compute_ctl)
# * postgres-exporter (metrics collector; it has 2 connections)
# * sql_exporter (metrics collector; we have 2 instances [1 for us & users; 1 for autoscaling])
# * vm-monitor (to query & change file cache size)
# i.e. total of 6. Let's reserve 7, so there's still at least one left over.
superuser_reserved_connections: "7"
synchronous_standby_names: "walproposer"
replica:
hot_standby: "on"
per_version:
17:
common:
# PostgreSQL 17 has a new IO system called "read stream", which can combine IOs up to some
# size. It still has some issues with readahead, though, so we default to disabled/
# "no combining of IOs" to make sure we get the maximum prefetch depth.
# See also: https://github.com/neondatabase/neon/pull/9860
io_combine_limit: "1"
replica:
# prefetching of blocks referenced in WAL doesn't make sense for us
# Neon hot standby ignores pages that are not in the shared_buffers
recovery_prefetch: "off"
16:
common:
replica:
# prefetching of blocks referenced in WAL doesn't make sense for us
# Neon hot standby ignores pages that are not in the shared_buffers
recovery_prefetch: "off"
15:
common:
replica:
# prefetching of blocks referenced in WAL doesn't make sense for us
# Neon hot standby ignores pages that are not in the shared_buffers
recovery_prefetch: "off"
14:
common:
replica:


@@ -40,7 +40,7 @@ use std::sync::mpsc;
use std::thread;
use std::time::Duration;
use anyhow::{Context, Result};
use anyhow::{Context, Result, bail};
use clap::Parser;
use compute_api::responses::ComputeConfig;
use compute_tools::compute::{
@@ -57,14 +57,14 @@ use tracing::{error, info};
use url::Url;
use utils::failpoint_support;
#[derive(Parser)]
#[derive(Debug, Parser)]
#[command(rename_all = "kebab-case")]
struct Cli {
#[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")]
pub pgbin: String,
/// The base URL for the remote extension storage proxy gateway.
#[arg(short = 'r', long)]
#[arg(short = 'r', long, value_parser = Self::parse_remote_ext_base_url)]
pub remote_ext_base_url: Option<Url>,
/// The port to bind the external listening HTTP server to. Clients running
@@ -126,6 +126,25 @@ struct Cli {
pub installed_extensions_collection_interval: u64,
}
impl Cli {
/// Parse a URL from an argument. By default, this isn't necessary, but we
/// want to do some sanity checking.
fn parse_remote_ext_base_url(value: &str) -> Result<Url> {
// Remove extra trailing slashes, and add one. We use Url::join() later
// when downloading remote extensions. If the base URL is something like
// http://example.com/pg-ext-s3-gateway, and join() is called with
// something like "xyz", the resulting URL is http://example.com/xyz.
let value = value.trim_end_matches('/').to_owned() + "/";
let url = Url::parse(&value)?;
if url.query_pairs().count() != 0 {
bail!("parameters detected in remote extensions base URL")
}
Ok(url)
}
}
fn main() -> Result<()> {
let cli = Cli::parse();
@@ -252,7 +271,8 @@ fn handle_exit_signal(sig: i32) {
#[cfg(test)]
mod test {
use clap::CommandFactory;
use clap::{CommandFactory, Parser};
use url::Url;
use super::Cli;
@@ -260,4 +280,43 @@ mod test {
fn verify_cli() {
Cli::command().debug_assert()
}
#[test]
fn verify_remote_ext_base_url() {
let cli = Cli::parse_from([
"compute_ctl",
"--pgdata=test",
"--connstr=test",
"--compute-id=test",
"--remote-ext-base-url",
"https://example.com/subpath",
]);
assert_eq!(
cli.remote_ext_base_url.unwrap(),
Url::parse("https://example.com/subpath/").unwrap()
);
let cli = Cli::parse_from([
"compute_ctl",
"--pgdata=test",
"--connstr=test",
"--compute-id=test",
"--remote-ext-base-url",
"https://example.com//",
]);
assert_eq!(
cli.remote_ext_base_url.unwrap(),
Url::parse("https://example.com").unwrap()
);
Cli::try_parse_from([
"compute_ctl",
"--pgdata=test",
"--connstr=test",
"--compute-id=test",
"--remote-ext-base-url",
"https://example.com?hello=world",
])
.expect_err("URL parameters are not allowed");
}
}


@@ -3,7 +3,7 @@ use chrono::{DateTime, Utc};
use compute_api::privilege::Privilege;
use compute_api::responses::{
ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState,
LfcPrewarmState,
LfcPrewarmState, TlsConfig,
};
use compute_api::spec::{
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
@@ -396,7 +396,7 @@ impl ComputeNode {
// because QEMU will already have its memory allocated from the host, and
// the necessary binaries will already be cached.
if cli_spec.is_none() {
this.prewarm_postgres()?;
this.prewarm_postgres_vm_memory()?;
}
// Set the up metric with Empty status before starting the HTTP server.
@@ -603,6 +603,8 @@ impl ComputeNode {
});
}
let tls_config = self.tls_config(&pspec.spec);
// If there are any remote extensions in shared_preload_libraries, start downloading them
if pspec.spec.remote_extensions.is_some() {
let (this, spec) = (self.clone(), pspec.spec.clone());
@@ -659,7 +661,7 @@ impl ComputeNode {
info!("tuning pgbouncer");
let pgbouncer_settings = pgbouncer_settings.clone();
let tls_config = self.compute_ctl_config.tls.clone();
let tls_config = tls_config.clone();
// Spawn a background task to do the tuning,
// so that we don't block the main thread that starts Postgres.
@@ -678,7 +680,10 @@ impl ComputeNode {
// Spawn a background task to do the configuration,
// so that we don't block the main thread that starts Postgres.
let local_proxy = local_proxy.clone();
let mut local_proxy = local_proxy.clone();
local_proxy.tls = tls_config.clone();
let _handle = tokio::spawn(async move {
if let Err(err) = local_proxy::configure(&local_proxy) {
error!("error while configuring local_proxy: {err:?}");
@@ -779,7 +784,7 @@ impl ComputeNode {
// Spawn the extension stats background task
self.spawn_extension_stats_task();
if pspec.spec.prewarm_lfc_on_startup {
if pspec.spec.autoprewarm {
self.prewarm_lfc();
}
Ok(())
@@ -1205,13 +1210,15 @@ impl ComputeNode {
let spec = &pspec.spec;
let pgdata_path = Path::new(&self.params.pgdata);
let tls_config = self.tls_config(&pspec.spec);
// Remove/create an empty pgdata directory and put configuration there.
self.create_pgdata()?;
config::write_postgres_conf(
pgdata_path,
&pspec.spec,
self.params.internal_http_port,
&self.compute_ctl_config.tls,
tls_config,
)?;
// Syncing safekeepers is only safe with primary nodes: if a primary
@@ -1307,8 +1314,8 @@ impl ComputeNode {
}
/// Start and stop a postgres process to warm up the VM for startup.
pub fn prewarm_postgres(&self) -> Result<()> {
info!("prewarming");
pub fn prewarm_postgres_vm_memory(&self) -> Result<()> {
info!("prewarming VM memory");
// Create pgdata
let pgdata = &format!("{}.warmup", self.params.pgdata);
@@ -1350,7 +1357,7 @@ impl ComputeNode {
kill(pm_pid, Signal::SIGQUIT)?;
info!("sent SIGQUIT signal");
pg.wait()?;
info!("done prewarming");
info!("done prewarming vm memory");
// clean up
let _ok = fs::remove_dir_all(pgdata);
@@ -1536,14 +1543,22 @@ impl ComputeNode {
.clone(),
);
let mut tls_config = None::<TlsConfig>;
if spec.features.contains(&ComputeFeature::TlsExperimental) {
tls_config = self.compute_ctl_config.tls.clone();
}
let max_concurrent_connections = self.max_service_connections(compute_state, &spec);
// Merge-apply spec & changes to PostgreSQL state.
self.apply_spec_sql(spec.clone(), conf.clone(), max_concurrent_connections)?;
if let Some(local_proxy) = &spec.clone().local_proxy_config {
let mut local_proxy = local_proxy.clone();
local_proxy.tls = tls_config.clone();
info!("configuring local_proxy");
local_proxy::configure(local_proxy).context("apply_config local_proxy")?;
local_proxy::configure(&local_proxy).context("apply_config local_proxy")?;
}
// Run migrations separately to not hold up cold starts
@@ -1595,11 +1610,13 @@ impl ComputeNode {
pub fn reconfigure(&self) -> Result<()> {
let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
let tls_config = self.tls_config(&spec);
if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
info!("tuning pgbouncer");
let pgbouncer_settings = pgbouncer_settings.clone();
let tls_config = self.compute_ctl_config.tls.clone();
let tls_config = tls_config.clone();
// Spawn a background task to do the tuning,
// so that we don't block the main thread that starts Postgres.
@@ -1617,7 +1634,7 @@ impl ComputeNode {
// Spawn a background task to do the configuration,
// so that we don't block the main thread that starts Postgres.
let mut local_proxy = local_proxy.clone();
local_proxy.tls = self.compute_ctl_config.tls.clone();
local_proxy.tls = tls_config.clone();
tokio::spawn(async move {
if let Err(err) = local_proxy::configure(&local_proxy) {
error!("error while configuring local_proxy: {err:?}");
@@ -1635,7 +1652,7 @@ impl ComputeNode {
pgdata_path,
&spec,
self.params.internal_http_port,
&self.compute_ctl_config.tls,
tls_config,
)?;
if !spec.skip_pg_catalog_updates {
@@ -1755,6 +1772,14 @@ impl ComputeNode {
}
}
pub fn tls_config(&self, spec: &ComputeSpec) -> &Option<TlsConfig> {
if spec.features.contains(&ComputeFeature::TlsExperimental) {
&self.compute_ctl_config.tls
} else {
&None::<TlsConfig>
}
}
/// Update the `last_active` in the shared state, but ensure that it's a more recent one.
pub fn update_last_active(&self, last_active: Option<DateTime<Utc>>) {
let mut state = self.state.lock().unwrap();


@@ -166,7 +166,7 @@ pub async fn download_extension(
// TODO add retry logic
let download_buffer =
match download_extension_tar(remote_ext_base_url.as_str(), &ext_path.to_string()).await {
match download_extension_tar(remote_ext_base_url, &ext_path.to_string()).await {
Ok(buffer) => buffer,
Err(error_message) => {
return Err(anyhow::anyhow!(
@@ -271,10 +271,14 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
}
// Do request to extension storage proxy, e.g.,
// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
// curl http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/latest/v15/extensions/anon.tar.zst
// using HTTP GET and return the response body as bytes.
async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Result<Bytes> {
let uri = format!("{}/{}", remote_ext_base_url, ext_path);
async fn download_extension_tar(remote_ext_base_url: &Url, ext_path: &str) -> Result<Bytes> {
let uri = remote_ext_base_url.join(ext_path).with_context(|| {
format!(
"failed to create the remote extension URI for {ext_path} using {remote_ext_base_url}"
)
})?;
let filename = Path::new(ext_path)
.file_name()
.unwrap_or_else(|| std::ffi::OsStr::new("unknown"))
@@ -284,7 +288,7 @@ async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Re
info!("Downloading extension file '{}' from uri {}", filename, uri);
match do_extension_server_request(&uri).await {
match do_extension_server_request(uri).await {
Ok(resp) => {
info!("Successfully downloaded remote extension data {}", ext_path);
REMOTE_EXT_REQUESTS_TOTAL
@@ -303,7 +307,7 @@ async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Re
// Do a single remote extensions server request.
// Return result or (error message + stringified status code) in case of any failures.
async fn do_extension_server_request(uri: &str) -> Result<Bytes, (String, String)> {
async fn do_extension_server_request(uri: Url) -> Result<Bytes, (String, String)> {
let resp = reqwest::get(uri).await.map_err(|e| {
(
format!(


@@ -48,11 +48,9 @@ impl JsonResponse {
/// Create an error response related to the compute being in an invalid state
pub(self) fn invalid_status(status: ComputeStatus) -> Response {
Self::create_response(
Self::error(
StatusCode::PRECONDITION_FAILED,
&GenericAPIError {
error: format!("invalid compute status: {status}"),
},
format!("invalid compute status: {status}"),
)
}
}


@@ -22,7 +22,7 @@ pub(in crate::http) async fn configure(
State(compute): State<Arc<ComputeNode>>,
request: Json<ConfigurationRequest>,
) -> Response {
let pspec = match ParsedSpec::try_from(request.spec.clone()) {
let pspec = match ParsedSpec::try_from(request.0.spec) {
Ok(p) => p,
Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
};


@@ -13,6 +13,12 @@ use crate::metrics::{PG_CURR_DOWNTIME_MS, PG_TOTAL_DOWNTIME_MS};
const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
/// Struct to store runtime state of the compute monitor thread.
/// In theory, this could be a part of `Compute`, but i)
/// this state is expected to be accessed only by a single thread,
/// so we don't need to care about locking; ii) `Compute` is
/// already quite big. Thus, it seems to be a good idea to keep
/// all the activity/health monitoring parts here.
struct ComputeMonitor {
compute: Arc<ComputeNode>,
@@ -70,12 +76,36 @@ impl ComputeMonitor {
)
}
/// Check if compute is in some terminal or soon-to-be-terminal
/// state, then return `true`, signalling the caller that it
/// should exit gracefully. Otherwise, return `false`.
fn check_interrupts(&mut self) -> bool {
let compute_status = self.compute.get_status();
if matches!(
compute_status,
ComputeStatus::Terminated | ComputeStatus::TerminationPending | ComputeStatus::Failed
) {
info!(
"compute is in {} status, stopping compute monitor",
compute_status
);
return true;
}
false
}
/// Spin in a loop and figure out the last activity time in the Postgres.
/// Then update it in the shared state. This function never errors out.
/// Then update it in the shared state. This function currently never
/// errors out explicitly, but there is a graceful termination path.
/// Every time we receive an error trying to check Postgres, we use
/// [`ComputeMonitor::check_interrupts()`] because it could be that the
/// compute is already being terminated; then we can exit gracefully
/// and avoid producing error noise in the log.
/// NB: the only expected panic is at `Mutex` unwrap(), all other errors
/// should be handled gracefully.
#[instrument(skip_all)]
pub fn run(&mut self) {
pub fn run(&mut self) -> anyhow::Result<()> {
// Suppose that `connstr` doesn't change
let connstr = self.compute.params.connstr.clone();
let conf = self
@@ -93,6 +123,10 @@ impl ComputeMonitor {
info!("starting compute monitor for {}", connstr);
loop {
if self.check_interrupts() {
break;
}
match &mut client {
Ok(cli) => {
if cli.is_closed() {
@@ -100,6 +134,10 @@ impl ComputeMonitor {
downtime_info = self.downtime_info(),
"connection to Postgres is closed, trying to reconnect"
);
if self.check_interrupts() {
break;
}
self.report_down();
// Connection is closed, reconnect and try again.
@@ -111,15 +149,19 @@ impl ComputeMonitor {
self.compute.update_last_active(self.last_active);
}
Err(e) => {
error!(
downtime_info = self.downtime_info(),
"could not check Postgres: {}", e
);
if self.check_interrupts() {
break;
}
// Although we have many places where we can return errors in `check()`,
// normally it shouldn't happen. I.e., we will likely return error if
// connection got broken, query timed out, Postgres returned invalid data, etc.
// In all such cases it's suspicious, so let's report this as downtime.
self.report_down();
error!(
downtime_info = self.downtime_info(),
"could not check Postgres: {}", e
);
// Reconnect to Postgres just in case. During tests, I noticed
// that queries in `check()` can fail with `connection closed`,
@@ -136,6 +178,10 @@ impl ComputeMonitor {
downtime_info = self.downtime_info(),
"could not connect to Postgres: {}, retrying", e
);
if self.check_interrupts() {
break;
}
self.report_down();
// Establish a new connection and try again.
@@ -147,6 +193,9 @@ impl ComputeMonitor {
self.last_checked = Utc::now();
thread::sleep(MONITOR_CHECK_INTERVAL);
}
// Graceful termination path
Ok(())
}
#[instrument(skip_all)]
@@ -429,7 +478,10 @@ pub fn launch_monitor(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
.spawn(move || {
let span = span!(Level::INFO, "compute_monitor");
let _enter = span.enter();
monitor.run();
match monitor.run() {
Ok(_) => info!("compute monitor thread terminated gracefully"),
Err(err) => error!("compute monitor thread terminated abnormally {:?}", err),
}
})
.expect("cannot launch compute monitor thread")
}


@@ -30,7 +30,7 @@ mod pg_helpers_tests {
r#"fsync = off
wal_level = logical
hot_standby = on
prewarm_lfc_on_startup = off
autoprewarm = off
neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
wal_log_hints = on
log_connections = on


@@ -747,7 +747,7 @@ impl Endpoint {
logs_export_host: None::<String>,
endpoint_storage_addr: Some(endpoint_storage_addr),
endpoint_storage_token: Some(endpoint_storage_token),
prewarm_lfc_on_startup: false,
autoprewarm: false,
};
// this strange code is needed to support respec() in tests


@@ -513,11 +513,6 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'timeline_offloading' as bool")?,
wal_receiver_protocol_override: settings
.remove("wal_receiver_protocol_override")
.map(serde_json::from_str)
.transpose()
.context("parse `wal_receiver_protocol_override` from json")?,
rel_size_v2_enabled: settings
.remove("rel_size_v2_enabled")
.map(|x| x.parse::<bool>())


@@ -13,6 +13,6 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
jq \
netcat-openbsd
#This is required for the pg_hintplan test
RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw
RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw /ext-src/postgis-src/ && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw /ext-src/postgis-src
USER postgres


@@ -1,18 +1,18 @@
#!/bin/bash
#!/usr/bin/env bash
set -eux
# Generate a random tenant or timeline ID
#
# Takes a variable name as argument. The result is stored in that variable.
generate_id() {
local -n resvar=$1
printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
local -n resvar=${1}
printf -v resvar '%08x%08x%08x%08x' ${SRANDOM} ${SRANDOM} ${SRANDOM} ${SRANDOM}
}
PG_VERSION=${PG_VERSION:-14}
CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
CONFIG_FILE=/tmp/config.json
readonly CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
readonly CONFIG_FILE=/tmp/config.json
# Test that the first library path that the dynamic loader looks in is the path
# that we use for custom compiled software
@@ -20,17 +20,17 @@ first_path="$(ldconfig --verbose 2>/dev/null \
| grep --invert-match ^$'\t' \
| cut --delimiter=: --fields=1 \
| head --lines=1)"
test "$first_path" == '/usr/local/lib'
test "${first_path}" = '/usr/local/lib'
echo "Waiting pageserver become ready."
while ! nc -z pageserver 6400; do
sleep 1;
sleep 1
done
echo "Page server is ready."
cp ${CONFIG_FILE_ORG} ${CONFIG_FILE}
cp "${CONFIG_FILE_ORG}" "${CONFIG_FILE}"
if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then
if [[ -n "${TENANT_ID:-}" && -n "${TIMELINE_ID:-}" ]]; then
tenant_id=${TENANT_ID}
timeline_id=${TIMELINE_ID}
else
@@ -41,7 +41,7 @@ else
"http://pageserver:9898/v1/tenant"
)
tenant_id=$(curl "${PARAMS[@]}" | jq -r .[0].id)
if [ -z "${tenant_id}" ] || [ "${tenant_id}" = null ]; then
if [[ -z "${tenant_id}" || "${tenant_id}" = null ]]; then
echo "Create a tenant"
generate_id tenant_id
PARAMS=(
@@ -51,7 +51,7 @@ else
"http://pageserver:9898/v1/tenant/${tenant_id}/location_config"
)
result=$(curl "${PARAMS[@]}")
echo $result | jq .
printf '%s\n' "${result}" | jq .
fi
echo "Check if a timeline present"
@@ -61,7 +61,7 @@ else
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline"
)
timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id)
if [ -z "${timeline_id}" ] || [ "${timeline_id}" = null ]; then
if [[ -z "${timeline_id}" || "${timeline_id}" = null ]]; then
generate_id timeline_id
PARAMS=(
-sbf
@@ -71,7 +71,7 @@ else
"http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
)
result=$(curl "${PARAMS[@]}")
echo $result | jq .
printf '%s\n' "${result}" | jq .
fi
fi
@@ -82,10 +82,10 @@ else
fi
echo "Adding pgx_ulid"
shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE})
sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE}
sed -i "s|${shared_libraries}|${shared_libraries},${ulid_extension}|" ${CONFIG_FILE}
echo "Overwrite tenant id and timeline id in spec file"
sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE}
sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE}
sed -i "s|TENANT_ID|${tenant_id}|" ${CONFIG_FILE}
sed -i "s|TIMELINE_ID|${timeline_id}|" ${CONFIG_FILE}
cat ${CONFIG_FILE}
@@ -93,5 +93,5 @@ echo "Start compute node"
/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
-C "postgresql://cloud_admin@localhost:55433/postgres" \
-b /usr/local/bin/postgres \
--compute-id "compute-$RANDOM" \
--config "$CONFIG_FILE"
--compute-id "compute-${RANDOM}" \
--config "${CONFIG_FILE}"


@@ -186,13 +186,14 @@ services:
neon-test-extensions:
profiles: ["test-extensions"]
image: ${REPOSITORY:-ghcr.io/neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}}
image: ${REPOSITORY:-ghcr.io/neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-${PG_VERSION:-16}}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}}
environment:
- PGPASSWORD=cloud_admin
- PGUSER=${PGUSER:-cloud_admin}
- PGPASSWORD=${PGPASSWORD:-cloud_admin}
entrypoint:
- "/bin/bash"
- "-c"
command:
- sleep 1800
- sleep 3600
depends_on:
- compute


@@ -54,6 +54,15 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
# It cannot be moved to Dockerfile now because the database directory is created after the start of the container
echo Adding dummy config
docker compose exec compute touch /var/db/postgres/compute/compute_ctl_temp_override.conf
# Prepare for the PostGIS test
docker compose exec compute mkdir -p /tmp/pgis_reg/pgis_reg_tmp
TMPDIR=$(mktemp -d)
docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${TMPDIR}"
docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${TMPDIR}"
docker compose exec compute mkdir -p /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install
docker compose cp "${TMPDIR}/test" compute:/ext-src/postgis-src/raster/test
docker compose cp "${TMPDIR}/00-regress-install" compute:/ext-src/postgis-src/regress
rm -rf "${TMPDIR}"
# The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment
TMPDIR=$(mktemp -d)
docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${TMPDIR}/data"
@@ -68,7 +77,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
docker compose exec -T neon-test-extensions bash -c "(cd /postgres && patch -p1)" <"../compute/patches/contrib_pg${pg_version}.patch"
# We are running tests now
rm -f testout.txt testout_contrib.txt
docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \
docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \
neon-test-extensions /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
docker compose exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
neon-test-extensions /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0


@@ -0,0 +1,70 @@
# PostGIS Testing in Neon
This directory contains configuration files and patches for running PostGIS tests in the Neon database environment.
## Overview
PostGIS is a spatial database extension for PostgreSQL that adds support for geographic objects. Testing PostGIS compatibility ensures that Neon's modifications to PostgreSQL don't break compatibility with this critical extension.
## PostGIS Versions
- PostgreSQL v17: PostGIS 3.5.0
- PostgreSQL v14/v15/v16: PostGIS 3.3.3
## Test Configuration
The test setup includes:
- `postgis-no-upgrade-test.patch`: Disables upgrade tests by removing the upgrade test section from regress/runtest.mk
- `postgis-regular-v16.patch`: Version-specific patch for PostgreSQL v16
- `postgis-regular-v17.patch`: Version-specific patch for PostgreSQL v17
- `regular-test.sh`: Script to run PostGIS tests as a regular user
- `neon-test.sh`: Script to handle version-specific test configurations
- `raster_outdb_template.sql`: Template for raster tests with explicit file paths
## Excluded Tests
**Important Note:** The test exclusions listed below are specifically for regular-user tests against staging instances. These exclusions are necessary because staging instances run with limited privileges and cannot perform operations requiring superuser access. Docker-compose based tests are not affected by these exclusions.
### Tests Requiring Superuser Permissions
These tests cannot be run as a regular user:
- `estimatedextent`
- `regress/core/legacy`
- `regress/core/typmod`
- `regress/loader/TestSkipANALYZE`
- `regress/loader/TestANALYZE`
### Tests Requiring Filesystem Access
These tests need direct filesystem access that is only possible for superusers:
- `loader/load_outdb`
### Tests with Flaky Results
These tests have assumptions that don't always hold true:
- `regress/core/computed_columns` - Assumes computed columns always outperform alternatives, which is not consistently true
### Tests Requiring Tunable Parameter Modifications
These tests attempt to modify the `postgis.gdal_enabled_drivers` parameter, which is only accessible to superusers:
- `raster/test/regress/rt_wkb`
- `raster/test/regress/rt_addband`
- `raster/test/regress/rt_setbandpath`
- `raster/test/regress/rt_fromgdalraster`
- `raster/test/regress/rt_asgdalraster`
- `raster/test/regress/rt_astiff`
- `raster/test/regress/rt_asjpeg`
- `raster/test/regress/rt_aspng`
- `raster/test/regress/permitted_gdal_drivers`
- Loader tests: `BasicOutDB`, `Tiled10x10`, `Tiled10x10Copy`, `Tiled8x8`, `TiledAuto`, `TiledAutoSkipNoData`, `TiledAutoCopyn`
### Topology Tests (v17 only)
- `populate_topology_layer`
- `renametopogeometrycolumn`
## Other Modifications
- `binary.sql` tests are modified to use explicit file paths
- Server-side SQL COPY commands (which require superuser privileges) are converted to client-side `\copy` commands
- Upgrade tests are disabled

View File

@@ -0,0 +1,6 @@
#!/bin/sh
set -ex
cd "$(dirname "$0")"
patch -p1 <"postgis-common-${PG_VERSION}.patch"
trap 'echo Cleaning up; patch -R -p1 <postgis-common-${PG_VERSION}.patch' EXIT
make installcheck-base

View File

@@ -0,0 +1,37 @@
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
index 3abd7bc..64a9254 100644
--- a/regress/core/tests.mk
+++ b/regress/core/tests.mk
@@ -144,11 +144,6 @@ TESTS_SLOW = \
$(top_srcdir)/regress/core/concave_hull_hard \
$(top_srcdir)/regress/core/knn_recheck
-ifeq ($(shell expr "$(POSTGIS_PGSQL_VERSION)" ">=" 120),1)
- TESTS += \
- $(top_srcdir)/regress/core/computed_columns
-endif
-
ifeq ($(shell expr "$(POSTGIS_GEOS_VERSION)" ">=" 30700),1)
# GEOS-3.7 adds:
# ST_FrechetDistance
diff --git a/regress/runtest.mk b/regress/runtest.mk
index c051f03..010e493 100644
--- a/regress/runtest.mk
+++ b/regress/runtest.mk
@@ -24,16 +24,6 @@ check-regress:
POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(RUNTESTFLAGS_INTERNAL) $(TESTS)
- @if echo "$(RUNTESTFLAGS)" | grep -vq -- --upgrade; then \
- echo "Running upgrade test as RUNTESTFLAGS did not contain that"; \
- POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl \
- --upgrade \
- $(RUNTESTFLAGS) \
- $(RUNTESTFLAGS_INTERNAL) \
- $(TESTS); \
- else \
- echo "Skipping upgrade test as RUNTESTFLAGS already requested upgrades"; \
- fi
check-long:
$(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(TESTS) $(TESTS_SLOW)

View File

@@ -0,0 +1,35 @@
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
index 9e05244..90987df 100644
--- a/regress/core/tests.mk
+++ b/regress/core/tests.mk
@@ -143,8 +143,7 @@ TESTS += \
$(top_srcdir)/regress/core/oriented_envelope \
$(top_srcdir)/regress/core/point_coordinates \
$(top_srcdir)/regress/core/out_geojson \
- $(top_srcdir)/regress/core/wrapx \
- $(top_srcdir)/regress/core/computed_columns
+ $(top_srcdir)/regress/core/wrapx
# Slow slow tests
TESTS_SLOW = \
diff --git a/regress/runtest.mk b/regress/runtest.mk
index 4b95b7e..449d5a2 100644
--- a/regress/runtest.mk
+++ b/regress/runtest.mk
@@ -24,16 +24,6 @@ check-regress:
@POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(RUNTESTFLAGS_INTERNAL) $(TESTS)
- @if echo "$(RUNTESTFLAGS)" | grep -vq -- --upgrade; then \
- echo "Running upgrade test as RUNTESTFLAGS did not contain that"; \
- POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl \
- --upgrade \
- $(RUNTESTFLAGS) \
- $(RUNTESTFLAGS_INTERNAL) \
- $(TESTS); \
- else \
- echo "Skipping upgrade test as RUNTESTFLAGS already requested upgrades"; \
- fi
check-long:
$(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(TESTS) $(TESTS_SLOW)

View File

@@ -0,0 +1,186 @@
diff --git a/raster/test/regress/tests.mk b/raster/test/regress/tests.mk
index 00918e1..7e2b6cd 100644
--- a/raster/test/regress/tests.mk
+++ b/raster/test/regress/tests.mk
@@ -17,9 +17,7 @@ override RUNTESTFLAGS_INTERNAL := \
$(RUNTESTFLAGS_INTERNAL) \
--after-upgrade-script $(top_srcdir)/raster/test/regress/hooks/hook-after-upgrade-raster.sql
-RASTER_TEST_FIRST = \
- $(top_srcdir)/raster/test/regress/check_gdal \
- $(top_srcdir)/raster/test/regress/loader/load_outdb
+RASTER_TEST_FIRST =
RASTER_TEST_LAST = \
$(top_srcdir)/raster/test/regress/clean
@@ -33,9 +31,7 @@ RASTER_TEST_IO = \
RASTER_TEST_BASIC_FUNC = \
$(top_srcdir)/raster/test/regress/rt_bytea \
- $(top_srcdir)/raster/test/regress/rt_wkb \
$(top_srcdir)/raster/test/regress/box3d \
- $(top_srcdir)/raster/test/regress/rt_addband \
$(top_srcdir)/raster/test/regress/rt_band \
$(top_srcdir)/raster/test/regress/rt_tile
@@ -73,16 +69,10 @@ RASTER_TEST_BANDPROPS = \
$(top_srcdir)/raster/test/regress/rt_neighborhood \
$(top_srcdir)/raster/test/regress/rt_nearestvalue \
$(top_srcdir)/raster/test/regress/rt_pixelofvalue \
- $(top_srcdir)/raster/test/regress/rt_polygon \
- $(top_srcdir)/raster/test/regress/rt_setbandpath
+ $(top_srcdir)/raster/test/regress/rt_polygon
RASTER_TEST_UTILITY = \
$(top_srcdir)/raster/test/regress/rt_utility \
- $(top_srcdir)/raster/test/regress/rt_fromgdalraster \
- $(top_srcdir)/raster/test/regress/rt_asgdalraster \
- $(top_srcdir)/raster/test/regress/rt_astiff \
- $(top_srcdir)/raster/test/regress/rt_asjpeg \
- $(top_srcdir)/raster/test/regress/rt_aspng \
$(top_srcdir)/raster/test/regress/rt_reclass \
$(top_srcdir)/raster/test/regress/rt_gdalwarp \
$(top_srcdir)/raster/test/regress/rt_gdalcontour \
@@ -120,21 +110,13 @@ RASTER_TEST_SREL = \
RASTER_TEST_BUGS = \
$(top_srcdir)/raster/test/regress/bug_test_car5 \
- $(top_srcdir)/raster/test/regress/permitted_gdal_drivers \
$(top_srcdir)/raster/test/regress/tickets
RASTER_TEST_LOADER = \
$(top_srcdir)/raster/test/regress/loader/Basic \
$(top_srcdir)/raster/test/regress/loader/Projected \
$(top_srcdir)/raster/test/regress/loader/BasicCopy \
- $(top_srcdir)/raster/test/regress/loader/BasicFilename \
- $(top_srcdir)/raster/test/regress/loader/BasicOutDB \
- $(top_srcdir)/raster/test/regress/loader/Tiled10x10 \
- $(top_srcdir)/raster/test/regress/loader/Tiled10x10Copy \
- $(top_srcdir)/raster/test/regress/loader/Tiled8x8 \
- $(top_srcdir)/raster/test/regress/loader/TiledAuto \
- $(top_srcdir)/raster/test/regress/loader/TiledAutoSkipNoData \
- $(top_srcdir)/raster/test/regress/loader/TiledAutoCopyn
+ $(top_srcdir)/raster/test/regress/loader/BasicFilename
RASTER_TESTS := $(RASTER_TEST_FIRST) \
$(RASTER_TEST_METADATA) $(RASTER_TEST_IO) $(RASTER_TEST_BASIC_FUNC) \
diff --git a/regress/core/binary.sql b/regress/core/binary.sql
index 7a36b65..ad78fc7 100644
--- a/regress/core/binary.sql
+++ b/regress/core/binary.sql
@@ -1,4 +1,5 @@
SET client_min_messages TO warning;
+
CREATE SCHEMA tm;
CREATE TABLE tm.geoms (id serial, g geometry);
@@ -31,24 +32,39 @@ SELECT st_force4d(g) FROM tm.geoms WHERE id < 15 ORDER BY id;
INSERT INTO tm.geoms(g)
SELECT st_setsrid(g,4326) FROM tm.geoms ORDER BY id;
-COPY tm.geoms TO :tmpfile WITH BINARY;
+-- define temp file path
+\set tmpfile '/tmp/postgis_binary_test.dat'
+
+-- export
+\set command '\\copy tm.geoms TO ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+-- import
CREATE TABLE tm.geoms_in AS SELECT * FROM tm.geoms LIMIT 0;
-COPY tm.geoms_in FROM :tmpfile WITH BINARY;
-SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o WHERE i.id = o.id
- AND ST_OrderingEquals(i.g, o.g);
+\set command '\\copy tm.geoms_in FROM ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o
+WHERE i.id = o.id AND ST_OrderingEquals(i.g, o.g);
CREATE TABLE tm.geogs AS SELECT id,g::geography FROM tm.geoms
WHERE geometrytype(g) NOT LIKE '%CURVE%'
AND geometrytype(g) NOT LIKE '%CIRCULAR%'
AND geometrytype(g) NOT LIKE '%SURFACE%'
AND geometrytype(g) NOT LIKE 'TRIANGLE%'
- AND geometrytype(g) NOT LIKE 'TIN%'
-;
+ AND geometrytype(g) NOT LIKE 'TIN%';
-COPY tm.geogs TO :tmpfile WITH BINARY;
+-- export
+\set command '\\copy tm.geogs TO ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+-- import
CREATE TABLE tm.geogs_in AS SELECT * FROM tm.geogs LIMIT 0;
-COPY tm.geogs_in FROM :tmpfile WITH BINARY;
-SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o WHERE i.id = o.id
- AND ST_OrderingEquals(i.g::geometry, o.g::geometry);
+\set command '\\copy tm.geogs_in FROM ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o
+WHERE i.id = o.id AND ST_OrderingEquals(i.g::geometry, o.g::geometry);
DROP SCHEMA tm CASCADE;
+
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
index 64a9254..94903c3 100644
--- a/regress/core/tests.mk
+++ b/regress/core/tests.mk
@@ -23,7 +23,6 @@ current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
RUNTESTFLAGS_INTERNAL += \
--before-upgrade-script $(top_srcdir)/regress/hooks/hook-before-upgrade.sql \
--after-upgrade-script $(top_srcdir)/regress/hooks/hook-after-upgrade.sql \
- --after-create-script $(top_srcdir)/regress/hooks/hook-after-create.sql \
--before-uninstall-script $(top_srcdir)/regress/hooks/hook-before-uninstall.sql
TESTS += \
@@ -40,7 +39,6 @@ TESTS += \
$(top_srcdir)/regress/core/dumppoints \
$(top_srcdir)/regress/core/dumpsegments \
$(top_srcdir)/regress/core/empty \
- $(top_srcdir)/regress/core/estimatedextent \
$(top_srcdir)/regress/core/forcecurve \
$(top_srcdir)/regress/core/flatgeobuf \
$(top_srcdir)/regress/core/geography \
@@ -55,7 +53,6 @@ TESTS += \
$(top_srcdir)/regress/core/out_marc21 \
$(top_srcdir)/regress/core/in_encodedpolyline \
$(top_srcdir)/regress/core/iscollection \
- $(top_srcdir)/regress/core/legacy \
$(top_srcdir)/regress/core/letters \
$(top_srcdir)/regress/core/long_xact \
$(top_srcdir)/regress/core/lwgeom_regress \
@@ -112,7 +109,6 @@ TESTS += \
$(top_srcdir)/regress/core/temporal_knn \
$(top_srcdir)/regress/core/tickets \
$(top_srcdir)/regress/core/twkb \
- $(top_srcdir)/regress/core/typmod \
$(top_srcdir)/regress/core/wkb \
$(top_srcdir)/regress/core/wkt \
$(top_srcdir)/regress/core/wmsservers \
diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk
index 1fc77ac..c3cb9de 100644
--- a/regress/loader/tests.mk
+++ b/regress/loader/tests.mk
@@ -38,7 +38,5 @@ TESTS += \
$(top_srcdir)/regress/loader/Latin1 \
$(top_srcdir)/regress/loader/Latin1-implicit \
$(top_srcdir)/regress/loader/mfile \
- $(top_srcdir)/regress/loader/TestSkipANALYZE \
- $(top_srcdir)/regress/loader/TestANALYZE \
$(top_srcdir)/regress/loader/CharNoWidth
diff --git a/regress/run_test.pl b/regress/run_test.pl
index 0ec5b2d..1c331f4 100755
--- a/regress/run_test.pl
+++ b/regress/run_test.pl
@@ -147,7 +147,6 @@ $ENV{"LANG"} = "C";
# Add locale info to the psql options
# Add pg12 precision suppression
my $PGOPTIONS = $ENV{"PGOPTIONS"};
-$PGOPTIONS .= " -c lc_messages=C";
$PGOPTIONS .= " -c client_min_messages=NOTICE";
$PGOPTIONS .= " -c extra_float_digits=0";
$ENV{"PGOPTIONS"} = $PGOPTIONS;

View File

@@ -0,0 +1,208 @@
diff --git a/raster/test/regress/tests.mk b/raster/test/regress/tests.mk
index 00918e1..7e2b6cd 100644
--- a/raster/test/regress/tests.mk
+++ b/raster/test/regress/tests.mk
@@ -17,9 +17,7 @@ override RUNTESTFLAGS_INTERNAL := \
$(RUNTESTFLAGS_INTERNAL) \
--after-upgrade-script $(top_srcdir)/raster/test/regress/hooks/hook-after-upgrade-raster.sql
-RASTER_TEST_FIRST = \
- $(top_srcdir)/raster/test/regress/check_gdal \
- $(top_srcdir)/raster/test/regress/loader/load_outdb
+RASTER_TEST_FIRST =
RASTER_TEST_LAST = \
$(top_srcdir)/raster/test/regress/clean
@@ -33,9 +31,7 @@ RASTER_TEST_IO = \
RASTER_TEST_BASIC_FUNC = \
$(top_srcdir)/raster/test/regress/rt_bytea \
- $(top_srcdir)/raster/test/regress/rt_wkb \
$(top_srcdir)/raster/test/regress/box3d \
- $(top_srcdir)/raster/test/regress/rt_addband \
$(top_srcdir)/raster/test/regress/rt_band \
$(top_srcdir)/raster/test/regress/rt_tile
@@ -73,16 +69,10 @@ RASTER_TEST_BANDPROPS = \
$(top_srcdir)/raster/test/regress/rt_neighborhood \
$(top_srcdir)/raster/test/regress/rt_nearestvalue \
$(top_srcdir)/raster/test/regress/rt_pixelofvalue \
- $(top_srcdir)/raster/test/regress/rt_polygon \
- $(top_srcdir)/raster/test/regress/rt_setbandpath
+ $(top_srcdir)/raster/test/regress/rt_polygon
RASTER_TEST_UTILITY = \
$(top_srcdir)/raster/test/regress/rt_utility \
- $(top_srcdir)/raster/test/regress/rt_fromgdalraster \
- $(top_srcdir)/raster/test/regress/rt_asgdalraster \
- $(top_srcdir)/raster/test/regress/rt_astiff \
- $(top_srcdir)/raster/test/regress/rt_asjpeg \
- $(top_srcdir)/raster/test/regress/rt_aspng \
$(top_srcdir)/raster/test/regress/rt_reclass \
$(top_srcdir)/raster/test/regress/rt_gdalwarp \
$(top_srcdir)/raster/test/regress/rt_gdalcontour \
@@ -120,21 +110,13 @@ RASTER_TEST_SREL = \
RASTER_TEST_BUGS = \
$(top_srcdir)/raster/test/regress/bug_test_car5 \
- $(top_srcdir)/raster/test/regress/permitted_gdal_drivers \
$(top_srcdir)/raster/test/regress/tickets
RASTER_TEST_LOADER = \
$(top_srcdir)/raster/test/regress/loader/Basic \
$(top_srcdir)/raster/test/regress/loader/Projected \
$(top_srcdir)/raster/test/regress/loader/BasicCopy \
- $(top_srcdir)/raster/test/regress/loader/BasicFilename \
- $(top_srcdir)/raster/test/regress/loader/BasicOutDB \
- $(top_srcdir)/raster/test/regress/loader/Tiled10x10 \
- $(top_srcdir)/raster/test/regress/loader/Tiled10x10Copy \
- $(top_srcdir)/raster/test/regress/loader/Tiled8x8 \
- $(top_srcdir)/raster/test/regress/loader/TiledAuto \
- $(top_srcdir)/raster/test/regress/loader/TiledAutoSkipNoData \
- $(top_srcdir)/raster/test/regress/loader/TiledAutoCopyn
+ $(top_srcdir)/raster/test/regress/loader/BasicFilename
RASTER_TESTS := $(RASTER_TEST_FIRST) \
$(RASTER_TEST_METADATA) $(RASTER_TEST_IO) $(RASTER_TEST_BASIC_FUNC) \
diff --git a/regress/core/binary.sql b/regress/core/binary.sql
index 7a36b65..ad78fc7 100644
--- a/regress/core/binary.sql
+++ b/regress/core/binary.sql
@@ -1,4 +1,5 @@
SET client_min_messages TO warning;
+
CREATE SCHEMA tm;
CREATE TABLE tm.geoms (id serial, g geometry);
@@ -31,24 +32,39 @@ SELECT st_force4d(g) FROM tm.geoms WHERE id < 15 ORDER BY id;
INSERT INTO tm.geoms(g)
SELECT st_setsrid(g,4326) FROM tm.geoms ORDER BY id;
-COPY tm.geoms TO :tmpfile WITH BINARY;
+-- define temp file path
+\set tmpfile '/tmp/postgis_binary_test.dat'
+
+-- export
+\set command '\\copy tm.geoms TO ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+-- import
CREATE TABLE tm.geoms_in AS SELECT * FROM tm.geoms LIMIT 0;
-COPY tm.geoms_in FROM :tmpfile WITH BINARY;
-SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o WHERE i.id = o.id
- AND ST_OrderingEquals(i.g, o.g);
+\set command '\\copy tm.geoms_in FROM ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o
+WHERE i.id = o.id AND ST_OrderingEquals(i.g, o.g);
CREATE TABLE tm.geogs AS SELECT id,g::geography FROM tm.geoms
WHERE geometrytype(g) NOT LIKE '%CURVE%'
AND geometrytype(g) NOT LIKE '%CIRCULAR%'
AND geometrytype(g) NOT LIKE '%SURFACE%'
AND geometrytype(g) NOT LIKE 'TRIANGLE%'
- AND geometrytype(g) NOT LIKE 'TIN%'
-;
+ AND geometrytype(g) NOT LIKE 'TIN%';
-COPY tm.geogs TO :tmpfile WITH BINARY;
+-- export
+\set command '\\copy tm.geogs TO ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+-- import
CREATE TABLE tm.geogs_in AS SELECT * FROM tm.geogs LIMIT 0;
-COPY tm.geogs_in FROM :tmpfile WITH BINARY;
-SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o WHERE i.id = o.id
- AND ST_OrderingEquals(i.g::geometry, o.g::geometry);
+\set command '\\copy tm.geogs_in FROM ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o
+WHERE i.id = o.id AND ST_OrderingEquals(i.g::geometry, o.g::geometry);
DROP SCHEMA tm CASCADE;
+
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
index 90987df..74fe3f1 100644
--- a/regress/core/tests.mk
+++ b/regress/core/tests.mk
@@ -16,14 +16,13 @@ POSTGIS_PGSQL_VERSION=170
POSTGIS_GEOS_VERSION=31101
HAVE_JSON=yes
HAVE_SPGIST=yes
-INTERRUPTTESTS=yes
+INTERRUPTTESTS=no
current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
RUNTESTFLAGS_INTERNAL += \
--before-upgrade-script $(top_srcdir)/regress/hooks/hook-before-upgrade.sql \
--after-upgrade-script $(top_srcdir)/regress/hooks/hook-after-upgrade.sql \
- --after-create-script $(top_srcdir)/regress/hooks/hook-after-create.sql \
--before-uninstall-script $(top_srcdir)/regress/hooks/hook-before-uninstall.sql
TESTS += \
@@ -40,7 +39,6 @@ TESTS += \
$(top_srcdir)/regress/core/dumppoints \
$(top_srcdir)/regress/core/dumpsegments \
$(top_srcdir)/regress/core/empty \
- $(top_srcdir)/regress/core/estimatedextent \
$(top_srcdir)/regress/core/forcecurve \
$(top_srcdir)/regress/core/flatgeobuf \
$(top_srcdir)/regress/core/frechet \
@@ -60,7 +58,6 @@ TESTS += \
$(top_srcdir)/regress/core/out_marc21 \
$(top_srcdir)/regress/core/in_encodedpolyline \
$(top_srcdir)/regress/core/iscollection \
- $(top_srcdir)/regress/core/legacy \
$(top_srcdir)/regress/core/letters \
$(top_srcdir)/regress/core/lwgeom_regress \
$(top_srcdir)/regress/core/measures \
@@ -119,7 +116,6 @@ TESTS += \
$(top_srcdir)/regress/core/temporal_knn \
$(top_srcdir)/regress/core/tickets \
$(top_srcdir)/regress/core/twkb \
- $(top_srcdir)/regress/core/typmod \
$(top_srcdir)/regress/core/wkb \
$(top_srcdir)/regress/core/wkt \
$(top_srcdir)/regress/core/wmsservers \
diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk
index ac4f8ad..4bad4fc 100644
--- a/regress/loader/tests.mk
+++ b/regress/loader/tests.mk
@@ -38,7 +38,5 @@ TESTS += \
$(top_srcdir)/regress/loader/Latin1 \
$(top_srcdir)/regress/loader/Latin1-implicit \
$(top_srcdir)/regress/loader/mfile \
- $(top_srcdir)/regress/loader/TestSkipANALYZE \
- $(top_srcdir)/regress/loader/TestANALYZE \
$(top_srcdir)/regress/loader/CharNoWidth \
diff --git a/regress/run_test.pl b/regress/run_test.pl
index cac4b2e..4c7c82b 100755
--- a/regress/run_test.pl
+++ b/regress/run_test.pl
@@ -238,7 +238,6 @@ $ENV{"LANG"} = "C";
# Add locale info to the psql options
# Add pg12 precision suppression
my $PGOPTIONS = $ENV{"PGOPTIONS"};
-$PGOPTIONS .= " -c lc_messages=C";
$PGOPTIONS .= " -c client_min_messages=NOTICE";
$PGOPTIONS .= " -c extra_float_digits=0";
$ENV{"PGOPTIONS"} = $PGOPTIONS;
diff --git a/topology/test/tests.mk b/topology/test/tests.mk
index cbe2633..2c7c18f 100644
--- a/topology/test/tests.mk
+++ b/topology/test/tests.mk
@@ -46,9 +46,7 @@ TESTS += \
$(top_srcdir)/topology/test/regress/legacy_query.sql \
$(top_srcdir)/topology/test/regress/legacy_validate.sql \
$(top_srcdir)/topology/test/regress/polygonize.sql \
- $(top_srcdir)/topology/test/regress/populate_topology_layer.sql \
$(top_srcdir)/topology/test/regress/removeunusedprimitives.sql \
- $(top_srcdir)/topology/test/regress/renametopogeometrycolumn.sql \
$(top_srcdir)/topology/test/regress/renametopology.sql \
$(top_srcdir)/topology/test/regress/share_sequences.sql \
$(top_srcdir)/topology/test/regress/sqlmm.sql \

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,17 @@
#!/bin/bash
set -ex
cd "$(dirname "${0}")"
dropdb --if-exists contrib_regression
createdb contrib_regression
psql -d contrib_regression -c "ALTER DATABASE contrib_regression SET TimeZone='UTC'" \
-c "ALTER DATABASE contrib_regression SET DateStyle='ISO, MDY'" \
-c "CREATE EXTENSION postgis SCHEMA public" \
-c "CREATE EXTENSION postgis_topology" \
-c "CREATE EXTENSION postgis_tiger_geocoder CASCADE" \
-c "CREATE EXTENSION postgis_raster SCHEMA public" \
-c "CREATE EXTENSION postgis_sfcgal SCHEMA public"
patch -p1 <"postgis-common-${PG_VERSION}.patch"
patch -p1 <"postgis-regular-${PG_VERSION}.patch"
psql -d contrib_regression -f raster_outdb_template.sql
trap 'patch -R -p1 <postgis-regular-${PG_VERSION}.patch && patch -R -p1 <"postgis-common-${PG_VERSION}.patch"' EXIT
POSTGIS_REGRESS_DB=contrib_regression RUNTESTFLAGS=--nocreate make installcheck-base

View File

@@ -63,5 +63,9 @@ done
for d in ${FAILED}; do
cat "$(find $d -name regression.diffs)"
done
for postgis_diff in /tmp/pgis_reg/*_diff; do
echo "${postgis_diff}:"
cat "${postgis_diff}"
done
echo "${FAILED}"
exit 1

View File

@@ -178,9 +178,9 @@ pub struct ComputeSpec {
/// JWT for authorizing requests to endpoint storage service
pub endpoint_storage_token: Option<String>,
-/// If true, download LFC state from endpoint_storage and pass it to Postgres on startup
+/// Download LFC state from endpoint_storage and pass it to Postgres on startup
#[serde(default)]
-pub prewarm_lfc_on_startup: bool,
+pub autoprewarm: bool,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -192,6 +192,9 @@ pub enum ComputeFeature {
/// track short-lived connections as user activity.
ActivityMonitorExperimental,
/// Enable TLS functionality.
TlsExperimental,
/// This is a special feature flag that is used to represent unknown feature flags.
/// Basically all unknown to enum flags are represented as this one. See unit test
/// `parse_unknown_features()` for more details.
@@ -250,34 +253,44 @@ impl RemoteExtSpec {
}
match self.extension_data.get(real_ext_name) {
Some(_ext_data) => {
// We have decided to use the Go naming convention due to Kubernetes.
let arch = match std::env::consts::ARCH {
"x86_64" => "amd64",
"aarch64" => "arm64",
arch => arch,
};
// Construct the path to the extension archive
// BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
//
// Keep it in sync with path generation in
// https://github.com/neondatabase/build-custom-extensions/tree/main
let archive_path_str = format!(
"{build_tag}/{arch}/{pg_major_version}/extensions/{real_ext_name}.tar.zst"
);
Ok((
real_ext_name.to_string(),
RemotePath::from_string(&archive_path_str)?,
))
}
Some(_ext_data) => Ok((
real_ext_name.to_string(),
Self::build_remote_path(build_tag, pg_major_version, real_ext_name)?,
)),
None => Err(anyhow::anyhow!(
"real_ext_name {} is not found",
real_ext_name
)),
}
}
/// Get the architecture-specific portion of the remote extension path. We
/// use the Go naming convention due to Kubernetes.
fn get_arch() -> &'static str {
match std::env::consts::ARCH {
"x86_64" => "amd64",
"aarch64" => "arm64",
arch => arch,
}
}
/// Build a [`RemotePath`] for an extension.
fn build_remote_path(
build_tag: &str,
pg_major_version: &str,
ext_name: &str,
) -> anyhow::Result<RemotePath> {
let arch = Self::get_arch();
// Construct the path to the extension archive
// BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
//
// Keep it in sync with path generation in
// https://github.com/neondatabase/build-custom-extensions/tree/main
RemotePath::from_string(&format!(
"{build_tag}/{arch}/{pg_major_version}/extensions/{ext_name}.tar.zst"
))
}
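// For example, build_remote_path("latest", "v17", "ext") on an x86_64 host
// yields "latest/amd64/v17/extensions/ext.tar.zst" (exercised by the
// remote_extension_path unit test below).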
}
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
@@ -518,6 +531,37 @@ mod tests {
.expect("Library should be found");
}
#[test]
fn remote_extension_path() {
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
"public_extensions": ["ext"],
"custom_extensions": [],
"library_index": {
"extlib": "ext",
},
"extension_data": {
"ext": {
"control_data": {
"ext.control": ""
},
"archive_path": ""
}
},
}))
.unwrap();
let (_ext_name, ext_path) = rspec
.get_ext("ext", false, "latest", "v17")
.expect("Extension should be found");
// Starting with a forward slash would have consequences for the
// Url::join() that occurs when downloading a remote extension.
assert!(!ext_path.to_string().starts_with("/"));
assert_eq!(
ext_path,
RemoteExtSpec::build_remote_path("latest", "v17", "ext").unwrap()
);
}
#[test]
fn parse_spec_file() {
let file = File::open("tests/cluster_spec.json").unwrap();

View File

@@ -85,7 +85,7 @@
"vartype": "bool"
},
{
"name": "prewarm_lfc_on_startup",
"name": "autoprewarm",
"value": "off",
"vartype": "bool"
},

View File

@@ -6,8 +6,20 @@ license.workspace = true
[dependencies]
thiserror.workspace = true
-nix.workspace=true
+nix.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
rustc-hash = { version = "2.1.1" }
[dev-dependencies]
criterion = { workspace = true, features = ["html_reports"] }
rand = "0.9.1"
rand_distr = "0.5.1"
xxhash-rust = { version = "0.8.15", features = ["xxh3"] }
ahash.workspace = true
[target.'cfg(target_os = "macos")'.dependencies]
tempfile = "3.14.0"
[[bench]]
name = "hmap_resize"
harness = false

libs/neon-shmem/src/hash.rs
View File

@@ -0,0 +1,438 @@
//! Hash table implementation on top of 'shmem'
//!
//! Features required in the long run by the communicator project:
//!
//! [X] Accessible from both Postgres processes and rust threads in the communicator process
//! [X] Low latency
//! [ ] Scalable to lots of concurrent accesses (currently relies on caller for locking)
//! [ ] Resizable
use std::fmt::Debug;
use std::hash::{Hash, Hasher, BuildHasher};
use std::mem::MaybeUninit;
use rustc_hash::FxBuildHasher;
use crate::shmem::ShmemHandle;
mod core;
pub mod entry;
#[cfg(test)]
mod tests;
mod optim;
use core::{CoreHashMap, INVALID_POS};
use entry::{Entry, OccupiedEntry};
pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> {
// Hash table can be allocated in a fixed memory area, or in a resizeable ShmemHandle.
shmem_handle: Option<ShmemHandle>,
shared_ptr: *mut HashMapShared<'a, K, V>,
shared_size: usize,
hasher: S,
num_buckets: u32,
}
pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> {
shmem_handle: Option<ShmemHandle>,
shared_ptr: *mut HashMapShared<'a, K, V>,
hasher: S,
}
unsafe impl<'a, K: Sync, V: Sync, S> Sync for HashMapAccess<'a, K, V, S> {}
unsafe impl<'a, K: Send, V: Send, S> Send for HashMapAccess<'a, K, V, S> {}
impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
pub fn with_hasher(self, hasher: S) -> HashMapInit<'a, K, V, S> {
Self { hasher, ..self }
}
pub fn estimate_size(num_buckets: u32) -> usize {
// add some margin to cover alignment etc.
CoreHashMap::<K, V>::estimate_size(num_buckets) + size_of::<HashMapShared<K, V>>() + 1000
}
pub fn attach_writer(self) -> HashMapAccess<'a, K, V, S> {
let mut ptr: *mut u8 = self.shared_ptr.cast();
let end_ptr: *mut u8 = unsafe { ptr.add(self.shared_size) };
ptr = unsafe { ptr.add(ptr.align_offset(align_of::<HashMapShared<K, V>>())) };
let shared_ptr: *mut HashMapShared<K, V> = ptr.cast();
ptr = unsafe { ptr.add(size_of::<HashMapShared<K, V>>()) };
// carve out the buckets
ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<core::LinkedKey<K>>())) };
let keys_ptr = ptr;
ptr = unsafe { ptr.add(size_of::<core::LinkedKey<K>>() * self.num_buckets as usize) };
ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<Option<V>>())) };
let vals_ptr = ptr;
ptr = unsafe { ptr.add(size_of::<Option<V>>() * self.num_buckets as usize) };
// use remaining space for the dictionary
ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<u32>())) };
assert!(ptr.addr() < end_ptr.addr());
let dictionary_ptr = ptr;
let dictionary_size = unsafe { end_ptr.byte_offset_from(ptr) / size_of::<u32>() as isize };
assert!(dictionary_size > 0);
let keys =
unsafe { std::slice::from_raw_parts_mut(keys_ptr.cast(), self.num_buckets as usize) };
let vals =
unsafe { std::slice::from_raw_parts_mut(vals_ptr.cast(), self.num_buckets as usize) };
let dictionary = unsafe {
std::slice::from_raw_parts_mut(dictionary_ptr.cast(), dictionary_size as usize)
};
let hashmap = CoreHashMap::new(keys, vals, dictionary);
unsafe {
std::ptr::write(shared_ptr, HashMapShared { inner: hashmap });
}
HashMapAccess {
shmem_handle: self.shmem_handle,
shared_ptr: self.shared_ptr,
hasher: self.hasher,
}
}
pub fn attach_reader(self) -> HashMapAccess<'a, K, V, S> {
// no difference to attach_writer currently
self.attach_writer()
}
}
/// This is stored in the shared memory area
///
/// NOTE: We carve out the parts from a contiguous chunk. Growing and shrinking the hash table
/// relies on the memory layout! The data structures are laid out in the contiguous shared memory
/// area as follows:
///
/// HashMapShared
/// [buckets]
/// [dictionary]
///
/// In between the above parts, there can be padding bytes to align the parts correctly.
struct HashMapShared<'a, K, V> {
inner: CoreHashMap<'a, K, V>
}
impl<'a, K, V> HashMapInit<'a, K, V, rustc_hash::FxBuildHasher>
where
K: Clone + Hash + Eq
{
pub fn with_fixed(
num_buckets: u32,
area: &'a mut [MaybeUninit<u8>],
) -> HashMapInit<'a, K, V> {
Self {
num_buckets,
shmem_handle: None,
shared_ptr: area.as_mut_ptr().cast(),
shared_size: area.len(),
hasher: rustc_hash::FxBuildHasher::default(),
}
}
/// Initialize a new hash map in the given shared memory area
pub fn with_shmem(num_buckets: u32, shmem: ShmemHandle) -> HashMapInit<'a, K, V> {
let size = Self::estimate_size(num_buckets);
shmem
.set_size(size)
.expect("could not resize shared memory area");
Self {
num_buckets,
shared_ptr: shmem.data_ptr.as_ptr().cast(),
shmem_handle: Some(shmem),
shared_size: size,
hasher: rustc_hash::FxBuildHasher::default()
}
}
pub fn new_resizeable_named(num_buckets: u32, max_buckets: u32, name: &str) -> HashMapInit<'a, K, V> {
let size = Self::estimate_size(num_buckets);
let max_size = Self::estimate_size(max_buckets);
let shmem = ShmemHandle::new(name, size, max_size)
.expect("failed to make shared memory area");
Self {
num_buckets,
shared_ptr: shmem.data_ptr.as_ptr().cast(),
shmem_handle: Some(shmem),
shared_size: size,
hasher: rustc_hash::FxBuildHasher::default()
}
}
pub fn new_resizeable(num_buckets: u32, max_buckets: u32) -> HashMapInit<'a, K, V> {
use std::sync::atomic::{AtomicUsize, Ordering};
// NB: must be `static`, not `const`: a `const` item would give every use its
// own fresh counter, so all anonymous maps would end up with the same name.
static COUNTER: AtomicUsize = AtomicUsize::new(0);
let val = COUNTER.fetch_add(1, Ordering::Relaxed);
let name = format!("neon_shmem_hmap{}", val);
Self::new_resizeable_named(num_buckets, max_buckets, &name)
}
}
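// A minimal usage sketch of the builder + accessor API above (hypothetical
// snippet; see tests.rs in this crate for real usage):
//
//     let mut map = HashMapInit::<u64, usize>::new_resizeable(1024, 4096)
//         .attach_writer();
//     let hash = map.get_hash_value(&42);
//     match map.entry_with_hash(42, hash) {
//         Entry::Occupied(mut e) => { e.insert(1); }
//         Entry::Vacant(e) => { e.insert(1).expect("map full"); }
//     }
//     assert_eq!(map.get_with_hash(&42, hash), Some(&1));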
impl<'a, K, V, S: BuildHasher> HashMapAccess<'a, K, V, S>
where
K: Clone + Hash + Eq,
{
pub fn get_hash_value(&self, key: &K) -> u64 {
self.hasher.hash_one(key)
}
pub fn get_with_hash<'e>(&'e self, key: &K, hash: u64) -> Option<&'e V> {
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
map.inner.get_with_hash(key, hash)
}
pub fn entry_with_hash(&mut self, key: K, hash: u64) -> Entry<'a, '_, K, V> {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
map.inner.entry_with_hash(key, hash)
}
pub fn remove_with_hash(&mut self, key: &K, hash: u64) {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
match map.inner.entry_with_hash(key.clone(), hash) {
Entry::Occupied(e) => {
e.remove();
}
Entry::Vacant(_) => {}
};
}
pub fn entry_at_bucket(&mut self, pos: usize) -> Option<OccupiedEntry<'a, '_, K, V>> {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
map.inner.entry_at_bucket(pos)
}
pub fn get_num_buckets(&self) -> usize {
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
map.inner.get_num_buckets()
}
/// Return the key and value stored in bucket with given index. This can be used to
/// iterate through the hash map. (An Iterator might be nicer. The communicator's
/// clock algorithm needs to _slowly_ iterate through all buckets with its clock hand,
/// without holding a lock. If we switch to an Iterator, it must not hold the lock.)
pub fn get_at_bucket(&self, pos: usize) -> Option<(&K, &V)> {
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
if pos >= map.inner.keys.len() {
return None;
}
let key = &map.inner.keys[pos];
key.inner.as_ref().map(|k| (k, map.inner.vals[pos].as_ref().unwrap()))
}
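// Sketch of such a slow clock-style sweep (hypothetical caller code; the real
// clock algorithm lives in the communicator, and locking is the caller's job):
//
//     for pos in 0..map.get_num_buckets() {
//         if let Some((_key, _value)) = map.get_at_bucket(pos) {
//             // inspect / age the entry here
//         }
//     }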
pub fn get_bucket_for_value(&self, val_ptr: *const V) -> usize {
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
let origin = map.inner.vals.as_ptr();
// `vals` is a slice of Option<V>, so index with that stride, not size_of::<V>()
let idx = (val_ptr as usize - origin as usize) / size_of::<Option<V>>();
assert!(idx < map.inner.vals.len());
idx
}
// for metrics
pub fn get_num_buckets_in_use(&self) -> usize {
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
map.inner.buckets_in_use as usize
}
pub fn clear(&mut self) {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
let inner = &mut map.inner;
inner.clear()
}
/// Helper function that abstracts the common logic between growing and shrinking.
/// The only significant difference in the rehashing step is how many buckets to rehash.
fn rehash_dict(
&mut self,
inner: &mut CoreHashMap<'a, K, V>,
keys_ptr: *mut core::LinkedKey<K>,
end_ptr: *mut u8,
num_buckets: u32,
rehash_buckets: u32,
) {
inner.free_head = INVALID_POS;
// Recalculate the dictionary
let keys;
let dictionary;
unsafe {
let keys_end_ptr = keys_ptr.add(num_buckets as usize);
let buckets_end_ptr: *mut u8 = (keys_end_ptr as *mut u8)
.add(size_of::<Option<V>>() * num_buckets as usize);
let dictionary_ptr: *mut u32 = buckets_end_ptr
.byte_add(buckets_end_ptr.align_offset(align_of::<u32>()))
.cast();
let dictionary_size: usize =
end_ptr.byte_offset_from(buckets_end_ptr) as usize / size_of::<u32>();
keys = std::slice::from_raw_parts_mut(keys_ptr, num_buckets as usize);
dictionary = std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size);
}
for i in 0..dictionary.len() {
dictionary[i] = INVALID_POS;
}
for i in 0..rehash_buckets as usize {
if keys[i].inner.is_none() {
keys[i].next = inner.free_head;
inner.free_head = i as u32;
continue;
}
let hash = self.hasher.hash_one(&keys[i].inner.as_ref().unwrap());
let pos: usize = (hash % dictionary.len() as u64) as usize;
keys[i].next = dictionary[pos];
dictionary[pos] = i as u32;
}
// Finally, update the CoreHashMap struct
inner.dictionary = dictionary;
inner.keys = keys;
}
/// Rehash the map. Intended for benchmarking only.
pub fn shuffle(&mut self) {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
let inner = &mut map.inner;
let num_buckets = inner.get_num_buckets() as u32;
let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
let end_ptr: *mut u8 = unsafe { (self.shared_ptr as *mut u8).add(size_bytes) };
let keys_ptr = inner.keys.as_mut_ptr();
self.rehash_dict(inner, keys_ptr, end_ptr, num_buckets, num_buckets);
}
// /// Grow
// ///
// /// 1. grow the underlying shared memory area
// /// 2. Initialize new buckets. This overwrites the current dictionary
// /// 3. Recalculate the dictionary
// pub fn grow(&mut self, num_buckets: u32) -> Result<(), crate::shmem::Error> {
// let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
// let inner = &mut map.inner;
// let old_num_buckets = inner.buckets.len() as u32;
// if num_buckets < old_num_buckets {
// panic!("grow called with a smaller number of buckets");
// }
// if num_buckets == old_num_buckets {
// return Ok(());
// }
// let shmem_handle = self
// .shmem_handle
// .as_ref()
// .expect("grow called on a fixed-size hash table");
// let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
// shmem_handle.set_size(size_bytes)?;
// let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
// // Initialize new buckets. The new buckets are linked to the free list. NB: This overwrites
// // the dictionary!
// let keys_ptr = inner.keys.as_mut_ptr();
// unsafe {
// for i in old_num_buckets..num_buckets {
// let bucket_ptr = buckets_ptr.add(i as usize);
// bucket_ptr.write(core::Bucket {
// next: if i < num_buckets-1 {
// i as u32 + 1
// } else {
// inner.free_head
// },
// prev: if i > 0 {
// PrevPos::Chained(i as u32 - 1)
// } else {
// PrevPos::First(INVALID_POS)
// },
// inner: None,
// });
// }
// }
// self.rehash_dict(inner, keys_ptr, end_ptr, num_buckets, old_num_buckets);
// inner.free_head = old_num_buckets;
// Ok(())
// }
// /// Begin a shrink, limiting all new allocations to be in buckets with index less than `num_buckets`.
// pub fn begin_shrink(&mut self, num_buckets: u32) {
// let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
// if num_buckets > map.inner.get_num_buckets() as u32 {
// panic!("shrink called with a larger number of buckets");
// }
// _ = self
// .shmem_handle
// .as_ref()
// .expect("shrink called on a fixed-size hash table");
// map.inner.alloc_limit = num_buckets;
// }
// /// Complete a shrink after caller has evicted entries, removing the unused buckets and rehashing.
// pub fn finish_shrink(&mut self) -> Result<(), crate::shmem::Error> {
// let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
// let inner = &mut map.inner;
// if !inner.is_shrinking() {
// panic!("called finish_shrink when no shrink is in progress");
// }
// let num_buckets = inner.alloc_limit;
// if inner.get_num_buckets() == num_buckets as usize {
// return Ok(());
// }
// for i in (num_buckets as usize)..inner.buckets.len() {
// if inner.buckets[i].inner.is_some() {
// // TODO(quantumish) Do we want to treat this as a violation of an invariant
// // or a legitimate error the caller can run into? Originally I thought this
// // could return something like a UnevictedError(index) as soon as it runs
// // into something (that way a caller could clear their soon-to-be-shrinked
// // buckets by repeatedly trying to call `finish_shrink`).
// //
// // Would require making a wider error type enum with this and shmem errors.
// panic!("unevicted entries in shrinked space")
// }
// match inner.buckets[i].prev {
// PrevPos::First(_) => {
// let next_pos = inner.buckets[i].next;
// inner.free_head = next_pos;
// if next_pos != INVALID_POS {
// inner.buckets[next_pos as usize].prev = PrevPos::First(INVALID_POS);
// }
// },
// PrevPos::Chained(j) => {
// let next_pos = inner.buckets[i].next;
// inner.buckets[j as usize].next = next_pos;
// if next_pos != INVALID_POS {
// inner.buckets[next_pos as usize].prev = PrevPos::Chained(j);
// }
// }
// }
// }
// let shmem_handle = self
// .shmem_handle
// .as_ref()
// .expect("shrink called on a fixed-size hash table");
// let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
// shmem_handle.set_size(size_bytes)?;
// let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
// let buckets_ptr = inner.buckets.as_mut_ptr();
// self.rehash_dict(inner, buckets_ptr, end_ptr, num_buckets, num_buckets);
// inner.alloc_limit = INVALID_POS;
// Ok(())
// }
}

View File

@@ -0,0 +1,247 @@
//! Simple hash table with chaining
//!
//! # Resizing
//!
use std::hash::Hash;
use std::mem::MaybeUninit;
use crate::hash::entry::{Entry, OccupiedEntry, PrevPos, VacantEntry};
pub(crate) const INVALID_POS: u32 = u32::MAX;
pub(crate) struct LinkedKey<K> {
pub(crate) inner: Option<K>,
pub(crate) next: u32,
}
pub(crate) struct CoreHashMap<'a, K, V> {
/// Dictionary used to map hashes to bucket indices.
pub(crate) dictionary: &'a mut [u32],
pub(crate) keys: &'a mut [LinkedKey<K>],
pub(crate) vals: &'a mut [Option<V>],
/// Head of the freelist.
pub(crate) free_head: u32,
pub(crate) _user_list_head: u32,
/// Maximum index of a bucket allowed to be allocated. INVALID_POS if no limit.
pub(crate) alloc_limit: u32,
// metrics
pub(crate) buckets_in_use: u32,
}
#[derive(Debug)]
pub struct FullError();
impl<'a, K: Hash + Eq, V> CoreHashMap<'a, K, V>
where
K: Clone + Hash + Eq,
{
const FILL_FACTOR: f32 = 0.60;
pub fn estimate_size(num_buckets: u32) -> usize {
let mut size = 0;
// buckets
size += (size_of::<LinkedKey<K>>() + size_of::<Option<V>>())
* num_buckets as usize;
// dictionary
size += (f32::ceil((size_of::<u32>() * num_buckets as usize) as f32 / Self::FILL_FACTOR))
as usize;
size
}
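// Worked example (illustrative): with num_buckets = 1000, the dictionary gets
// ceil(4 * 1000 / 0.60) = 6667 bytes, i.e. ~1666 u32 slots -- more dictionary
// slots than buckets, which keeps the chains short even at full occupancy.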
pub fn new(
keys: &'a mut [MaybeUninit<LinkedKey<K>>],
vals: &'a mut [MaybeUninit<Option<V>>],
dictionary: &'a mut [MaybeUninit<u32>],
) -> CoreHashMap<'a, K, V> {
// Initialize the buckets
for i in 0..keys.len() {
keys[i].write(LinkedKey {
next: if i < keys.len() - 1 {
i as u32 + 1
} else {
INVALID_POS
},
inner: None,
});
}
for i in 0..vals.len() {
vals[i].write(None);
}
// Initialize the dictionary
for i in 0..dictionary.len() {
dictionary[i].write(INVALID_POS);
}
// TODO: use std::slice::assume_init_mut() once it stabilizes
let keys =
unsafe { std::slice::from_raw_parts_mut(keys.as_mut_ptr().cast(), keys.len()) };
let vals =
unsafe { std::slice::from_raw_parts_mut(vals.as_mut_ptr().cast(), vals.len()) };
let dictionary = unsafe {
std::slice::from_raw_parts_mut(dictionary.as_mut_ptr().cast(), dictionary.len())
};
CoreHashMap {
dictionary,
keys,
vals,
free_head: 0,
buckets_in_use: 0,
_user_list_head: INVALID_POS,
alloc_limit: INVALID_POS,
}
}
pub fn get_with_hash(&self, key: &K, hash: u64) -> Option<&V> {
let mut next = self.dictionary[hash as usize % self.dictionary.len()];
loop {
if next == INVALID_POS {
return None;
}
let keylink = &self.keys[next as usize];
let bucket_key = keylink.inner.as_ref().expect("entry is in use");
if bucket_key == key {
return Some(self.vals[next as usize].as_ref().unwrap());
}
next = keylink.next;
}
}
// all updates are done through Entry
pub fn entry_with_hash(&mut self, key: K, hash: u64) -> Entry<'a, '_, K, V> {
let dict_pos = hash as usize % self.dictionary.len();
let first = self.dictionary[dict_pos];
if first == INVALID_POS {
// no existing entry
return Entry::Vacant(VacantEntry {
map: self,
key,
dict_pos: dict_pos as u32,
});
}
let mut prev_pos = PrevPos::First(dict_pos as u32);
let mut next = first;
loop {
let keylink = &mut self.keys[next as usize];
let bucket_key = keylink.inner.as_mut().expect("entry is in use");
if *bucket_key == key {
// found existing entry
return Entry::Occupied(OccupiedEntry {
map: self,
_key: key,
prev_pos,
bucket_pos: next,
});
}
if keylink.next == INVALID_POS {
// No existing entry
return Entry::Vacant(VacantEntry {
map: self,
key,
dict_pos: dict_pos as u32,
});
}
prev_pos = PrevPos::Chained(next);
next = keylink.next;
}
}
pub fn get_num_buckets(&self) -> usize {
self.keys.len()
}
pub fn is_shrinking(&self) -> bool {
self.alloc_limit != INVALID_POS
}
/// Clears all entries from the hashmap.
/// Does not reset any allocation limits, but does clear any entries beyond them.
pub fn clear(&mut self) {
for i in 0..self.keys.len() {
self.keys[i] = LinkedKey {
next: if i < self.keys.len() - 1 {
i as u32 + 1
} else {
INVALID_POS
},
inner: None,
}
}
for i in 0..self.vals.len() {
self.vals[i] = None;
}
for i in 0..self.dictionary.len() {
self.dictionary[i] = INVALID_POS;
}
self.buckets_in_use = 0;
}
pub fn entry_at_bucket(&mut self, pos: usize) -> Option<OccupiedEntry<'a, '_, K, V>> {
if pos >= self.keys.len() {
return None;
}
let entry = self.keys[pos].inner.as_ref();
match entry {
Some(key) => Some(OccupiedEntry {
_key: key.clone(),
bucket_pos: pos as u32,
prev_pos: PrevPos::Unknown,
map: self,
}),
_ => None,
}
}
/// Find the position of an unused bucket via the freelist and initialize it.
pub(crate) fn alloc_bucket(&mut self, key: K, value: V) -> Result<u32, FullError> {
let mut pos = self.free_head;
// Find the first bucket we're *allowed* to use.
let mut prev = PrevPos::First(self.free_head);
while pos != INVALID_POS && pos >= self.alloc_limit {
let keylink = &mut self.keys[pos as usize];
prev = PrevPos::Chained(pos);
pos = keylink.next;
}
if pos == INVALID_POS {
return Err(FullError());
}
// Repair the freelist.
match prev {
PrevPos::First(_) => {
let next_pos = self.keys[pos as usize].next;
self.free_head = next_pos;
}
PrevPos::Chained(p) => if p != INVALID_POS {
let next_pos = self.keys[pos as usize].next;
self.keys[p as usize].next = next_pos;
},
PrevPos::Unknown => unreachable!()
}
// Initialize the bucket.
let keylink = &mut self.keys[pos as usize];
self.buckets_in_use += 1;
keylink.next = INVALID_POS;
keylink.inner = Some(key);
self.vals[pos as usize] = Some(value);
return Ok(pos);
}
}

View File

@@ -0,0 +1,107 @@
//! Like std::collections::hash_map::Entry;
use crate::hash::core::{CoreHashMap, FullError, INVALID_POS};
use std::hash::Hash;
use std::mem;
pub enum Entry<'a, 'b, K, V> {
Occupied(OccupiedEntry<'a, 'b, K, V>),
Vacant(VacantEntry<'a, 'b, K, V>),
}
/// Helper enum representing the previous position within a hashmap chain.
#[derive(Clone, Copy)]
pub(crate) enum PrevPos {
/// Starting index within the dictionary.
First(u32),
/// Regular index within the buckets.
Chained(u32),
/// Unknown - e.g. the associated entry was retrieved by index instead of chain.
Unknown,
}
impl PrevPos {
/// Unwrap an index from a `PrevPos::First`, panicking otherwise.
pub fn unwrap_first(&self) -> u32 {
match self {
Self::First(i) => *i,
_ => panic!("not first entry in chain")
}
}
}
pub struct OccupiedEntry<'a, 'b, K, V> {
pub(crate) map: &'b mut CoreHashMap<'a, K, V>,
/// The key of the occupied entry
pub(crate) _key: K,
/// The index of the previous entry in the chain.
pub(crate) prev_pos: PrevPos,
/// The position of the bucket in the CoreHashMap's buckets array.
pub(crate) bucket_pos: u32,
}
impl<'a, 'b, K, V> OccupiedEntry<'a, 'b, K, V> {
pub fn get(&self) -> &V {
self.map.vals[self.bucket_pos as usize]
.as_ref()
.unwrap()
}
pub fn get_mut(&mut self) -> &mut V {
self.map.vals[self.bucket_pos as usize]
.as_mut()
.unwrap()
}
pub fn insert(&mut self, value: V) -> V {
let bucket = &mut self.map.vals[self.bucket_pos as usize];
// This assumes inner is Some, which it must be for an OccupiedEntry
let old_value = mem::replace(bucket.as_mut().unwrap(), value);
old_value
}
pub fn remove(self) -> V {
let keylink = &mut self.map.keys[self.bucket_pos as usize];
// unlink it from the chain
match self.prev_pos {
PrevPos::First(dict_pos) => self.map.dictionary[dict_pos as usize] = keylink.next,
PrevPos::Chained(bucket_pos) => {
self.map.keys[bucket_pos as usize].next = keylink.next
},
PrevPos::Unknown => panic!("can't safely remove entry with unknown previous entry"),
}
// and add it to the freelist
let keylink = &mut self.map.keys[self.bucket_pos as usize];
keylink.inner = None;
keylink.next = self.map.free_head;
let old_value = self.map.vals[self.bucket_pos as usize].take();
self.map.free_head = self.bucket_pos;
self.map.buckets_in_use -= 1;
return old_value.unwrap();
}
}
pub struct VacantEntry<'a, 'b, K, V> {
pub(crate) map: &'b mut CoreHashMap<'a, K, V>,
pub(crate) key: K, // The key to insert
pub(crate) dict_pos: u32,
}
impl<'a, 'b, K: Clone + Hash + Eq, V> VacantEntry<'a, 'b, K, V> {
pub fn insert(self, value: V) -> Result<&'b mut V, FullError> {
let pos = self.map.alloc_bucket(self.key, value)?;
if pos == INVALID_POS {
return Err(FullError());
}
self.map.keys[pos as usize].next = self.map.dictionary[self.dict_pos as usize];
self.map.dictionary[self.dict_pos as usize] = pos;
let result = self.map.vals[pos as usize].as_mut().unwrap();
return Ok(result);
}
}

View File

@@ -0,0 +1,85 @@
//! Adapted from https://github.com/jsnell/parallel-xxhash (TODO: license?)
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
const PRIME32_1: u32 = 2654435761;
const PRIME32_2: u32 = 2246822519;
const PRIME32_3: u32 = 3266489917;
const PRIME32_4: u32 = 668265263;
const PRIME32_5: u32 = 374761393;
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
fn mm256_rol32<const r: u32>(x: __m256i) -> __m256i {
return _mm256_or_si256(_mm256_slli_epi32(x, r),
_mm256_srli_epi32(x, 32 - r));
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
fn mm256_fmix32(mut h: __m256i) -> __m256i {
h = _mm256_xor_si256(h, _mm256_srli_epi32(h, 15));
h = _mm256_mullo_epi32(h, _mm256_set1_epi32(PRIME32_2));
h = _mm256_xor_si256(h, _mm256_srli_epi32(h, 13));
h = _mm256_mullo_epi32(h, _mm256_set1_epi32(PRIME32_3));
h = _mm256_xor_si256(h, _mm256_srli_epi32(h, 16));
h
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
fn mm256_round(mut seed: __m256i, input: __m256i) -> __m256i {
seed = _mm256_add_epi32(
seed,
_mm256_mullo_epi32(input, _mm256_set1_epi32(PRIME32_2))
);
seed = mm256_rol32::<13>(seed);
seed = _mm256_mullo_epi32(seed, _mm256_set1_epi32(PRIME32_1));
seed
}
/// Computes xxHash32 for 8 keys of size 4*N bytes each, stored in
/// column-major order: word i of key j lives at keys[i * 8 + j].
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn xxhash_many<const N: usize>(keys: *const u32, seed: u32) -> [u32; 8] {
let mut res = [0u32; 8];
let mut h = _mm256_set1_epi32(seed.wrapping_add(PRIME32_5) as i32);
if N >= 4 {
let mut v1 = _mm256_set1_epi32(seed.wrapping_add(PRIME32_1).wrapping_add(PRIME32_2) as i32);
let mut v2 = _mm256_set1_epi32(seed.wrapping_add(PRIME32_2) as i32);
let mut v3 = _mm256_set1_epi32(seed as i32);
let mut v4 = _mm256_set1_epi32(seed.wrapping_sub(PRIME32_1) as i32);
let mut i = 0;
while i < (N & !3) {
let k1 = _mm256_loadu_si256(keys.add((i + 0) * 8).cast());
let k2 = _mm256_loadu_si256(keys.add((i + 1) * 8).cast());
let k3 = _mm256_loadu_si256(keys.add((i + 2) * 8).cast());
let k4 = _mm256_loadu_si256(keys.add((i + 3) * 8).cast());
v1 = mm256_round(v1, k1);
v2 = mm256_round(v2, k2);
v3 = mm256_round(v3, k3);
v4 = mm256_round(v4, k4);
i += 4;
}
h = _mm256_add_epi32(
_mm256_add_epi32(mm256_rol32::<1>(v1), mm256_rol32::<7>(v2)),
_mm256_add_epi32(mm256_rol32::<12>(v3), mm256_rol32::<18>(v4)),
);
}
// Unneeded for fixed-size keys, but keeps bitwise parity with xxhash.
h = _mm256_add_epi32(h, _mm256_set1_epi32((N * 4) as i32));
// Handle the remaining N % 4 words of each key.
for i in (N & !3)..N {
let v = _mm256_loadu_si256(keys.add(i * 8).cast());
h = _mm256_add_epi32(
h,
_mm256_mullo_epi32(v, _mm256_set1_epi32(PRIME32_3 as i32))
);
h = _mm256_mullo_epi32(
mm256_rol32::<17>(h),
_mm256_set1_epi32(PRIME32_4 as i32)
);
}
_mm256_storeu_si256((&mut res as *mut [u32; 8]).cast(), mm256_fmix32(h));
res
}
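// Hypothetical invocation sketch (illustrative, not part of this file): hash
// 8 keys of 16 bytes (N = 4 words) each, with word i of key j at cols[i * 8 + j].
// `key_word` stands in for however the caller produces its key material.
//
//     let mut cols = [0u32; 4 * 8];
//     for lane in 0..8 {
//         for word in 0..4 {
//             cols[word * 8 + lane] = key_word(lane, word);
//         }
//     }
//     let hashes = unsafe { xxhash_many::<4>(cols.as_ptr(), 0) };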

View File

@@ -0,0 +1,382 @@
use std::collections::BTreeMap;
use std::collections::HashSet;
use std::fmt::{Debug, Formatter};
use std::mem::MaybeUninit;
use std::sync::atomic::{AtomicUsize, Ordering};
use crate::hash::HashMapAccess;
use crate::hash::HashMapInit;
use crate::hash::Entry;
use crate::shmem::ShmemHandle;
use rand::seq::SliceRandom;
use rand::{Rng, RngCore};
use rand_distr::Zipf;
const TEST_KEY_LEN: usize = 16;
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
struct TestKey([u8; TEST_KEY_LEN]);
impl From<&TestKey> for u128 {
fn from(val: &TestKey) -> u128 {
u128::from_be_bytes(val.0)
}
}
impl From<u128> for TestKey {
fn from(val: u128) -> TestKey {
TestKey(val.to_be_bytes())
}
}
impl<'a> From<&'a [u8]> for TestKey {
fn from(bytes: &'a [u8]) -> TestKey {
TestKey(bytes.try_into().unwrap())
}
}
fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
let mut w = HashMapInit::<TestKey, usize>::new_resizeable_named(
100000, 120000, "test_inserts"
).attach_writer();
for (idx, k) in keys.iter().enumerate() {
let hash = w.get_hash_value(&(*k).into());
let res = w.entry_with_hash((*k).into(), hash);
match res {
Entry::Occupied(mut e) => { e.insert(idx); }
Entry::Vacant(e) => {
let res = e.insert(idx);
assert!(res.is_ok());
},
};
}
for (idx, k) in keys.iter().enumerate() {
let hash = w.get_hash_value(&(*k).into());
let x = w.get_with_hash(&(*k).into(), hash);
let value = x.as_deref().copied();
assert_eq!(value, Some(idx));
}
}
#[test]
fn dense() {
// This exercises splitting a node with prefix
let keys: &[u128] = &[0, 1, 2, 3, 256];
test_inserts(keys);
// Dense keys
let mut keys: Vec<u128> = (0..10000).collect();
test_inserts(&keys);
// Do the same in random orders
for _ in 1..10 {
keys.shuffle(&mut rand::rng());
test_inserts(&keys);
}
}
#[test]
fn sparse() {
// sparse keys
let mut keys: Vec<TestKey> = Vec::new();
let mut used_keys = HashSet::new();
for _ in 0..10000 {
loop {
let key = rand::random::<u128>();
if used_keys.get(&key).is_some() {
continue;
}
used_keys.insert(key);
keys.push(key.into());
break;
}
}
test_inserts(&keys);
}
struct TestValue(AtomicUsize);
impl TestValue {
fn new(val: usize) -> TestValue {
TestValue(AtomicUsize::new(val))
}
fn load(&self) -> usize {
self.0.load(Ordering::Relaxed)
}
}
impl Clone for TestValue {
fn clone(&self) -> TestValue {
TestValue::new(self.load())
}
}
impl Debug for TestValue {
fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
write!(fmt, "{:?}", self.load())
}
}
#[derive(Clone, Debug)]
struct TestOp(TestKey, Option<usize>);
fn apply_op(
op: &TestOp,
map: &mut HashMapAccess<TestKey, usize>,
shadow: &mut BTreeMap<TestKey, usize>,
) {
// apply the change to the shadow tree first
let shadow_existing = if let Some(v) = op.1 {
shadow.insert(op.0, v)
} else {
shadow.remove(&op.0)
};
let hash = map.get_hash_value(&op.0);
let entry = map.entry_with_hash(op.0, hash);
let hash_existing = match op.1 {
Some(new) => {
match entry {
Entry::Occupied(mut e) => Some(e.insert(new)),
Entry::Vacant(e) => { e.insert(new).unwrap(); None },
}
},
None => {
match entry {
Entry::Occupied(e) => Some(e.remove()),
Entry::Vacant(_) => None,
}
},
};
assert_eq!(shadow_existing, hash_existing);
}
fn do_random_ops(
num_ops: usize,
size: u32,
insert_prob: f64,
writer: &mut HashMapAccess<TestKey, usize>,
shadow: &mut BTreeMap<TestKey, usize>,
rng: &mut rand::rngs::ThreadRng,
) {
for i in 0..num_ops {
let key: TestKey = ((rng.next_u32() % size) as u128).into();
let op = TestOp(key, if rng.random_bool(insert_prob) { Some(i) } else { None });
apply_op(&op, writer, shadow);
}
}
fn do_deletes(
num_ops: usize,
writer: &mut HashMapAccess<TestKey, usize>,
shadow: &mut BTreeMap<TestKey, usize>,
) {
for _ in 0..num_ops {
let (k, _) = shadow.pop_first().unwrap();
let hash = writer.get_hash_value(&k);
writer.remove_with_hash(&k, hash);
}
}
fn do_shrink(
writer: &mut HashMapAccess<TestKey, usize>,
shadow: &mut BTreeMap<TestKey, usize>,
_from: u32,
to: u32
) {
writer.begin_shrink(to);
while writer.get_num_buckets_in_use() > to as usize {
let (k, _) = shadow.pop_first().unwrap();
let hash = writer.get_hash_value(&k);
let entry = writer.entry_with_hash(k, hash);
if let Entry::Occupied(mut e) = entry {
e.remove();
}
}
writer.finish_shrink().unwrap();
}
#[test]
fn random_ops() {
let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
100000, 120000, "test_random"
).attach_writer();
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
let mut rng = rand::rng();
for i in 0..100000 {
let key: TestKey = (rng.sample(distribution) as u128).into();
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
apply_op(&op, &mut writer, &mut shadow);
if i % 1000 == 0 {
eprintln!("{i} ops processed");
}
}
}
#[test]
fn test_shuffle() {
let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
1000, 1200, "test_shuf"
).attach_writer();
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
let mut rng = rand::rng();
do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
writer.shuffle();
do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
}
#[test]
fn test_grow() {
let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
1000, 2000, "test_grow"
).attach_writer();
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
let mut rng = rand::rng();
do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
writer.grow(1500).unwrap();
do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
}
#[test]
fn test_shrink() {
let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
1500, 2000, "test_shrink"
).attach_writer();
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
let mut rng = rand::rng();
do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
do_shrink(&mut writer, &mut shadow, 1500, 1000);
do_deletes(500, &mut writer, &mut shadow);
do_random_ops(10000, 500, 0.75, &mut writer, &mut shadow, &mut rng);
assert!(writer.get_num_buckets_in_use() <= 1000);
}
#[test]
fn test_shrink_grow_seq() {
let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
1000, 20000, "test_grow_seq"
).attach_writer();
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
let mut rng = rand::rng();
do_random_ops(500, 1000, 0.1, &mut writer, &mut shadow, &mut rng);
eprintln!("Shrinking to 750");
do_shrink(&mut writer, &mut shadow, 1000, 750);
do_random_ops(200, 1000, 0.5, &mut writer, &mut shadow, &mut rng);
eprintln!("Growing to 1500");
writer.grow(1500).unwrap();
do_random_ops(600, 1500, 0.1, &mut writer, &mut shadow, &mut rng);
eprintln!("Shrinking to 200");
do_shrink(&mut writer, &mut shadow, 1500, 200);
do_deletes(100, &mut writer, &mut shadow);
do_random_ops(50, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
eprintln!("Growing to 10k");
writer.grow(10000).unwrap();
do_random_ops(10000, 5000, 0.25, &mut writer, &mut shadow, &mut rng);
}
#[test]
fn test_bucket_ops() {
let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
1000, 1200, "test_bucket_ops"
).attach_writer();
let hash = writer.get_hash_value(&1.into());
match writer.entry_with_hash(1.into(), hash) {
Entry::Occupied(mut e) => { e.insert(2); },
Entry::Vacant(e) => { e.insert(2).unwrap(); },
}
assert_eq!(writer.get_num_buckets_in_use(), 1);
assert_eq!(writer.get_num_buckets(), 1000);
assert_eq!(writer.get_with_hash(&1.into(), hash), Some(&2));
match writer.entry_with_hash(1.into(), hash) {
Entry::Occupied(e) => {
assert_eq!(e._key, 1.into());
let pos = e.bucket_pos as usize;
assert_eq!(writer.entry_at_bucket(pos).unwrap()._key, 1.into());
assert_eq!(writer.get_at_bucket(pos), Some(&(1.into(), 2)));
},
Entry::Vacant(_) => { panic!("Insert didn't affect entry"); },
}
writer.remove_with_hash(&1.into(), hash);
assert_eq!(writer.get_with_hash(&1.into(), hash), None);
}
#[test]
fn test_shrink_zero() {
let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
1500, 2000, "test_shrink_zero"
).attach_writer();
writer.begin_shrink(0);
for i in 0..1500 {
if let Some(e) = writer.entry_at_bucket(i) { e.remove(); }
}
writer.finish_shrink().unwrap();
assert_eq!(writer.get_num_buckets_in_use(), 0);
let hash = writer.get_hash_value(&1.into());
let entry = writer.entry_with_hash(1.into(), hash);
if let Entry::Vacant(v) = entry {
assert!(v.insert(2).is_err());
} else {
panic!("Somehow got non-vacant entry in empty map.")
}
writer.grow(50).unwrap();
let entry = writer.entry_with_hash(1.into(), hash);
if let Entry::Vacant(v) = entry {
assert!(v.insert(2).is_ok());
} else {
panic!("Somehow got non-vacant entry in empty map.")
}
assert_eq!(writer.get_num_buckets_in_use(), 1);
}
#[test]
#[should_panic]
fn test_grow_oom() {
let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
1500, 2000, "test_grow_oom"
).attach_writer();
writer.grow(20000).unwrap();
}
#[test]
#[should_panic]
fn test_shrink_bigger() {
let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
1500, 2500, "test_shrink_bigger"
).attach_writer();
writer.begin_shrink(2000);
}
#[test]
#[should_panic]
fn test_shrink_early_finish() {
let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
1500, 2500, "test_shrink_early_finish"
).attach_writer();
writer.finish_shrink().unwrap();
}
#[test]
#[should_panic]
fn test_shrink_fixed_size() {
let mut area = [MaybeUninit::uninit(); 10000];
let init_struct = HashMapInit::<TestKey, usize>::with_fixed(3, &mut area);
let mut writer = init_struct.attach_writer();
writer.begin_shrink(1);
}
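Taken together, these tests pin down the resize protocol: grow() is a single call, while shrinking is two-phase: begin_shrink(), manual eviction, then finish_shrink(). Below is a condensed sketch of that lifecycle, assuming the builder and Entry API exercised above; the exact eviction contract of begin_shrink is inferred from test_shrink_zero, not documented:
fn resize_lifecycle_sketch() {
    // Start with 1000 buckets; reserve room to grow up to 2000.
    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
        1000, 2000, "resize_lifecycle_sketch"
    ).attach_writer();
    // Insert through the hash-aware Entry API.
    let key: TestKey = 1.into();
    let hash = writer.get_hash_value(&key);
    if let Entry::Vacant(v) = writer.entry_with_hash(key, hash) {
        v.insert(42).expect("map is not full");
    }
    // Growing takes effect immediately.
    writer.grow(1500).unwrap();
    // Shrinking is two-phase: after begin_shrink(), evict buckets that fall
    // outside the new size (assumed here to be those at or above it), then
    // finish_shrink() remaps the surviving entries into the shrunk space.
    writer.begin_shrink(500);
    for i in 500..1500 {
        if let Some(entry) = writer.entry_at_bucket(i) {
            entry.remove();
        }
    }
    writer.finish_shrink().unwrap();
    assert!(writer.get_num_buckets_in_use() <= 500);
}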

View File

@@ -1,418 +1,4 @@
//! Shared memory utilities for neon communicator
use std::num::NonZeroUsize;
use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
use std::ptr::NonNull;
use std::sync::atomic::{AtomicUsize, Ordering};
use nix::errno::Errno;
use nix::sys::mman::MapFlags;
use nix::sys::mman::ProtFlags;
use nix::sys::mman::mmap as nix_mmap;
use nix::sys::mman::munmap as nix_munmap;
use nix::unistd::ftruncate as nix_ftruncate;
/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
/// specified at creation.
///
/// The area is backed by an anonymous file created with memfd_create(). The full address space for
/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
/// future.
pub struct ShmemHandle {
/// memfd file descriptor
fd: OwnedFd,
max_size: usize,
// Pointer to the beginning of the shared memory area. The header is stored there.
shared_ptr: NonNull<SharedStruct>,
// Pointer to the beginning of the user data
pub data_ptr: NonNull<u8>,
}
/// This is stored at the beginning in the shared memory area.
struct SharedStruct {
max_size: usize,
/// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
current_size: AtomicUsize,
}
const RESIZE_IN_PROGRESS: usize = 1 << 63;
const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
/// Error type returned by the ShmemHandle functions.
#[derive(thiserror::Error, Debug)]
#[error("{msg}: {errno}")]
pub struct Error {
pub msg: String,
pub errno: Errno,
}
impl Error {
fn new(msg: &str, errno: Errno) -> Error {
Error {
msg: msg.to_string(),
errno,
}
}
}
impl ShmemHandle {
/// Create a new shared memory area. To communicate between processes, the processes need to be
/// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
///
/// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
/// processes can continue using it, however.
pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
// create the backing anonymous file.
let fd = create_backing_file(name)?;
Self::new_with_fd(fd, initial_size, max_size)
}
fn new_with_fd(
fd: OwnedFd,
initial_size: usize,
max_size: usize,
) -> Result<ShmemHandle, Error> {
// We reserve the high-order bit of 'current_size' for the RESIZE_IN_PROGRESS flag, and
// the actual size is a little larger than the caller's because of the SharedStruct
// header. Cap the limit well below 2^63, because with anything close to that, you'd
// run out of memory anyway.
if max_size >= 1 << 48 {
panic!("max size {} too large", max_size);
}
if initial_size > max_size {
panic!("initial size {initial_size} larger than max size {max_size}");
}
// The actual initial / max size is the one given by the caller, plus the size of
// 'SharedStruct'.
let initial_size = HEADER_SIZE + initial_size;
let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
// Reserve address space for it with mmap
//
// TODO: Use MAP_HUGETLB if possible
let start_ptr = unsafe {
nix_mmap(
None,
max_size,
ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
MapFlags::MAP_SHARED,
&fd,
0,
)
}
.map_err(|e| Error::new("mmap failed: {e}", e))?;
// Reserve space for the initial size
enlarge_file(fd.as_fd(), initial_size as u64)?;
// Initialize the header
let shared: NonNull<SharedStruct> = start_ptr.cast();
unsafe {
shared.write(SharedStruct {
max_size: max_size.into(),
current_size: AtomicUsize::new(initial_size),
})
};
// The user data begins after the header
let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
Ok(ShmemHandle {
fd,
max_size: max_size.into(),
shared_ptr: shared,
data_ptr,
})
}
// return reference to the header
fn shared(&self) -> &SharedStruct {
unsafe { self.shared_ptr.as_ref() }
}
/// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
/// when creating the area.
///
/// This may only be called by one process/thread at a time. Concurrent calls are
/// detected and return an Error.
pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
let new_size = new_size + HEADER_SIZE;
let shared = self.shared();
if new_size > self.max_size {
panic!(
"new size ({} is greater than max size ({})",
new_size, self.max_size
);
}
assert_eq!(self.max_size, shared.max_size);
// Lock the area by setting the bit in 'current_size'
//
// Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
// and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
// since this is not performance-critical, better safe than sorry.
let mut old_size = shared.current_size.load(Ordering::Acquire);
loop {
if (old_size & RESIZE_IN_PROGRESS) != 0 {
return Err(Error::new(
"concurrent resize detected",
Errno::UnknownErrno,
));
}
match shared.current_size.compare_exchange(
old_size,
new_size,
Ordering::Acquire,
Ordering::Relaxed,
) {
Ok(_) => break,
Err(x) => old_size = x,
}
}
// Ok, we got the lock.
//
// NB: If anything goes wrong, we *must* clear the bit!
let result = {
use std::cmp::Ordering::{Equal, Greater, Less};
match new_size.cmp(&old_size) {
Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
}),
Equal => Ok(()),
Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
}
};
// Unlock
shared.current_size.store(
if result.is_ok() { new_size } else { old_size },
Ordering::Release,
);
result
}
/// Returns the current user-visible size of the shared memory segment.
///
/// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
/// responsibility not to access the area beyond the current size.
pub fn current_size(&self) -> usize {
let total_current_size =
self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
total_current_size - HEADER_SIZE
}
}
impl Drop for ShmemHandle {
fn drop(&mut self) {
// SAFETY: The pointer was obtained from mmap() with the given size.
// We unmap the entire region.
let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
// The fd is dropped automatically by OwnedFd.
}
}
/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
/// development and testing, but in production we want the file to stay in memory.
///
/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
#[allow(unused_variables)]
fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
#[cfg(not(target_os = "macos"))]
{
nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
.map_err(|e| Error::new("memfd_create failed: {e}", e))
}
#[cfg(target_os = "macos")]
{
let file = tempfile::tempfile().map_err(|e| {
Error::new(
"could not create temporary file to back shmem area: {e}",
nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
)
})?;
Ok(OwnedFd::from(file))
}
}
fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
// Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
// we don't get a segfault later when trying to actually use it.
#[cfg(not(target_os = "macos"))]
{
nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
Error::new(
"could not grow shmem segment, posix_fallocate failed: {e}",
e,
)
})
}
// As a fallback on macOS, which doesn't have posix_fallocate, use plain ftruncate()
#[cfg(target_os = "macos")]
{
nix::unistd::ftruncate(fd, size as i64)
.map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
}
}
#[cfg(test)]
mod tests {
use super::*;
use nix::unistd::ForkResult;
use std::ops::Range;
/// Check that all bytes in the given range have the expected value.
fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
for i in range {
let b = unsafe { *(ptr.add(i)) };
assert_eq!(expected, b, "unexpected byte at offset {}", i);
}
}
/// Write 'b' to all bytes in the given range
fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
}
// simple single-process test of growing and shrinking
#[test]
fn test_shmem_resize() -> Result<(), Error> {
let max_size = 1024 * 1024;
let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
assert_eq!(init_struct.current_size(), 0);
// Initial grow
let size1 = 10000;
init_struct.set_size(size1).unwrap();
assert_eq!(init_struct.current_size(), size1);
// Write some data
let data_ptr = init_struct.data_ptr.as_ptr();
write_range(data_ptr, 0xAA, 0..size1);
assert_range(data_ptr, 0xAA, 0..size1);
// Shrink
let size2 = 5000;
init_struct.set_size(size2).unwrap();
assert_eq!(init_struct.current_size(), size2);
// Grow again
let size3 = 20000;
init_struct.set_size(size3).unwrap();
assert_eq!(init_struct.current_size(), size3);
// Try to read it. The area that was shrunk and grown again should read as all zeros now
assert_range(data_ptr, 0xAA, 0..5000);
assert_range(data_ptr, 0, 5000..size1);
// Try to grow beyond max_size
//let size4 = max_size + 1;
//assert!(init_struct.set_size(size4).is_err());
// Dropping init_struct should unmap the memory
drop(init_struct);
Ok(())
}
/// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
/// but is stored in the shared memory area and works across processes. It's implemented by
/// polling, because e.g. standard Rust mutexes are not guaranteed to work across processes.
struct SimpleBarrier {
num_procs: usize,
count: AtomicUsize,
}
impl SimpleBarrier {
unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
unsafe {
*ptr = SimpleBarrier {
num_procs,
count: AtomicUsize::new(0),
}
}
}
pub fn wait(&self) {
let old = self.count.fetch_add(1, Ordering::Relaxed);
let generation = old / self.num_procs;
let mut current = old + 1;
while current < (generation + 1) * self.num_procs {
std::thread::sleep(std::time::Duration::from_millis(10));
current = self.count.load(Ordering::Relaxed);
}
}
}
#[test]
fn test_multi_process() {
// Initialize
let max_size = 1_000_000_000_000;
let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
let ptr = init_struct.data_ptr.as_ptr();
// Store the SimpleBarrier in the first 1k of the area.
init_struct.set_size(10000).unwrap();
let barrier_ptr: *mut SimpleBarrier = unsafe {
ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
.cast()
};
unsafe { SimpleBarrier::init(barrier_ptr, 2) };
let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
// Fork another test process. The code after this runs in both processes concurrently.
let fork_result = unsafe { nix::unistd::fork().unwrap() };
// In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
if fork_result.is_parent() {
write_range(ptr, 0xAA, 1000..2000);
} else {
write_range(ptr, 0xBB, 2000..3000);
}
barrier.wait();
// Verify the contents. (in both processes)
assert_range(ptr, 0xAA, 1000..2000);
assert_range(ptr, 0xBB, 2000..3000);
// Grow, from the child this time
let size = 10_000_000;
if !fork_result.is_parent() {
init_struct.set_size(size).unwrap();
}
barrier.wait();
// make some writes at the end
if fork_result.is_parent() {
write_range(ptr, 0xAA, (size - 10)..size);
} else {
write_range(ptr, 0xBB, (size - 20)..(size - 10));
}
barrier.wait();
// Verify the contents. (This runs in both processes)
assert_range(ptr, 0, (size - 1000)..(size - 20));
assert_range(ptr, 0xBB, (size - 20)..(size - 10));
assert_range(ptr, 0xAA, (size - 10)..size);
if let ForkResult::Parent { child } = fork_result {
nix::sys::wait::waitpid(child, None).unwrap();
}
}
}
pub mod hash;
pub mod shmem;

View File

@@ -0,0 +1,418 @@
//! Dynamically resizable contiguous chunk of shared memory
(The remainder of this file is identical to the module shown above; the move to the new path changed only the module doc comment, so the body is not repeated here.)
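For reference, a minimal single-process sketch of the handle's lifecycle, using only the API shown above; the name and sizes are arbitrary:
fn shmem_usage_sketch() -> Result<(), Error> {
    // Reserve 1 MiB of address space, but back only the first 4 KiB.
    let shmem = ShmemHandle::new("usage_sketch", 4096, 1024 * 1024)?;
    assert_eq!(shmem.current_size(), 4096);
    // User data starts at data_ptr; never touch bytes past current_size().
    unsafe { std::ptr::write_bytes(shmem.data_ptr.as_ptr(), 0xAB, 4096) };
    // Resizing only changes the backing file; the mapping never moves, so
    // pointers into the area stay valid across grow and shrink.
    shmem.set_size(64 * 1024)?;
    assert_eq!(shmem.current_size(), 64 * 1024);
    // Dropping the handle unmaps the area in this process only.
    drop(shmem);
    Ok(())
}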

View File

@@ -20,7 +20,6 @@ use postgres_backend::AuthType;
use remote_storage::RemoteStorageConfig;
use serde_with::serde_as;
use utils::logging::LogFormat;
use utils::postgres_client::PostgresClientProtocol;
use crate::models::{ImageCompressionAlgorithm, LsnLease};
@@ -189,7 +188,6 @@ pub struct ConfigToml {
pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
#[serde(skip_serializing_if = "Option::is_none")]
pub no_sync: Option<bool>,
pub wal_receiver_protocol: PostgresClientProtocol,
pub page_service_pipelining: PageServicePipeliningConfig,
pub get_vectored_concurrent_io: GetVectoredConcurrentIo,
pub enable_read_path_debugging: Option<bool>,
@@ -330,6 +328,8 @@ pub struct TimelineImportConfig {
pub import_job_concurrency: NonZeroUsize,
pub import_job_soft_size_limit: NonZeroUsize,
pub import_job_checkpoint_threshold: NonZeroUsize,
/// Max size of the remote storage partial read done by any job
pub import_job_max_byte_range_size: NonZeroUsize,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -525,8 +525,6 @@ pub struct TenantConfigToml {
/// (either this flag or the pageserver-global one need to be set)
pub timeline_offloading: bool,
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
/// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into
/// `index_part.json`, and it cannot be reversed.
pub rel_size_v2_enabled: bool,
@@ -607,12 +605,6 @@ pub mod defaults {
pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol =
utils::postgres_client::PostgresClientProtocol::Interpreted {
format: utils::postgres_client::InterpretedFormat::Protobuf,
compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }),
};
pub const DEFAULT_SSL_KEY_FILE: &str = "server.key";
pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt";
}
@@ -711,7 +703,6 @@ impl Default for ConfigToml {
virtual_file_io_mode: None,
tenant_config: TenantConfigToml::default(),
no_sync: None,
wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
page_service_pipelining: PageServicePipeliningConfig::Pipelined(
PageServicePipeliningConfigPipelined {
max_batch_size: NonZeroUsize::new(32).unwrap(),
@@ -735,6 +726,7 @@ impl Default for ConfigToml {
import_job_concurrency: NonZeroUsize::new(32).unwrap(),
import_job_soft_size_limit: NonZeroUsize::new(256 * 1024 * 1024).unwrap(),
import_job_checkpoint_threshold: NonZeroUsize::new(32).unwrap(),
import_job_max_byte_range_size: NonZeroUsize::new(4 * 1024 * 1024).unwrap(),
},
basebackup_cache_config: None,
posthog_config: None,
@@ -855,7 +847,6 @@ impl Default for TenantConfigToml {
lsn_lease_length: LsnLease::DEFAULT_LENGTH,
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
timeline_offloading: true,
wal_receiver_protocol_override: None,
rel_size_v2_enabled: false,
gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION,

View File

@@ -20,7 +20,6 @@ use serde_with::serde_as;
pub use utilization::PageserverUtilization;
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::postgres_client::PostgresClientProtocol;
use utils::{completion, serde_system_time};
use crate::config::Ratio;
@@ -622,8 +621,6 @@ pub struct TenantConfigPatch {
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub timeline_offloading: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub wal_receiver_protocol_override: FieldPatch<PostgresClientProtocol>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub rel_size_v2_enabled: FieldPatch<bool>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub gc_compaction_enabled: FieldPatch<bool>,
@@ -748,9 +745,6 @@ pub struct TenantConfig {
#[serde(skip_serializing_if = "Option::is_none")]
pub timeline_offloading: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
#[serde(skip_serializing_if = "Option::is_none")]
pub rel_size_v2_enabled: Option<bool>,
@@ -812,7 +806,6 @@ impl TenantConfig {
mut lsn_lease_length,
mut lsn_lease_length_for_ts,
mut timeline_offloading,
mut wal_receiver_protocol_override,
mut rel_size_v2_enabled,
mut gc_compaction_enabled,
mut gc_compaction_verification,
@@ -905,9 +898,6 @@ impl TenantConfig {
.map(|v| humantime::parse_duration(&v))?
.apply(&mut lsn_lease_length_for_ts);
patch.timeline_offloading.apply(&mut timeline_offloading);
patch
.wal_receiver_protocol_override
.apply(&mut wal_receiver_protocol_override);
patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled);
patch
.gc_compaction_enabled
@@ -960,7 +950,6 @@ impl TenantConfig {
lsn_lease_length,
lsn_lease_length_for_ts,
timeline_offloading,
wal_receiver_protocol_override,
rel_size_v2_enabled,
gc_compaction_enabled,
gc_compaction_verification,
@@ -1058,9 +1047,6 @@ impl TenantConfig {
timeline_offloading: self
.timeline_offloading
.unwrap_or(global_conf.timeline_offloading),
wal_receiver_protocol_override: self
.wal_receiver_protocol_override
.or(global_conf.wal_receiver_protocol_override),
rel_size_v2_enabled: self
.rel_size_v2_enabled
.unwrap_or(global_conf.rel_size_v2_enabled),

View File

@@ -6,7 +6,7 @@ use arc_swap::ArcSwap;
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, info_span};
use crate::{FeatureStore, PostHogClient, PostHogClientConfig};
use crate::{CaptureEvent, FeatureStore, PostHogClient, PostHogClientConfig};
/// A background loop that fetches feature flags from PostHog and updates the feature store.
pub struct FeatureResolverBackgroundLoop {
@@ -24,9 +24,16 @@ impl FeatureResolverBackgroundLoop {
}
}
pub fn spawn(self: Arc<Self>, handle: &tokio::runtime::Handle, refresh_period: Duration) {
pub fn spawn(
self: Arc<Self>,
handle: &tokio::runtime::Handle,
refresh_period: Duration,
fake_tenants: Vec<CaptureEvent>,
) {
let this = self.clone();
let cancel = self.cancel.clone();
// Main loop of updating the feature flags.
handle.spawn(
async move {
tracing::info!("Starting PostHog feature resolver");
@@ -56,6 +63,22 @@ impl FeatureResolverBackgroundLoop {
}
.instrument(info_span!("posthog_feature_resolver")),
);
// Report fake tenants to PostHog so that the combinations of all the properties are visible in the UI.
// Do one report per pageserver restart.
let this = self.clone();
handle.spawn(
async move {
tracing::info!("Starting PostHog feature reporter");
for tenant in &fake_tenants {
tracing::info!("Reporting fake tenant: {:?}", tenant);
}
if let Err(e) = this.posthog_client.capture_event_batch(&fake_tenants).await {
tracing::warn!("Cannot report fake tenants: {}", e);
}
}
.instrument(info_span!("posthog_feature_reporter")),
);
}
pub fn feature_store(&self) -> Arc<FeatureStore> {

View File

@@ -22,6 +22,16 @@ pub enum PostHogEvaluationError {
Internal(String),
}
impl PostHogEvaluationError {
pub fn as_variant_str(&self) -> &'static str {
match self {
PostHogEvaluationError::NotAvailable(_) => "not_available",
PostHogEvaluationError::NoConditionGroupMatched => "no_condition_group_matched",
PostHogEvaluationError::Internal(_) => "internal",
}
}
}
#[derive(Deserialize)]
pub struct LocalEvaluationResponse {
pub flags: Vec<LocalEvaluationFlag>,
@@ -54,7 +64,7 @@ pub struct LocalEvaluationFlagFilterProperty {
operator: String,
}
#[derive(Debug, Serialize, Deserialize)]
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(untagged)]
pub enum PostHogFlagFilterPropertyValue {
String(String),
@@ -497,6 +507,13 @@ pub struct PostHogClient {
client: reqwest::Client,
}
#[derive(Serialize, Debug)]
pub struct CaptureEvent {
pub event: String,
pub distinct_id: String,
pub properties: serde_json::Value,
}
impl PostHogClient {
pub fn new(config: PostHogClientConfig) -> Self {
let client = reqwest::Client::new();
@@ -560,12 +577,12 @@ impl PostHogClient {
&self,
event: &str,
distinct_id: &str,
properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
properties: &serde_json::Value,
) -> anyhow::Result<()> {
// PUBLIC_URL/capture/
// with bearer token of self.client_api_key
let url = format!("{}/capture/", self.config.public_api_url);
self.client
let response = self
.client
.post(url)
.body(serde_json::to_string(&json!({
"api_key": self.config.client_api_key,
@@ -575,6 +592,39 @@ impl PostHogClient {
}))?)
.send()
.await?;
let status = response.status();
let body = response.text().await?;
if !status.is_success() {
return Err(anyhow::anyhow!(
"Failed to capture events: {}, {}",
status,
body
));
}
Ok(())
}
pub async fn capture_event_batch(&self, events: &[CaptureEvent]) -> anyhow::Result<()> {
// PUBLIC_URL/batch/
let url = format!("{}/batch/", self.config.public_api_url);
let response = self
.client
.post(url)
.body(serde_json::to_string(&json!({
"api_key": self.config.client_api_key,
"batch": events,
}))?)
.send()
.await?;
let status = response.status();
let body = response.text().await?;
if !status.is_success() {
return Err(anyhow::anyhow!(
"Failed to capture events: {}, {}",
status,
body
));
}
Ok(())
}
}
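A hedged sketch of how a caller might use the new batch API, assuming a PostHogClientConfig populated elsewhere; the event name and properties mirror the fake-tenant report wired up in the pageserver below:
async fn capture_sketch(config: PostHogClientConfig) -> anyhow::Result<()> {
    let client = PostHogClient::new(config);
    let events = vec![CaptureEvent {
        event: "initial_tenant_report".to_string(),
        distinct_id: "fake_tenant_example_0".to_string(),
        // `$set` updates person properties rather than event properties.
        properties: serde_json::json!({ "$set": { "region": "aws-us-east-1" } }),
    }];
    // POSTs to {public_api_url}/batch/ and surfaces any non-2xx response as an error.
    client.capture_event_batch(&events).await
}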

View File

@@ -439,6 +439,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 },
shard_ps_feedback: [empty_feedback; 128],
num_shards: 0,
replica_promote: false,
min_ps_feedback: empty_feedback,
}
}

View File

@@ -9,6 +9,7 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
async-trait.workspace = true
bytes.workspace = true
camino.workspace = true
clap.workspace = true
futures.workspace = true

View File

@@ -1,4 +1,4 @@
use std::collections::{HashSet, VecDeque};
use std::collections::{HashMap, HashSet, VecDeque};
use std::future::Future;
use std::num::NonZeroUsize;
use std::pin::Pin;
@@ -8,12 +8,12 @@ use std::time::{Duration, Instant};
use anyhow::Context;
use async_trait::async_trait;
use bytes::Bytes;
use camino::Utf8PathBuf;
use pageserver_api::key::Key;
use pageserver_api::keyspace::KeySpaceAccum;
use pageserver_api::models::{
PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamRequest,
};
use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
use pageserver_api::reltag::RelTag;
use pageserver_api::shard::TenantShardId;
use pageserver_page_api::proto;
use rand::prelude::*;
@@ -77,6 +77,16 @@ pub(crate) struct Args {
#[clap(long, default_value = "1")]
queue_depth: NonZeroUsize,
/// Batch size of contiguous pages generated by each client. This is equivalent to how Postgres
/// will request page batches (e.g. prefetches or vectored reads). A batch counts as 1 RPS and
/// 1 queue depth.
///
/// The libpq protocol does not support client-side batching, so each batch is submitted as many
/// individual requests, in the hope that the server will batch them. Each batch still counts as
/// 1 RPS and 1 queue depth.
#[clap(long, default_value = "1")]
batch_size: NonZeroUsize,
#[clap(long)]
only_relnode: Option<u32>,
@@ -392,7 +402,16 @@ async fn run_worker(
shared_state.start_work_barrier.wait().await;
let client_start = Instant::now();
let mut ticks_processed = 0;
let mut inflight = VecDeque::new();
let mut req_id = 0;
let batch_size: usize = args.batch_size.into();
// Track inflight requests by request ID and start time. This times the request duration, and
// ensures responses match requests. We don't expect responses back in any particular order.
//
// NB: this does not check that all requests received a response, because we don't wait for the
// inflight requests to complete when the duration elapses.
let mut inflight: HashMap<u64, Instant> = HashMap::new();
while !cancel.is_cancelled() {
// Detect if a request took longer than the RPS rate
if let Some(period) = &rps_period {
@@ -408,36 +427,72 @@ async fn run_worker(
}
while inflight.len() < args.queue_depth.get() {
req_id += 1;
let start = Instant::now();
let req = {
let (req_lsn, mod_lsn, rel, blks) = {
/// Converts a compact i128 key to a relation tag and block number.
fn key_to_block(key: i128) -> (RelTag, u32) {
let key = Key::from_i128(key);
assert!(key.is_rel_block_key());
key.to_rel_block()
.expect("we filter non-rel-block keys out above")
}
// Pick a random page from a random relation.
let mut rng = rand::thread_rng();
let r = &ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = Key::from_i128(key);
assert!(key.is_rel_block_key());
let (rel_tag, block_no) = key
.to_rel_block()
.expect("we filter non-rel-block keys out above");
PagestreamGetPageRequest {
hdr: PagestreamRequest {
reqid: 0,
request_lsn: if rng.gen_bool(args.req_latest_probability) {
Lsn::MAX
} else {
r.timeline_lsn
},
not_modified_since: r.timeline_lsn,
},
rel: rel_tag,
blkno: block_no,
let (rel_tag, block_no) = key_to_block(key);
let mut blks = VecDeque::with_capacity(batch_size);
blks.push_back(block_no);
// If requested, populate a batch of sequential pages. This is how Postgres will
// request page batches (e.g. prefetches). If we hit the end of the relation, we
// grow the batch towards the start too.
for i in 1..batch_size {
let (r, b) = key_to_block(key + i as i128);
if r != rel_tag {
break; // went outside relation
}
blks.push_back(b)
}
if blks.len() < batch_size {
// Grow batch backwards if needed.
for i in 1..batch_size {
let (r, b) = key_to_block(key - i as i128);
if r != rel_tag {
break; // went outside relation
}
blks.push_front(b)
}
}
// We assume that the entire batch can fit within the relation.
assert_eq!(blks.len(), batch_size, "incomplete batch");
let req_lsn = if rng.gen_bool(args.req_latest_probability) {
Lsn::MAX
} else {
r.timeline_lsn
};
(req_lsn, r.timeline_lsn, rel_tag, blks.into())
};
client.send_get_page(req).await.unwrap();
inflight.push_back(start);
client
.send_get_page(req_id, req_lsn, mod_lsn, rel, blks)
.await
.unwrap();
let old = inflight.insert(req_id, start);
assert!(old.is_none(), "duplicate request ID {req_id}");
}
let start = inflight.pop_front().unwrap();
client.recv_get_page().await.unwrap();
let (req_id, pages) = client.recv_get_page().await.unwrap();
assert_eq!(pages.len(), batch_size, "unexpected page count");
assert!(pages.iter().all(|p| !p.is_empty()), "empty page");
let start = inflight
.remove(&req_id)
.expect("response for unknown request ID");
let end = Instant::now();
shared_state.live_stats.request_done();
ticks_processed += 1;
@@ -467,15 +522,24 @@ async fn run_worker(
#[async_trait]
trait Client: Send {
/// Sends an asynchronous GetPage request to the pageserver.
async fn send_get_page(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()>;
async fn send_get_page(
&mut self,
req_id: u64,
req_lsn: Lsn,
mod_lsn: Lsn,
rel: RelTag,
blks: Vec<u32>,
) -> anyhow::Result<()>;
/// Receives the next GetPage response from the pageserver.
async fn recv_get_page(&mut self) -> anyhow::Result<PagestreamGetPageResponse>;
async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)>;
}
/// A libpq-based Pageserver client.
struct LibpqClient {
inner: pageserver_client::page_service::PagestreamClient,
// Track sent batches, so we know how many responses to expect.
batch_sizes: VecDeque<usize>,
}
impl LibpqClient {
@@ -484,18 +548,55 @@ impl LibpqClient {
.await?
.pagestream(ttid.tenant_id, ttid.timeline_id)
.await?;
Ok(Self { inner })
Ok(Self {
inner,
batch_sizes: VecDeque::new(),
})
}
}
#[async_trait]
impl Client for LibpqClient {
async fn send_get_page(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
self.inner.getpage_send(req).await
async fn send_get_page(
&mut self,
req_id: u64,
req_lsn: Lsn,
mod_lsn: Lsn,
rel: RelTag,
blks: Vec<u32>,
) -> anyhow::Result<()> {
// libpq doesn't support client-side batches, so we send a bunch of individual requests
// instead in the hope that the server will batch them for us. We use the same request ID
// for all, because we'll return a single batch response.
self.batch_sizes.push_back(blks.len());
for blkno in blks {
let req = PagestreamGetPageRequest {
hdr: PagestreamRequest {
reqid: req_id,
request_lsn: req_lsn,
not_modified_since: mod_lsn,
},
rel,
blkno,
};
self.inner.getpage_send(req).await?;
}
Ok(())
}
async fn recv_get_page(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
self.inner.getpage_recv().await
async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
let batch_size = self.batch_sizes.pop_front().unwrap();
let mut batch = Vec::with_capacity(batch_size);
let mut req_id = None;
for _ in 0..batch_size {
let resp = self.inner.getpage_recv().await?;
if req_id.is_none() {
req_id = Some(resp.req.hdr.reqid);
}
assert_eq!(req_id, Some(resp.req.hdr.reqid), "request ID mismatch");
batch.push(resp.page);
}
Ok((req_id.unwrap(), batch))
}
}
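A hedged sketch of how the worker loop drives this trait: send one batched request, then match the response back to it by request ID. The RelTag and Lsn values are taken as parameters because their construction isn't shown in this diff:
async fn one_batch(client: &mut dyn Client, rel: RelTag, lsn: Lsn) -> anyhow::Result<std::time::Duration> {
    let start = std::time::Instant::now();
    // Request four contiguous pages as one logical batch (1 RPS, 1 queue depth).
    client.send_get_page(1, lsn, lsn, rel, vec![0, 1, 2, 3]).await?;
    let (req_id, pages) = client.recv_get_page().await?;
    anyhow::ensure!(req_id == 1, "response for unknown request ID");
    anyhow::ensure!(pages.len() == 4, "unexpected page count");
    Ok(start.elapsed())
}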
@@ -532,31 +633,35 @@ impl GrpcClient {
#[async_trait]
impl Client for GrpcClient {
async fn send_get_page(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
async fn send_get_page(
&mut self,
req_id: u64,
req_lsn: Lsn,
mod_lsn: Lsn,
rel: RelTag,
blks: Vec<u32>,
) -> anyhow::Result<()> {
let req = proto::GetPageRequest {
request_id: 0,
request_id: req_id,
request_class: proto::GetPageClass::Normal as i32,
read_lsn: Some(proto::ReadLsn {
request_lsn: req.hdr.request_lsn.0,
not_modified_since_lsn: req.hdr.not_modified_since.0,
request_lsn: req_lsn.0,
not_modified_since_lsn: mod_lsn.0,
}),
rel: Some(req.rel.into()),
block_number: vec![req.blkno],
rel: Some(rel.into()),
block_number: blks,
};
self.req_tx.send(req).await?;
Ok(())
}
async fn recv_get_page(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
let resp = self.resp_rx.message().await?.unwrap();
anyhow::ensure!(
resp.status_code == proto::GetPageStatusCode::Ok as i32,
"unexpected status code: {}",
resp.status_code
);
Ok(PagestreamGetPageResponse {
page: resp.page_image[0].clone(),
req: PagestreamGetPageRequest::default(), // dummy
})
Ok((resp.request_id, resp.page_image))
}
}

View File

@@ -158,7 +158,6 @@ fn main() -> anyhow::Result<()> {
// (maybe we should automate this with a visitor?).
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode");
info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol");
info!(?conf.validate_wal_contiguity, "starting with WAL contiguity validation");
info!(?conf.page_service_pipelining, "starting with page service pipelining config");
info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config");
@@ -819,6 +818,7 @@ fn start_pageserver(
tenant_manager.clone(),
grpc_auth,
otel_guard.as_ref().map(|g| g.dispatch.clone()),
conf.get_vectored_concurrent_io,
grpc_listener,
)?);
}

View File

@@ -27,7 +27,6 @@ use reqwest::Url;
use storage_broker::Uri;
use utils::id::{NodeId, TimelineId};
use utils::logging::{LogFormat, SecretString};
use utils::postgres_client::PostgresClientProtocol;
use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
@@ -211,8 +210,6 @@ pub struct PageServerConf {
/// Optionally disable disk syncs (unsafe!)
pub no_sync: bool,
pub wal_receiver_protocol: PostgresClientProtocol,
pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig,
pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo,
@@ -421,7 +418,6 @@ impl PageServerConf {
virtual_file_io_engine,
tenant_config,
no_sync,
wal_receiver_protocol,
page_service_pipelining,
get_vectored_concurrent_io,
enable_read_path_debugging,
@@ -484,7 +480,6 @@ impl PageServerConf {
import_pgdata_upcall_api,
import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from),
import_pgdata_aws_endpoint_url,
wal_receiver_protocol,
page_service_pipelining,
get_vectored_concurrent_io,
tracing,

View File

@@ -1,21 +1,28 @@
use std::{collections::HashMap, sync::Arc, time::Duration};
use posthog_client_lite::{
FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError,
CaptureEvent, FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError,
PostHogFlagFilterPropertyValue,
};
use remote_storage::RemoteStorageKind;
use serde_json::json;
use tokio_util::sync::CancellationToken;
use utils::id::TenantId;
use crate::config::PageServerConf;
use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION};
#[derive(Clone)]
pub struct FeatureResolver {
inner: Option<Arc<FeatureResolverBackgroundLoop>>,
internal_properties: Option<Arc<HashMap<String, PostHogFlagFilterPropertyValue>>>,
}
impl FeatureResolver {
pub fn new_disabled() -> Self {
Self { inner: None }
Self {
inner: None,
internal_properties: None,
}
}
pub fn spawn(
@@ -36,14 +43,114 @@ impl FeatureResolver {
shutdown_pageserver,
);
let inner = Arc::new(inner);
// TODO: make this configurable
inner.clone().spawn(handle, Duration::from_secs(60));
Ok(FeatureResolver { inner: Some(inner) })
// The properties shared by all tenants on this pageserver.
let internal_properties = {
let mut properties = HashMap::new();
properties.insert(
"pageserver_id".to_string(),
PostHogFlagFilterPropertyValue::String(conf.id.to_string()),
);
if let Some(availability_zone) = &conf.availability_zone {
properties.insert(
"availability_zone".to_string(),
PostHogFlagFilterPropertyValue::String(availability_zone.clone()),
);
}
// Infer region based on the remote storage config.
if let Some(remote_storage) = &conf.remote_storage_config {
match &remote_storage.storage {
RemoteStorageKind::AwsS3(config) => {
properties.insert(
"region".to_string(),
PostHogFlagFilterPropertyValue::String(format!(
"aws-{}",
config.bucket_region
)),
);
}
RemoteStorageKind::AzureContainer(config) => {
properties.insert(
"region".to_string(),
PostHogFlagFilterPropertyValue::String(format!(
"azure-{}",
config.container_region
)),
);
}
RemoteStorageKind::LocalFs { .. } => {
properties.insert(
"region".to_string(),
PostHogFlagFilterPropertyValue::String("local".to_string()),
);
}
}
}
// TODO: add pageserver URL.
Arc::new(properties)
};
let fake_tenants = {
let mut tenants = Vec::new();
for i in 0..10 {
let distinct_id = format!(
"fake_tenant_{}_{}_{}",
conf.availability_zone.as_deref().unwrap_or_default(),
conf.id,
i
);
let properties = Self::collect_properties_inner(
distinct_id.clone(),
Some(&internal_properties),
);
tenants.push(CaptureEvent {
event: "initial_tenant_report".to_string(),
distinct_id,
properties: json!({ "$set": properties }), // use `$set` to set the person properties instead of the event properties
});
}
tenants
};
// TODO: make refresh period configurable
inner
.clone()
.spawn(handle, Duration::from_secs(60), fake_tenants);
Ok(FeatureResolver {
inner: Some(inner),
internal_properties: Some(internal_properties),
})
} else {
Ok(FeatureResolver { inner: None })
Ok(FeatureResolver {
inner: None,
internal_properties: None,
})
}
}
fn collect_properties_inner(
tenant_id: String,
internal_properties: Option<&HashMap<String, PostHogFlagFilterPropertyValue>>,
) -> HashMap<String, PostHogFlagFilterPropertyValue> {
let mut properties = HashMap::new();
if let Some(internal_properties) = internal_properties {
for (key, value) in internal_properties.iter() {
properties.insert(key.clone(), value.clone());
}
}
properties.insert(
"tenant_id".to_string(),
PostHogFlagFilterPropertyValue::String(tenant_id),
);
properties
}
/// Collect all properties available for feature flag evaluation.
pub(crate) fn collect_properties(
&self,
tenant_id: TenantId,
) -> HashMap<String, PostHogFlagFilterPropertyValue> {
Self::collect_properties_inner(tenant_id.to_string(), self.internal_properties.as_deref())
}
/// Evaluate a multivariate feature flag. Currently, we do not support any properties.
///
/// Error handling: the caller should inspect the error and decide the behavior when a feature flag
@@ -55,11 +162,24 @@ impl FeatureResolver {
tenant_id: TenantId,
) -> Result<String, PostHogEvaluationError> {
if let Some(inner) = &self.inner {
inner.feature_store().evaluate_multivariate(
let res = inner.feature_store().evaluate_multivariate(
flag_key,
&tenant_id.to_string(),
&HashMap::new(),
)
&self.collect_properties(tenant_id),
);
match &res {
Ok(value) => {
FEATURE_FLAG_EVALUATION
.with_label_values(&[flag_key, "ok", value])
.inc();
}
Err(e) => {
FEATURE_FLAG_EVALUATION
.with_label_values(&[flag_key, "error", e.as_variant_str()])
.inc();
}
}
res
} else {
Err(PostHogEvaluationError::NotAvailable(
"PostHog integration is not enabled".to_string(),
@@ -80,11 +200,24 @@ impl FeatureResolver {
tenant_id: TenantId,
) -> Result<(), PostHogEvaluationError> {
if let Some(inner) = &self.inner {
inner.feature_store().evaluate_boolean(
let res = inner.feature_store().evaluate_boolean(
flag_key,
&tenant_id.to_string(),
&HashMap::new(),
)
&self.collect_properties(tenant_id),
);
match &res {
Ok(()) => {
FEATURE_FLAG_EVALUATION
.with_label_values(&[flag_key, "ok", "true"])
.inc();
}
Err(e) => {
FEATURE_FLAG_EVALUATION
.with_label_values(&[flag_key, "error", e.as_variant_str()])
.inc();
}
}
res
} else {
Err(PostHogEvaluationError::NotAvailable(
"PostHog integration is not enabled".to_string(),

View File

@@ -43,6 +43,7 @@ use pageserver_api::models::{
use pageserver_api::shard::{ShardCount, TenantShardId};
use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
use scopeguard::defer;
use serde_json::json;
use tenant_size_model::svg::SvgBranchKind;
use tenant_size_model::{SizeResult, StorageModel};
use tokio::time::Instant;
@@ -3679,23 +3680,24 @@ async fn tenant_evaluate_feature_flag(
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
let properties = tenant.feature_resolver.collect_properties(tenant_shard_id.tenant_id);
if as_type == "boolean" {
let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id);
let result = result.map(|_| true).map_err(|e| e.to_string());
json_response(StatusCode::OK, result)
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
} else if as_type == "multivariate" {
let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string());
json_response(StatusCode::OK, result)
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
} else {
// Auto infer the type of the feature flag.
let is_boolean = tenant.feature_resolver.is_feature_flag_boolean(&flag).map_err(|e| ApiError::InternalServerError(anyhow::anyhow!("{e}")))?;
if is_boolean {
let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id);
let result = result.map(|_| true).map_err(|e| e.to_string());
json_response(StatusCode::OK, result)
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
} else {
let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string());
json_response(StatusCode::OK, result)
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
}
}
}

View File

@@ -446,6 +446,15 @@ static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static FEATURE_FLAG_EVALUATION: Lazy<CounterVec> = Lazy::new(|| {
register_counter_vec!(
"pageserver_feature_flag_evaluation",
"Number of times a feature flag is evaluated",
&["flag_key", "status", "value"],
)
.unwrap()
});
#[derive(IntoStaticStr)]
#[strum(serialize_all = "kebab_case")]
pub(crate) enum PageCacheErrorKind {
@@ -2846,7 +2855,6 @@ pub(crate) struct WalIngestMetrics {
pub(crate) records_received: IntCounter,
pub(crate) records_observed: IntCounter,
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
pub(crate) values_committed_metadata_images: IntCounter,
pub(crate) values_committed_metadata_deltas: IntCounter,
pub(crate) values_committed_data_images: IntCounter,
@@ -2902,11 +2910,6 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
"Number of WAL records which resulted in writes to pageserver storage"
)
.expect("failed to define a metric"),
records_filtered: register_int_counter!(
"pageserver_wal_ingest_records_filtered",
"Number of WAL records filtered out due to sharding"
)
.expect("failed to define a metric"),
values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]),
values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]),
values_committed_data_images: values_committed.with_label_values(&["data", "image"]),

View File

@@ -178,6 +178,7 @@ pub fn spawn_grpc(
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
perf_trace_dispatch: Option<Dispatch>,
get_vectored_concurrent_io: GetVectoredConcurrentIo,
listener: std::net::TcpListener,
) -> anyhow::Result<CancellableTask> {
let cancel = CancellationToken::new();
@@ -214,6 +215,8 @@ pub fn spawn_grpc(
let page_service_handler = GrpcPageServiceHandler {
tenant_manager,
ctx,
gate_guard: gate.enter().expect("gate was just created"),
get_vectored_concurrent_io,
};
let observability_layer = ObservabilityLayer;
@@ -497,10 +500,6 @@ async fn page_service_conn_main(
}
/// Page service connection handler.
///
/// TODO: for gRPC, this will be shared by all requests from all connections.
/// Decompose it into global state and per-connection/request state, and make
/// libpq-specific options (e.g. pipelining) separate.
struct PageServerHandler {
auth: Option<Arc<SwappableJwtAuth>>,
claims: Option<Claims>,
@@ -3362,6 +3361,8 @@ where
pub struct GrpcPageServiceHandler {
tenant_manager: Arc<TenantManager>,
ctx: RequestContext,
gate_guard: GateGuard,
get_vectored_concurrent_io: GetVectoredConcurrentIo,
}
impl GrpcPageServiceHandler {
@@ -3721,6 +3722,14 @@ impl proto::PageService for GrpcPageServiceHandler {
.get(ttid.tenant_id, ttid.timeline_id, shard_selector)
.await?;
// Spawn an IoConcurrency sidecar, if enabled.
let Ok(gate_guard) = self.gate_guard.try_clone() else {
return Err(tonic::Status::unavailable("shutting down"));
};
let io_concurrency =
IoConcurrency::spawn_from_conf(self.get_vectored_concurrent_io, gate_guard);
// Spawn a task to handle the GetPageRequest stream.
let span = Span::current();
let ctx = self.ctx.attached_child();
let mut reqs = req.into_inner();
@@ -3731,8 +3740,7 @@ impl proto::PageService for GrpcPageServiceHandler {
.await?
.downgrade();
while let Some(req) = reqs.message().await? {
// TODO: implement IoConcurrency sidecar.
yield Self::get_page(&ctx, &timeline, req, IoConcurrency::Sequential)
yield Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
.instrument(span.clone()) // propagate request span
.await?
}

View File

@@ -2506,6 +2506,13 @@ impl Timeline {
// Preparing basebackup doesn't make sense for shards other than shard zero.
return;
}
if !self.is_active() {
// May happen during initial timeline creation.
// Such a timeline is not in the global timeline map yet,
// so the basebackup cache will not be able to find it.
// TODO(diko): We can prepare such timelines in finish_creation().
return;
}
let res = self
.basebackup_prepare_sender
@@ -2845,21 +2852,6 @@ impl Timeline {
)
}
/// Resolve the effective WAL receiver protocol to use for this tenant.
///
/// Priority order is:
/// 1. Tenant config override
/// 2. Default value for tenant config override
/// 3. Pageserver config override
/// 4. Pageserver config default
pub fn resolve_wal_receiver_protocol(&self) -> PostgresClientProtocol {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.wal_receiver_protocol_override
.or(self.conf.default_tenant_conf.wal_receiver_protocol_override)
.unwrap_or(self.conf.wal_receiver_protocol)
}
pub(super) fn tenant_conf_updated(&self, new_conf: &AttachedTenantConf) {
// NB: Most tenant conf options are read by background loops, so,
// changes will automatically be picked up.
@@ -3215,10 +3207,16 @@ impl Timeline {
guard.is_none(),
"multiple launches / re-launches of WAL receiver are not supported"
);
let protocol = PostgresClientProtocol::Interpreted {
format: utils::postgres_client::InterpretedFormat::Protobuf,
compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }),
};
*guard = Some(WalReceiver::start(
Arc::clone(self),
WalReceiverConf {
protocol: self.resolve_wal_receiver_protocol(),
protocol,
wal_connect_timeout,
lagging_wal_timeout,
max_lsn_wal_lag,

View File

@@ -100,6 +100,7 @@ async fn run_v1(
.unwrap(),
import_job_concurrency: base.import_job_concurrency,
import_job_checkpoint_threshold: base.import_job_checkpoint_threshold,
import_job_max_byte_range_size: base.import_job_max_byte_range_size,
}
}
None => timeline.conf.timeline_import_config.clone(),
@@ -441,6 +442,7 @@ impl Plan {
let mut last_completed_job_idx = start_after_job_idx.unwrap_or(0);
let checkpoint_every: usize = import_config.import_job_checkpoint_threshold.into();
let max_byte_range_size: usize = import_config.import_job_max_byte_range_size.into();
// Run import jobs concurrently up to the limit specified by the pageserver configuration.
// Note that we process completed futures in the order of insertion. This will be the
@@ -456,7 +458,7 @@ impl Plan {
work.push_back(tokio::task::spawn(async move {
let _permit = permit;
let res = job.run(job_timeline, &ctx).await;
let res = job.run(job_timeline, max_byte_range_size, &ctx).await;
(job_idx, res)
}));
},
@@ -679,6 +681,7 @@ trait ImportTask {
async fn doit(
self,
layer_writer: &mut ImageLayerWriter,
max_byte_range_size: usize,
ctx: &RequestContext,
) -> anyhow::Result<usize>;
}
@@ -715,6 +718,7 @@ impl ImportTask for ImportSingleKeyTask {
async fn doit(
self,
layer_writer: &mut ImageLayerWriter,
_max_byte_range_size: usize,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
layer_writer.put_image(self.key, self.buf, ctx).await?;
@@ -768,10 +772,9 @@ impl ImportTask for ImportRelBlocksTask {
async fn doit(
self,
layer_writer: &mut ImageLayerWriter,
max_byte_range_size: usize,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
const MAX_BYTE_RANGE_SIZE: usize = 4 * 1024 * 1024;
debug!("Importing relation file");
let (rel_tag, start_blk) = self.key_range.start.to_rel_block()?;
@@ -796,7 +799,7 @@ impl ImportTask for ImportRelBlocksTask {
assert_eq!(key.len(), 1);
assert!(!acc.is_empty());
assert!(acc_end > acc_start);
if acc_end == start && end - acc_start <= MAX_BYTE_RANGE_SIZE {
if acc_end == start && end - acc_start <= max_byte_range_size {
acc.push(key.pop().unwrap());
Ok((acc, acc_start, end))
} else {
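The condition above implements simple range coalescing: a key is folded into the accumulator only if its byte range starts exactly where the accumulated range ends and the merged span stays within the configured max_byte_range_size. The same predicate as a standalone sketch (illustrative only; the names are not from the patch, and C is used to match the other sketches in this comparison):

/* Can candidate range [start, end) be merged into accumulator [acc_start, acc_end)? */
static bool
can_merge(uint64 acc_start, uint64 acc_end, uint64 start, uint64 end,
          uint64 max_byte_range_size)
{
	return acc_end == start                        /* physically adjacent */
	    && end - acc_start <= max_byte_range_size; /* cap the merged size */
}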
@@ -860,6 +863,7 @@ impl ImportTask for ImportSlruBlocksTask {
async fn doit(
self,
layer_writer: &mut ImageLayerWriter,
_max_byte_range_size: usize,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
debug!("Importing SLRU segment file {}", self.path);
@@ -906,12 +910,13 @@ impl ImportTask for AnyImportTask {
async fn doit(
self,
layer_writer: &mut ImageLayerWriter,
max_byte_range_size: usize,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
match self {
Self::SingleKey(t) => t.doit(layer_writer, ctx).await,
Self::RelBlocks(t) => t.doit(layer_writer, ctx).await,
Self::SlruBlocks(t) => t.doit(layer_writer, ctx).await,
Self::SingleKey(t) => t.doit(layer_writer, max_byte_range_size, ctx).await,
Self::RelBlocks(t) => t.doit(layer_writer, max_byte_range_size, ctx).await,
Self::SlruBlocks(t) => t.doit(layer_writer, max_byte_range_size, ctx).await,
}
}
}
@@ -952,7 +957,12 @@ impl ChunkProcessingJob {
}
}
async fn run(self, timeline: Arc<Timeline>, ctx: &RequestContext) -> anyhow::Result<()> {
async fn run(
self,
timeline: Arc<Timeline>,
max_byte_range_size: usize,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut writer = ImageLayerWriter::new(
timeline.conf,
timeline.timeline_id,
@@ -967,7 +977,7 @@ impl ChunkProcessingJob {
let mut nimages = 0;
for task in self.tasks {
nimages += task.doit(&mut writer, ctx).await?;
nimages += task.doit(&mut writer, max_byte_range_size, ctx).await?;
}
let resident_layer = if nimages > 0 {

View File

@@ -32,9 +32,7 @@ use utils::backoff::{
};
use utils::id::{NodeId, TenantTimelineId};
use utils::lsn::Lsn;
use utils::postgres_client::{
ConnectionConfigArgs, PostgresClientProtocol, wal_stream_connection_config,
};
use utils::postgres_client::{ConnectionConfigArgs, wal_stream_connection_config};
use super::walreceiver_connection::{WalConnectionStatus, WalReceiverError};
use super::{TaskEvent, TaskHandle, TaskStateUpdate, WalReceiverConf};
@@ -991,19 +989,12 @@ impl ConnectionManagerState {
return None; // no connection string, ignore sk
}
let (shard_number, shard_count, shard_stripe_size) = match self.conf.protocol {
PostgresClientProtocol::Vanilla => {
(None, None, None)
},
PostgresClientProtocol::Interpreted { .. } => {
let shard_identity = self.timeline.get_shard_identity();
(
Some(shard_identity.number.0),
Some(shard_identity.count.0),
Some(shard_identity.stripe_size.0),
)
}
};
let shard_identity = self.timeline.get_shard_identity();
let (shard_number, shard_count, shard_stripe_size) = (
Some(shard_identity.number.0),
Some(shard_identity.count.0),
Some(shard_identity.stripe_size.0),
);
let connection_conf_args = ConnectionConfigArgs {
protocol: self.conf.protocol,
@@ -1120,8 +1111,8 @@ impl ReconnectReason {
#[cfg(test)]
mod tests {
use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL;
use url::Host;
use utils::postgres_client::PostgresClientProtocol;
use super::*;
use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
@@ -1552,6 +1543,11 @@ mod tests {
.await
.expect("Failed to create an empty timeline for dummy wal connection manager");
let protocol = PostgresClientProtocol::Interpreted {
format: utils::postgres_client::InterpretedFormat::Protobuf,
compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }),
};
ConnectionManagerState {
id: TenantTimelineId {
tenant_id: harness.tenant_shard_id.tenant_id,
@@ -1560,7 +1556,7 @@ mod tests {
timeline,
cancel: CancellationToken::new(),
conf: WalReceiverConf {
protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
protocol,
wal_connect_timeout: Duration::from_secs(1),
lagging_wal_timeout: Duration::from_secs(1),
max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),

View File

@@ -15,7 +15,7 @@ use postgres_backend::is_expected_io_error;
use postgres_connection::PgConnectionConfig;
use postgres_ffi::WAL_SEGMENT_SIZE;
use postgres_ffi::v14::xlog_utils::normalize_lsn;
use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder};
use postgres_ffi::waldecoder::WalDecodeError;
use postgres_protocol::message::backend::ReplicationMessage;
use postgres_types::PgLsn;
use tokio::sync::watch;
@@ -31,7 +31,7 @@ use utils::lsn::Lsn;
use utils::pageserver_feedback::PageserverFeedback;
use utils::postgres_client::PostgresClientProtocol;
use utils::sync::gate::GateError;
use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords};
use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecords};
use wal_decoder::wire_format::FromWireFormat;
use super::TaskStateUpdate;
@@ -275,8 +275,6 @@ pub(super) async fn handle_walreceiver_connection(
let copy_stream = replication_client.copy_both_simple(&query).await?;
let mut physical_stream = pin!(ReplicationStream::new(copy_stream));
let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
.await
.map_err(|e| match e.kind {
@@ -284,14 +282,16 @@ pub(super) async fn handle_walreceiver_connection(
_ => WalReceiverError::Other(e.into()),
})?;
let shard = vec![*timeline.get_shard_identity()];
let interpreted_proto_config = match protocol {
PostgresClientProtocol::Vanilla => None,
let (format, compression) = match protocol {
PostgresClientProtocol::Interpreted {
format,
compression,
} => Some((format, compression)),
} => (format, compression),
PostgresClientProtocol::Vanilla => {
return Err(WalReceiverError::Other(anyhow!(
"Vanilla WAL receiver protocol is no longer supported for ingest"
)));
}
};
let mut expected_wal_start = startpoint;
@@ -313,16 +313,6 @@ pub(super) async fn handle_walreceiver_connection(
// Update the connection status before processing the message. If the message processing
// fails (e.g. in walingest), we still want to know the latest LSNs from the safekeeper.
match &replication_message {
ReplicationMessage::XLogData(xlog_data) => {
connection_status.latest_connection_update = now;
connection_status.commit_lsn = Some(Lsn::from(xlog_data.wal_end()));
connection_status.streaming_lsn = Some(Lsn::from(
xlog_data.wal_start() + xlog_data.data().len() as u64,
));
if !xlog_data.data().is_empty() {
connection_status.latest_wal_update = now;
}
}
ReplicationMessage::PrimaryKeepAlive(keepalive) => {
connection_status.latest_connection_update = now;
connection_status.commit_lsn = Some(Lsn::from(keepalive.wal_end()));
@@ -353,7 +343,6 @@ pub(super) async fn handle_walreceiver_connection(
// were interpreted.
let streaming_lsn = Lsn::from(raw.streaming_lsn());
let (format, compression) = interpreted_proto_config.unwrap();
let batch = InterpretedWalRecords::from_wire(raw.data(), format, compression)
.await
.with_context(|| {
@@ -509,138 +498,6 @@ pub(super) async fn handle_walreceiver_connection(
Some(streaming_lsn)
}
ReplicationMessage::XLogData(xlog_data) => {
async fn commit(
modification: &mut DatadirModification<'_>,
uncommitted: &mut u64,
filtered: &mut u64,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let stats = modification.stats();
modification.commit(ctx).await?;
WAL_INGEST
.records_committed
.inc_by(*uncommitted - *filtered);
WAL_INGEST.inc_values_committed(&stats);
*uncommitted = 0;
*filtered = 0;
Ok(())
}
// Pass the WAL data to the decoder, and see if we can decode
// more records as a result.
let data = xlog_data.data();
let startlsn = Lsn::from(xlog_data.wal_start());
let endlsn = startlsn + data.len() as u64;
trace!("received XLogData between {startlsn} and {endlsn}");
WAL_INGEST.bytes_received.inc_by(data.len() as u64);
waldecoder.feed_bytes(data);
{
let mut modification = timeline.begin_modification(startlsn);
let mut uncommitted_records = 0;
let mut filtered_records = 0;
while let Some((next_record_lsn, recdata)) = waldecoder.poll_decode()? {
// It is important to deal with the aligned records as lsn in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are
// at risk of hitting a deadlock.
if !next_record_lsn.is_aligned() {
return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
}
// Deserialize and interpret WAL record
let interpreted = InterpretedWalRecord::from_bytes_filtered(
recdata,
&shard,
next_record_lsn,
modification.tline.pg_version,
)?
.remove(timeline.get_shard_identity())
.unwrap();
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
&& uncommitted_records > 0
{
// Special case: legacy PG database creations operate by reading pages from a 'template' database:
// these are the only kinds of WAL record that require reading data blocks while ingesting. Ensure
// all earlier writes of data blocks are visible by committing any modification in flight.
commit(
&mut modification,
&mut uncommitted_records,
&mut filtered_records,
&ctx,
)
.await?;
}
// Ingest the records without immediately committing them.
timeline.metrics.wal_records_received.inc();
let ingested = walingest
.ingest_record(interpreted, &mut modification, &ctx)
.await
.with_context(|| {
format!("could not ingest record at {next_record_lsn}")
})
.inspect_err(|err| {
// TODO: we can't differentiate cancellation errors with
// anyhow::Error, so just ignore it if we're cancelled.
if !cancellation.is_cancelled() && !timeline.is_stopping() {
critical!("{err:?}")
}
})?;
if !ingested {
tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}");
WAL_INGEST.records_filtered.inc();
filtered_records += 1;
}
// FIXME: this cannot be made pausable_failpoint without fixing the
// failpoint library; in tests, the added amount of debugging will cause us
// to timeout the tests.
fail_point!("walreceiver-after-ingest");
last_rec_lsn = next_record_lsn;
// Commit every ingest_batch_size records. Even if we filtered out
// all records, we still need to call commit to advance the LSN.
uncommitted_records += 1;
if uncommitted_records >= ingest_batch_size
|| modification.approx_pending_bytes()
> DatadirModification::MAX_PENDING_BYTES
{
commit(
&mut modification,
&mut uncommitted_records,
&mut filtered_records,
&ctx,
)
.await?;
}
}
// Commit the remaining records.
if uncommitted_records > 0 {
commit(
&mut modification,
&mut uncommitted_records,
&mut filtered_records,
&ctx,
)
.await?;
}
}
if !caught_up && endlsn >= end_of_wal {
info!("caught up at LSN {endlsn}");
caught_up = true;
}
Some(endlsn)
}
ReplicationMessage::PrimaryKeepAlive(keepalive) => {
let wal_end = keepalive.wal_end();
let timestamp = keepalive.timestamp();

View File

@@ -1,6 +1,5 @@
# pgxs/neon/Makefile
MODULE_big = neon
OBJS = \
$(WIN32RES) \
@@ -22,7 +21,8 @@ OBJS = \
walproposer.o \
walproposer_pg.o \
control_plane_connector.o \
walsender_hooks.o
walsender_hooks.o \
$(LIBCOMMUNICATOR_PATH)/libcommunicator.a
PG_CPPFLAGS = -I$(libpq_srcdir)
SHLIB_LINK_INTERNAL = $(libpq)

View File

@@ -0,0 +1,13 @@
[package]
name = "communicator"
version = "0.1.0"
edition = "2024"
[lib]
crate-type = ["staticlib"]
[dependencies]
neon-shmem.workspace = true
[build-dependencies]
cbindgen.workspace = true

View File

@@ -0,0 +1,8 @@
This package will evolve into a "compute-pageserver communicator"
process and machinery. For now, it just provides wrappers around the
neon-shmem Rust crate, to allow using it in the C implementation of
the LFC.
At compilation time, pgxn/neon/communicator/ produces a static
library, libcommunicator.a. It is linked to the neon.so extension
library.

View File

@@ -0,0 +1,22 @@
use std::env;
fn main() -> Result<(), Box<dyn std::error::Error>> {
let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
cbindgen::generate(crate_dir).map_or_else(
|error| match error {
cbindgen::Error::ParseSyntaxError { .. } => {
// This means there was a syntax error in the Rust sources. Don't panic, because
// we want the build to continue and the Rust compiler to hit the error. The
// Rust compiler produces a better error message than cbindgen.
eprintln!("Generating C bindings failed because of a Rust syntax error");
}
e => panic!("Unable to generate C bindings: {:?}", e),
},
|bindings| {
bindings.write_to_file("communicator_bindings.h");
},
);
Ok(())
}

View File

@@ -0,0 +1,4 @@
language = "C"
[enum]
prefix_with_name = true

View File

@@ -0,0 +1,240 @@
//! Glue code to allow using the Rust shmem hash map implementation from C code
//!
//! For convenience of adapting existing code, the provided interface somewhat resembles the dynahash
//! interface.
//!
//! NOTE: The caller is responsible for locking! The caller is expected to hold the PostgreSQL
//! LWLock, 'lfc_lock', while accessing the hash table, in shared or exclusive mode as appropriate.
use std::ffi::c_void;
use std::marker::PhantomData;
use neon_shmem::hash::entry::Entry;
use neon_shmem::hash::{HashMapAccess, HashMapInit};
use neon_shmem::shmem::ShmemHandle;
/// NB: This must match the definition of BufferTag in Postgres C headers. We could use bindgen to
/// generate this from the C headers, but we prefer not to introduce a dependency on bindgen for now.
///
/// Note that there are no padding bytes. If the corresponding C struct has padding bytes, the C
/// code must clear them.
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
#[repr(C)]
pub struct FileCacheKey {
pub _spc_id: u32,
pub _db_id: u32,
pub _rel_number: u32,
pub _fork_num: u32,
pub _block_num: u32,
}
/// Like with FileCacheKey, this must match the definition of FileCacheEntry in file_cache.c. We
/// don't look at the contents here though; it's sufficient that the size and alignment match.
#[derive(Clone, Debug, Default)]
#[repr(C)]
pub struct FileCacheEntry {
pub _offset: u32,
pub _access_count: u32,
pub _prev: *mut FileCacheEntry,
pub _next: *mut FileCacheEntry,
pub _state: [u32; 8],
}
/// XXX: This could be just:
///
/// ```ignore
/// type FileCacheHashMapHandle = HashMapInit<'a, FileCacheKey, FileCacheEntry>
/// ```
///
/// but with that, cbindgen generates a broken typedef in the C header file which doesn't
/// compile. It apparently gets confused by the generics.
#[repr(transparent)]
pub struct FileCacheHashMapHandle<'a>(
pub *mut c_void,
PhantomData<HashMapInit<'a, FileCacheKey, FileCacheEntry>>,
);
impl<'a> From<Box<HashMapInit<'a, FileCacheKey, FileCacheEntry>>> for FileCacheHashMapHandle<'a> {
fn from(x: Box<HashMapInit<'a, FileCacheKey, FileCacheEntry>>) -> Self {
FileCacheHashMapHandle(Box::into_raw(x) as *mut c_void, PhantomData::default())
}
}
impl<'a> From<FileCacheHashMapHandle<'a>> for Box<HashMapInit<'a, FileCacheKey, FileCacheEntry>> {
fn from(x: FileCacheHashMapHandle) -> Self {
unsafe { Box::from_raw(x.0.cast()) }
}
}
/// XXX: same for this
#[repr(transparent)]
pub struct FileCacheHashMapAccess<'a>(
pub *mut c_void,
PhantomData<HashMapAccess<'a, FileCacheKey, FileCacheEntry>>,
);
impl<'a> From<Box<HashMapAccess<'a, FileCacheKey, FileCacheEntry>>> for FileCacheHashMapAccess<'a> {
fn from(x: Box<HashMapAccess<'a, FileCacheKey, FileCacheEntry>>) -> Self {
// Convert the Box into a raw mutable pointer to the HashMapAccess itself.
// This transfers ownership of the HashMapAccess (and its contained ShmemHandle)
// to the raw pointer. The C caller is now responsible for managing this memory.
FileCacheHashMapAccess(Box::into_raw(x) as *mut c_void, PhantomData::default())
}
}
impl<'a> FileCacheHashMapAccess<'a> {
fn as_ref(self) -> &'a HashMapAccess<'a, FileCacheKey, FileCacheEntry> {
let ptr: *mut HashMapAccess<'_, FileCacheKey, FileCacheEntry> = self.0.cast();
unsafe { ptr.as_ref().unwrap() }
}
fn as_mut(self) -> &'a mut HashMapAccess<'a, FileCacheKey, FileCacheEntry> {
let ptr: *mut HashMapAccess<'_, FileCacheKey, FileCacheEntry> = self.0.cast();
unsafe { ptr.as_mut().unwrap() }
}
}
/// Initialize the shared memory area at postmaster startup. The returned handle is inherited
/// by all the backend processes across fork()
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_shmem_init<'a>(
initial_num_buckets: u32,
max_num_buckets: u32,
) -> FileCacheHashMapHandle<'a> {
let max_bytes = HashMapInit::<FileCacheKey, FileCacheEntry>::estimate_size(max_num_buckets);
let shmem_handle =
ShmemHandle::new("lfc mapping", 0, max_bytes).expect("shmem initialization failed");
let handle = HashMapInit::<FileCacheKey, FileCacheEntry>::init_in_shmem(
initial_num_buckets,
shmem_handle,
);
Box::new(handle).into()
}
/// Initialize the access to the shared memory area in a backend process.
///
/// XXX: I'm not sure if this actually gets called in each process, or if the returned struct
/// is also inherited across fork(). It currently works either way but if this did more
/// initialization that needed to be done after fork(), then it would matter.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_shmem_access<'a>(
handle: FileCacheHashMapHandle<'a>,
) -> FileCacheHashMapAccess<'a> {
let handle: Box<HashMapInit<'_, FileCacheKey, FileCacheEntry>> = handle.into();
Box::new(handle.attach_writer()).into()
}
/// Return the current number of buckets in the hash table
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_get_num_buckets<'a>(
map: FileCacheHashMapAccess<'static>,
) -> u32 {
let map = map.as_ref();
map.get_num_buckets().try_into().unwrap()
}
/// Look up the entry with given key and hash.
///
/// This is similar to dynahash's hash_search(... , HASH_FIND)
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_hash_find<'a>(
map: FileCacheHashMapAccess<'static>,
key: &FileCacheKey,
hash: u64,
) -> Option<&'static FileCacheEntry> {
let map = map.as_ref();
map.get_with_hash(key, hash)
}
/// Look up the entry at given bucket position
///
/// This has no direct equivalent in the dynahash interface, but can be used to
/// iterate through all entries in the hash table.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_hash_get_at_pos<'a>(
map: FileCacheHashMapAccess<'static>,
pos: u32,
) -> Option<&'static FileCacheEntry> {
let map = map.as_ref();
map.get_at_bucket(pos as usize).map(|(_k, v)| v)
}
/// Remove entry, given a pointer to the value.
///
/// This is equivalent to dynahash hash_search(entry->key, HASH_REMOVE), where 'entry'
/// is an entry you have previously looked up
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_hash_remove_entry<'a, 'b>(
map: FileCacheHashMapAccess,
entry: *mut FileCacheEntry,
) {
let map = map.as_mut();
let pos = map.get_bucket_for_value(entry);
match map.entry_at_bucket(pos) {
Some(e) => {
e.remove();
}
None => {
// todo: shouldn't happen, panic?
}
}
}
/// Compute the hash for given key
///
/// This is equivalent to dynahash's get_hash_value() function. We use Rust's default hasher
/// for calculating the hash though.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_get_hash_value<'a, 'b>(
map: FileCacheHashMapAccess<'static>,
key: &FileCacheKey,
) -> u64 {
map.as_ref().get_hash_value(key)
}
/// Insert a new entry into the hash table
///
/// This is equivalent to dynahash hash_search(..., HASH_ENTER).
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_hash_enter<'a, 'b>(
map: FileCacheHashMapAccess,
key: &FileCacheKey,
hash: u64,
found: &mut bool,
) -> *mut FileCacheEntry {
match map.as_mut().entry_with_hash(key.clone(), hash) {
Entry::Occupied(mut e) => {
*found = true;
e.get_mut()
}
Entry::Vacant(e) => {
*found = false;
let initial_value = FileCacheEntry::default();
e.insert(initial_value).expect("TODO: hash table full")
}
}
}
/// Get the key for a given entry, which must be present in the hash table.
///
/// Dynahash requires the key to be part of the "value" struct, so you can always
/// access the key with something like `entry->key`. The Rust implementation however
/// stores the key separately. This function extracts the separately stored key.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_hash_get_key_for_entry<'a, 'b>(
map: FileCacheHashMapAccess,
entry: *const FileCacheEntry,
) -> Option<&FileCacheKey> {
let map = map.as_ref();
let pos = map.get_bucket_for_value(entry);
map.get_at_bucket(pos as usize).map(|(k, _v)| k)
}
/// Remove all entries from the hash table
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_hash_reset<'a, 'b>(map: FileCacheHashMapAccess) {
let map = map.as_mut();
let num_buckets = map.get_num_buckets();
for i in 0..num_buckets {
if let Some(e) = map.entry_at_bucket(i) {
e.remove();
}
}
}
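A usage sketch, not part of the diff: roughly how the C side drives these exports in the dynahash style described in the module comment. It assumes the file_cache_rust_hash.h wrappers forward the file_cache_hash_*() names used by file_cache.c to these bcomm_file_cache_*() functions, and that the caller holds 'lfc_lock' exclusively for the insert.

BufferTag tag;                  /* key, filled in by the caller */
uint64    hash;
bool      found;
FileCacheEntry *entry;

hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
entry = file_cache_hash_enter(lfc_hash, &tag, hash, &found);  /* like HASH_ENTER */
if (!found)
	entry->access_count = 1;    /* freshly inserted entry: initialize it */
LWLockRelease(lfc_lock);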

View File

@@ -0,0 +1 @@
pub mod file_cache_hashmap;

View File

@@ -21,7 +21,7 @@
#include "access/xlog.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "common/hashfn.h"
#include "common/file_utils.h"
#include "pgstat.h"
#include "port/pg_iovec.h"
#include "postmaster/bgworker.h"
@@ -36,7 +36,6 @@
#include "storage/procsignal.h"
#include "tcop/tcopprot.h"
#include "utils/builtins.h"
#include "utils/dynahash.h"
#include "utils/guc.h"
#if PG_VERSION_NUM >= 150000
@@ -46,6 +45,7 @@
#include "hll.h"
#include "bitmap.h"
#include "file_cache.h"
#include "file_cache_rust_hash.h"
#include "neon.h"
#include "neon_lwlsncache.h"
#include "neon_perf_counters.h"
@@ -64,7 +64,7 @@
*
* Cache is always reconstructed at node startup, so we do not need to save mapping somewhere and worry about
* its consistency.
*
*
* ## Holes
*
@@ -76,13 +76,15 @@
* fallocate(FALLOC_FL_PUNCH_HOLE) call. The nominal size of the file doesn't
* shrink, but the disk space it uses does.
*
* Each hole is tracked by a dummy FileCacheEntry, which are kept in the
* 'holes' linked list. They are entered into the chunk hash table, with a
* special key where the blockNumber is used to store the 'offset' of the
* hole, and all other fields are zero. Holes are never looked up in the hash
* table, we only enter them there to have a FileCacheEntry that we can keep
* in the linked list. If the soft limit is raised again, we reuse the holes
* before extending the nominal size of the file.
* Each hole is tracked in a freelist. The freelist consists of two parts: a
* fixed-size array in shared memory, and a linked chain of on-disk
* blocks. When the in-memory array fills up, it's flushed to a new on-disk
* chunk. If the soft limit is raised again, we reuse the holes before
* extending the nominal size of the file.
*
* The in-memory freelist array is protected by 'lfc_lock', while the on-disk
* chain is protected by a separate 'lfc_freelist_lock'. Locking rule to
* avoid deadlocks: always acquire lfc_freelist_lock first, then lfc_lock.
*/
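A minimal sketch of that locking rule (this mirrors what lfc_change_limit_hook does below when it shrinks the cache):

LWLockAcquire(lfc_freelist_lock, LW_EXCLUSIVE);
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
/* ... evict victims and push the freed offsets with freelist_push() ... */
LWLockRelease(lfc_lock);
LWLockRelease(lfc_freelist_lock);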
/* Local file storage allocation chunk.
@@ -92,13 +94,15 @@
* 1Mb chunks can reduce hash map size to 320Mb.
* 2. Improve access locality, subsequent pages will be allocated together improving seqscan speed
*/
#define MAX_BLOCKS_PER_CHUNK_LOG 7 /* 1Mb chunk */
#define MAX_BLOCKS_PER_CHUNK (1 << MAX_BLOCKS_PER_CHUNK_LOG)
#define BLOCKS_PER_CHUNK_LOG 7 /* 1Mb chunk */
#define BLOCKS_PER_CHUNK (1 << BLOCKS_PER_CHUNK_LOG)
#define MB ((uint64)1024*1024)
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ >> lfc_chunk_size_log))
#define BLOCK_TO_CHUNK_OFF(blkno) ((blkno) & (lfc_blocks_per_chunk-1))
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ >> BLOCKS_PER_CHUNK_LOG))
#define BLOCK_TO_CHUNK_OFF(blkno) ((blkno) & (BLOCKS_PER_CHUNK-1))
#define INVALID_OFFSET (0xffffffff)
/*
* Blocks are read or written to LFC file outside LFC critical section.
@@ -119,15 +123,18 @@ typedef enum FileCacheBlockState
typedef struct FileCacheEntry
{
BufferTag key;
uint32 hash;
uint32 offset;
uint32 access_count;
dlist_node list_node; /* LRU/holes list node */
uint32 state[FLEXIBLE_ARRAY_MEMBER]; /* two bits per block */
dlist_node list_node; /* LRU list node */
uint32 state[(BLOCKS_PER_CHUNK * 2 + 31) / 32]; /* two bits per block */
} FileCacheEntry;
#define FILE_CACHE_ENRTY_SIZE MAXALIGN(offsetof(FileCacheEntry, state) + (lfc_blocks_per_chunk*2+31)/32*4)
/* Todo: alignment must be the same too */
StaticAssertDecl(sizeof(FileCacheEntry) == sizeof(RustFileCacheEntry),
"Rust and C declarations of FileCacheEntry are incompatible");
StaticAssertDecl(sizeof(BufferTag) == sizeof(RustFileCacheKey),
"Rust and C declarations of FileCacheKey are incompatible");
#define GET_STATE(entry, i) (((entry)->state[(i) / 16] >> ((i) % 16 * 2)) & 3)
#define SET_STATE(entry, i, new_state) (entry)->state[(i) / 16] = ((entry)->state[(i) / 16] & ~(3 << ((i) % 16 * 2))) | ((new_state) << ((i) % 16 * 2))
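A worked example of the two-bit packing these macros implement (block number chosen for illustration):

/* Each uint32 word holds the 2-bit states of 16 blocks. For block i = 21:
 *   word index = 21 / 16 = 1,   bit offset = (21 % 16) * 2 = 10
 * so GET_STATE(entry, 21) evaluates (entry->state[1] >> 10) & 3, and
 * SET_STATE(entry, 21, s) clears bits 10-11 of state[1] and ORs in s << 10. */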
@@ -136,6 +143,9 @@ typedef struct FileCacheEntry
#define MAX_PREWARM_WORKERS 8
#define FREELIST_ENTRIES_PER_CHUNK (BLOCKS_PER_CHUNK * BLCKSZ / sizeof(uint32) - 2)
typedef struct PrewarmWorkerState
{
uint32 prewarmed_pages;
@@ -161,7 +171,6 @@ typedef struct FileCacheControl
uint64 evicted_pages; /* number of evicted pages */
dlist_head lru; /* double linked list for LRU replacement
* algorithm */
dlist_head holes; /* double linked list of punched holes */
HyperLogLogState wss_estimation; /* estimation of working set size */
ConditionVariable cv[N_COND_VARS]; /* turnstile of condition variables */
PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS];
@@ -172,23 +181,39 @@ typedef struct FileCacheControl
bool prewarm_active;
bool prewarm_canceled;
dsm_handle prewarm_lfc_state_handle;
/*
* Free list. This is large enough to hold one chunk's worth of entries.
*/
uint32 freelist_size;
uint32 freelist_head;
uint32 num_free_pages;
uint32 free_pages[FREELIST_ENTRIES_PER_CHUNK];
} FileCacheControl;
typedef struct FreeListChunk
{
uint32 next;
uint32 num_free_pages;
uint32 free_pages[FREELIST_ENTRIES_PER_CHUNK];
} FreeListChunk;
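A sizing sketch for FREELIST_ENTRIES_PER_CHUNK, assuming BLOCKS_PER_CHUNK = 128 as defined above and the default BLCKSZ of 8192:

/* One freelist chunk occupies BLOCKS_PER_CHUNK * BLCKSZ = 128 * 8192 bytes
 * = 1 MiB, i.e. 262144 uint32 slots. FreeListChunk spends two of them on
 * 'next' and 'num_free_pages', leaving 262142 slots for free_pages entries. */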
#define FILE_CACHE_STATE_MAGIC 0xfcfcfcfc
#define FILE_CACHE_STATE_BITMAP(fcs) ((uint8*)&(fcs)->chunks[(fcs)->n_chunks])
#define FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_chunks) (sizeof(FileCacheState) + (n_chunks)*sizeof(BufferTag) + (((n_chunks) * lfc_blocks_per_chunk)+7)/8)
#define FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_chunks) (sizeof(FileCacheState) + (n_chunks)*sizeof(BufferTag) + (((n_chunks) * BLOCKS_PER_CHUNK)+7)/8)
#define FILE_CACHE_STATE_SIZE(fcs) (sizeof(FileCacheState) + (fcs->n_chunks)*sizeof(BufferTag) + (((fcs->n_chunks) << fcs->chunk_size_log)+7)/8)
static HTAB *lfc_hash;
static FileCacheHashMapHandle lfc_hash_handle;
static FileCacheHashMapAccess lfc_hash;
static int lfc_desc = -1;
static LWLockId lfc_lock;
static LWLockId lfc_freelist_lock;
static int lfc_max_size;
static int lfc_size_limit;
static int lfc_prewarm_limit;
static int lfc_prewarm_batch;
static int lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG;
static int lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK;
static int lfc_blocks_per_chunk_ro = BLOCKS_PER_CHUNK;
static char *lfc_path;
static uint64 lfc_generation;
static FileCacheControl *lfc_ctl;
@@ -205,6 +230,11 @@ bool AmPrewarmWorker;
#define LFC_ENABLED() (lfc_ctl->limit != 0)
static bool freelist_push(uint32 offset);
static bool freelist_prepare_pop(void);
static uint32 freelist_pop(void);
static bool freelist_is_empty(void);
/*
* Close LFC file if opened.
* All backends should close their LFC files once LFC is disabled.
@@ -232,15 +262,9 @@ lfc_switch_off(void)
if (LFC_ENABLED())
{
HASH_SEQ_STATUS status;
FileCacheEntry *entry;
/* Invalidate hash */
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
{
hash_search_with_hash_value(lfc_hash, &entry->key, entry->hash, HASH_REMOVE, NULL);
}
file_cache_hash_reset(lfc_hash);
lfc_ctl->generation += 1;
lfc_ctl->size = 0;
lfc_ctl->pinned = 0;
@@ -248,7 +272,9 @@ lfc_switch_off(void)
lfc_ctl->used_pages = 0;
lfc_ctl->limit = 0;
dlist_init(&lfc_ctl->lru);
dlist_init(&lfc_ctl->holes);
lfc_ctl->freelist_head = INVALID_OFFSET;
lfc_ctl->num_free_pages = 0;
/*
* We need to use unlink to avoid races in LFC write, because it is not
@@ -317,8 +343,8 @@ lfc_ensure_opened(void)
static void
lfc_shmem_startup(void)
{
size_t size;
bool found;
static HASHCTL info;
if (prev_shmem_startup_hook)
{
@@ -327,27 +353,29 @@ lfc_shmem_startup(void)
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
size = sizeof(FileCacheControl);
lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", size, &found);
if (!found)
{
int fd;
uint32 n_chunks = SIZE_MB_TO_CHUNKS(lfc_max_size);
lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
info.keysize = sizeof(BufferTag);
info.entrysize = FILE_CACHE_ENRTY_SIZE;
lfc_freelist_lock = (LWLockId) GetNamedLWLockTranche("lfc_freelist_lock");
/*
* n_chunks+1 because we add new element to hash table before eviction
* of victim
*/
lfc_hash = ShmemInitHash("lfc_hash",
n_chunks + 1, n_chunks + 1,
&info,
HASH_ELEM | HASH_BLOBS);
memset(lfc_ctl, 0, sizeof(FileCacheControl));
lfc_hash_handle = file_cache_hash_shmem_init(n_chunks + 1, n_chunks + 1);
memset(lfc_ctl, 0, offsetof(FileCacheControl, free_pages));
dlist_init(&lfc_ctl->lru);
dlist_init(&lfc_ctl->holes);
lfc_ctl->freelist_size = FREELIST_ENTRIES_PER_CHUNK;
lfc_ctl->freelist_head = INVALID_OFFSET;
lfc_ctl->num_free_pages = 0;
/* Initialize hyper-log-log structure for estimating working set size */
initSHLL(&lfc_ctl->wss_estimation);
@@ -371,18 +399,25 @@ lfc_shmem_startup(void)
}
LWLockRelease(AddinShmemInitLock);
lfc_hash = file_cache_hash_shmem_access(lfc_hash_handle);
}
static void
lfc_shmem_request(void)
{
size_t size;
#if PG_VERSION_NUM>=150000
if (prev_shmem_request_hook)
prev_shmem_request_hook();
#endif
RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, FILE_CACHE_ENRTY_SIZE));
size = sizeof(FileCacheControl);
RequestAddinShmemSpace(size);
RequestNamedLWLockTranche("lfc_lock", 1);
RequestNamedLWLockTranche("lfc_freelist_lock", 2);
}
static bool
@@ -398,24 +433,6 @@ is_normal_backend(void)
return lfc_ctl && MyProc && UsedShmemSegAddr && !IsParallelWorker();
}
static bool
lfc_check_chunk_size(int *newval, void **extra, GucSource source)
{
if (*newval & (*newval - 1))
{
elog(ERROR, "LFC chunk size should be power of two");
return false;
}
return true;
}
static void
lfc_change_chunk_size(int newval, void* extra)
{
lfc_chunk_size_log = pg_ceil_log2_32(newval);
}
static bool
lfc_check_limit_hook(int *newval, void **extra, GucSource source)
{
@@ -435,12 +452,14 @@ lfc_change_limit_hook(int newval, void *extra)
if (!lfc_ctl || !is_normal_backend())
return;
LWLockAcquire(lfc_freelist_lock, LW_EXCLUSIVE);
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
/* Open LFC file only if LFC was enabled or we are going to reenable it */
if (newval == 0 && !LFC_ENABLED())
{
LWLockRelease(lfc_lock);
LWLockRelease(lfc_freelist_lock);
/* File should be reopened if LFC is reenabled */
lfc_close_file();
return;
@@ -449,6 +468,7 @@ lfc_change_limit_hook(int newval, void *extra)
if (!lfc_ensure_opened())
{
LWLockRelease(lfc_lock);
LWLockRelease(lfc_freelist_lock);
return;
}
@@ -464,35 +484,30 @@ lfc_change_limit_hook(int newval, void *extra)
* returning their space to file system
*/
FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
FileCacheEntry *hole;
uint32 offset = victim->offset;
uint32 hash;
bool found;
BufferTag holetag;
CriticalAssert(victim->access_count == 0);
#ifdef FALLOC_FL_PUNCH_HOLE
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * lfc_blocks_per_chunk * BLCKSZ, lfc_blocks_per_chunk * BLCKSZ) < 0)
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
neon_log(LOG, "Failed to punch hole in file: %m");
#endif
/* We remove the old entry, and re-enter a hole to the hash table */
for (int i = 0; i < lfc_blocks_per_chunk; i++)
/* We remove the entry, and push the hole onto the freelist */
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
{
bool is_page_cached = GET_STATE(victim, i) == AVAILABLE;
lfc_ctl->used_pages -= is_page_cached;
lfc_ctl->evicted_pages += is_page_cached;
}
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
file_cache_hash_remove_entry(lfc_hash, victim);
memset(&holetag, 0, sizeof(holetag));
holetag.blockNum = offset;
hash = get_hash_value(lfc_hash, &holetag);
hole = hash_search_with_hash_value(lfc_hash, &holetag, hash, HASH_ENTER, &found);
hole->hash = hash;
hole->offset = offset;
hole->access_count = 0;
CriticalAssert(!found);
dlist_push_tail(&lfc_ctl->holes, &hole->list_node);
if (!freelist_push(offset))
{
/* freelist_push already logged the error */
lfc_switch_off();
LWLockRelease(lfc_lock);
LWLockRelease(lfc_freelist_lock);
return;
}
lfc_ctl->used -= 1;
}
@@ -504,6 +519,7 @@ lfc_change_limit_hook(int newval, void *extra)
neon_log(DEBUG1, "set local file cache limit to %d", new_size);
LWLockRelease(lfc_lock);
LWLockRelease(lfc_freelist_lock);
}
void
@@ -579,14 +595,14 @@ lfc_init(void)
DefineCustomIntVariable("neon.file_cache_chunk_size",
"LFC chunk size in blocks (should be power of two)",
NULL,
&lfc_blocks_per_chunk,
MAX_BLOCKS_PER_CHUNK,
1,
MAX_BLOCKS_PER_CHUNK,
PGC_POSTMASTER,
&lfc_blocks_per_chunk_ro,
BLOCKS_PER_CHUNK,
BLOCKS_PER_CHUNK,
BLOCKS_PER_CHUNK,
PGC_INTERNAL,
GUC_UNIT_BLOCKS,
lfc_check_chunk_size,
lfc_change_chunk_size,
NULL,
NULL,
NULL);
DefineCustomIntVariable("neon.file_cache_prewarm_limit",
@@ -649,19 +665,19 @@ lfc_get_state(size_t max_entries)
fcs = (FileCacheState*)palloc0(state_size);
SET_VARSIZE(fcs, state_size);
fcs->magic = FILE_CACHE_STATE_MAGIC;
fcs->chunk_size_log = lfc_chunk_size_log;
fcs->chunk_size_log = BLOCKS_PER_CHUNK_LOG;
fcs->n_chunks = n_entries;
bitmap = FILE_CACHE_STATE_BITMAP(fcs);
dlist_reverse_foreach(iter, &lfc_ctl->lru)
{
FileCacheEntry *entry = dlist_container(FileCacheEntry, list_node, iter.cur);
fcs->chunks[i] = entry->key;
for (int j = 0; j < lfc_blocks_per_chunk; j++)
fcs->chunks[i] = *file_cache_hash_get_key_for_entry(lfc_hash, entry);
for (int j = 0; j < BLOCKS_PER_CHUNK; j++)
{
if (GET_STATE(entry, j) != UNAVAILABLE)
{
BITMAP_SET(bitmap, i*lfc_blocks_per_chunk + j);
BITMAP_SET(bitmap, i*BLOCKS_PER_CHUNK + j);
n_pages += 1;
}
}
@@ -670,7 +686,7 @@ lfc_get_state(size_t max_entries)
}
Assert(i == n_entries);
fcs->n_pages = n_pages;
Assert(pg_popcount((char*)bitmap, ((n_entries << lfc_chunk_size_log) + 7)/8) == n_pages);
Assert(pg_popcount((char*)bitmap, ((n_entries << BLOCKS_PER_CHUNK_LOG) + 7)/8) == n_pages);
elog(LOG, "LFC: save state of %d chunks %d pages", (int)n_entries, (int)n_pages);
}
@@ -726,7 +742,7 @@ lfc_prewarm(FileCacheState* fcs, uint32 n_workers)
}
fcs_chunk_size_log = fcs->chunk_size_log;
if (fcs_chunk_size_log > MAX_BLOCKS_PER_CHUNK_LOG)
if (fcs_chunk_size_log > BLOCKS_PER_CHUNK_LOG)
{
elog(ERROR, "LFC: Invalid chunk size log: %u", fcs->chunk_size_log);
}
@@ -945,7 +961,7 @@ lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
{
BufferTag tag;
FileCacheEntry *entry;
uint32 hash;
uint64 hash;
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return;
@@ -958,14 +974,14 @@ lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
if (LFC_ENABLED())
{
for (BlockNumber blkno = 0; blkno < nblocks; blkno += lfc_blocks_per_chunk)
for (BlockNumber blkno = 0; blkno < nblocks; blkno += BLOCKS_PER_CHUNK)
{
tag.blockNum = blkno;
hash = get_hash_value(lfc_hash, &tag);
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
entry = file_cache_hash_find(lfc_hash, &tag, hash);
if (entry != NULL)
{
for (int i = 0; i < lfc_blocks_per_chunk; i++)
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
{
if (GET_STATE(entry, i) == AVAILABLE)
{
@@ -990,7 +1006,7 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
FileCacheEntry *entry;
int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);
bool found = false;
uint32 hash;
uint64 hash;
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return false;
@@ -1000,12 +1016,12 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
tag.blockNum = blkno - chunk_offs;
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
hash = get_hash_value(lfc_hash, &tag);
hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
LWLockAcquire(lfc_lock, LW_SHARED);
if (LFC_ENABLED())
{
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
entry = file_cache_hash_find(lfc_hash, &tag, hash);
found = entry != NULL && GET_STATE(entry, chunk_offs) != UNAVAILABLE;
}
LWLockRelease(lfc_lock);
@@ -1024,7 +1040,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
FileCacheEntry *entry;
uint32 chunk_offs;
int found = 0;
uint32 hash;
uint64 hash;
int i = 0;
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
@@ -1037,7 +1053,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);
tag.blockNum = blkno - chunk_offs;
hash = get_hash_value(lfc_hash, &tag);
hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
LWLockAcquire(lfc_lock, LW_SHARED);
@@ -1048,12 +1064,12 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
}
while (true)
{
int this_chunk = Min(nblocks - i, lfc_blocks_per_chunk - chunk_offs);
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
int this_chunk = Min(nblocks - i, BLOCKS_PER_CHUNK - chunk_offs);
entry = file_cache_hash_find(lfc_hash, &tag, hash);
if (entry != NULL)
{
for (; chunk_offs < lfc_blocks_per_chunk && i < nblocks; chunk_offs++, i++)
for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++)
{
if (GET_STATE(entry, chunk_offs) != UNAVAILABLE)
{
@@ -1079,7 +1095,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
*/
chunk_offs = BLOCK_TO_CHUNK_OFF(blkno + i);
tag.blockNum = (blkno + i) - chunk_offs;
hash = get_hash_value(lfc_hash, &tag);
hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
}
LWLockRelease(lfc_lock);
@@ -1128,7 +1144,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
BufferTag tag;
FileCacheEntry *entry;
ssize_t rc;
uint32 hash;
uint64 hash;
uint64 generation;
uint32 entry_offset;
int blocks_read = 0;
@@ -1154,9 +1170,9 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
while (nblocks > 0)
{
struct iovec iov[PG_IOV_MAX];
uint8 chunk_mask[MAX_BLOCKS_PER_CHUNK / 8] = {0};
uint8 chunk_mask[BLOCKS_PER_CHUNK / 8] = {0};
int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);
int blocks_in_chunk = Min(nblocks, lfc_blocks_per_chunk - chunk_offs);
int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs);
int iteration_hits = 0;
int iteration_misses = 0;
uint64 io_time_us = 0;
@@ -1206,7 +1222,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
Assert(iov_last_used - first_block_in_chunk_read >= n_blocks_to_read);
tag.blockNum = blkno - chunk_offs;
hash = get_hash_value(lfc_hash, &tag);
hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
cv = &lfc_ctl->cv[hash % N_COND_VARS];
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -1219,13 +1235,13 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
return blocks_read;
}
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
entry = file_cache_hash_find(lfc_hash, &tag, hash);
/* Approximate working set for the blocks assumed in this entry */
for (int i = 0; i < blocks_in_chunk; i++)
{
tag.blockNum = blkno + i;
addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
addSHLL(&lfc_ctl->wss_estimation, file_cache_hash_get_hash_value(lfc_hash, &tag));
}
if (entry == NULL)
@@ -1296,7 +1312,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
if (iteration_hits != 0)
{
/* chunk offset (# of pages) into the LFC file */
off_t first_read_offset = (off_t) entry_offset * lfc_blocks_per_chunk;
off_t first_read_offset = (off_t) entry_offset * BLOCKS_PER_CHUNK;
int nwrite = iov_last_used - first_block_in_chunk_read;
/* offset of first IOV */
first_read_offset += chunk_offs + first_block_in_chunk_read;
@@ -1373,14 +1389,14 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
* Returns false if there are no unpinned entries and chunk can not be added.
*/
static bool
lfc_init_new_entry(FileCacheEntry* entry, uint32 hash)
lfc_init_new_entry(FileCacheEntry *entry)
{
/*-----------
* If the chunk wasn't already in the LFC then we have these
* options, in order of preference:
*
* Unless there is no space available, we can:
* 1. Use an entry from the `holes` list, and
* 1. Use an entry from the freelist, and
* 2. Create a new entry.
* We can always, regardless of space in the LFC:
* 3. evict an entry from LRU, and
@@ -1388,17 +1404,10 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash)
*/
if (lfc_ctl->used < lfc_ctl->limit)
{
if (!dlist_is_empty(&lfc_ctl->holes))
if (!freelist_is_empty())
{
/* We can reuse a hole that was left behind when the LFC was shrunk previously */
FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node,
dlist_pop_head_node(&lfc_ctl->holes));
uint32 offset = hole->offset;
bool hole_found;
hash_search_with_hash_value(lfc_hash, &hole->key,
hole->hash, HASH_REMOVE, &hole_found);
CriticalAssert(hole_found);
uint32 offset = freelist_pop();
lfc_ctl->used += 1;
entry->offset = offset; /* reuse the hole */
@@ -1427,7 +1436,7 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash)
FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node,
dlist_pop_head_node(&lfc_ctl->lru));
for (int i = 0; i < lfc_blocks_per_chunk; i++)
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
{
bool is_page_cached = GET_STATE(victim, i) == AVAILABLE;
lfc_ctl->used_pages -= is_page_cached;
@@ -1436,24 +1445,21 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash)
CriticalAssert(victim->access_count == 0);
entry->offset = victim->offset; /* grab victim's chunk */
hash_search_with_hash_value(lfc_hash, &victim->key,
victim->hash, HASH_REMOVE, NULL);
file_cache_hash_remove_entry(lfc_hash, victim);
neon_log(DEBUG2, "Swap file cache page");
}
else
{
/* Can't add this chunk - we don't have the space for it */
hash_search_with_hash_value(lfc_hash, &entry->key, hash,
HASH_REMOVE, NULL);
file_cache_hash_remove_entry(lfc_hash, entry);
lfc_ctl->prewarm_canceled = true; /* cancel prewarm if LFC limit is reached */
return false;
}
entry->access_count = 1;
entry->hash = hash;
lfc_ctl->pinned += 1;
for (int i = 0; i < lfc_blocks_per_chunk; i++)
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
SET_STATE(entry, i, UNAVAILABLE);
return true;
@@ -1490,7 +1496,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
FileCacheEntry *entry;
ssize_t rc;
bool found;
uint32 hash;
uint64 hash;
uint64 generation;
uint32 entry_offset;
instr_time io_start, io_end;
@@ -1509,9 +1515,10 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
tag.blockNum = blkno - chunk_offs;
hash = get_hash_value(lfc_hash, &tag);
hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
cv = &lfc_ctl->cv[hash % N_COND_VARS];
retry:
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
if (!LFC_ENABLED() || !lfc_ensure_opened())
@@ -1520,6 +1527,9 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
return false;
}
if (!freelist_prepare_pop())
goto retry;
lwlsn = neon_get_lwlsn(rinfo, forknum, blkno);
if (lwlsn > lsn)
@@ -1530,12 +1540,12 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
return false;
}
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
entry = file_cache_hash_enter(lfc_hash, &tag, hash, &found);
if (lfc_prewarm_update_ws_estimation)
{
tag.blockNum = blkno;
addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
addSHLL(&lfc_ctl->wss_estimation, file_cache_hash_get_hash_value(lfc_hash, &tag));
}
if (found)
{
@@ -1557,7 +1567,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
}
else
{
if (!lfc_init_new_entry(entry, hash))
if (!lfc_init_new_entry(entry))
{
/*
* We can't process this chunk due to lack of space in LFC,
@@ -1578,7 +1588,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
INSTR_TIME_SET_CURRENT(io_start);
rc = pwrite(lfc_desc, buffer, BLCKSZ,
((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ);
((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
INSTR_TIME_SET_CURRENT(io_end);
pgstat_report_wait_end();
@@ -1640,7 +1650,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
FileCacheEntry *entry;
ssize_t rc;
bool found;
uint32 hash;
uint64 hash;
uint64 generation;
uint32 entry_offset;
int buf_offset = 0;
@@ -1653,6 +1663,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
retry:
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
if (!LFC_ENABLED() || !lfc_ensure_opened())
@@ -1662,6 +1673,9 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
}
generation = lfc_ctl->generation;
if (!freelist_prepare_pop())
goto retry;
/*
* For every chunk that has blocks we're interested in, we
* 1. get the chunk header
@@ -1675,7 +1689,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
{
struct iovec iov[PG_IOV_MAX];
int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);
int blocks_in_chunk = Min(nblocks, lfc_blocks_per_chunk - chunk_offs);
int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs);
instr_time io_start, io_end;
ConditionVariable* cv;
@@ -1688,16 +1702,16 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
}
tag.blockNum = blkno - chunk_offs;
hash = get_hash_value(lfc_hash, &tag);
hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
cv = &lfc_ctl->cv[hash % N_COND_VARS];
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
entry = file_cache_hash_enter(lfc_hash, &tag, hash, &found);
/* Approximate working set for the blocks assumed in this entry */
for (int i = 0; i < blocks_in_chunk; i++)
{
tag.blockNum = blkno + i;
addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
addSHLL(&lfc_ctl->wss_estimation, file_cache_hash_get_hash_value(lfc_hash, &tag));
}
if (found)
@@ -1714,7 +1728,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
}
else
{
if (!lfc_init_new_entry(entry, hash))
if (!lfc_init_new_entry(entry))
{
/*
* We can't process this chunk due to lack of space in LFC,
@@ -1763,7 +1777,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
INSTR_TIME_SET_CURRENT(io_start);
rc = pwritev(lfc_desc, iov, blocks_in_chunk,
((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ);
((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
INSTR_TIME_SET_CURRENT(io_end);
pgstat_report_wait_end();
@@ -1823,6 +1837,140 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
LWLockRelease(lfc_lock);
}
/**** freelist management ****/
/*
 * Prerequisites:
 * - The caller is holding 'lfc_lock'.
 *
 * Returns true if 'lfc_lock' was held throughout (the in-memory freelist was
 * already populated, or there is nothing left on disk to load). Returns false
 * if the locks were temporarily released to refill the freelist from disk;
 * the caller must then retry from scratch.
 */
static bool
freelist_prepare_pop(void)
{
/*
 * If the in-memory freelist is empty, but the on-disk chain has more
 * entries, load the next chunk of them.
 *
 * TODO: if there
 */
if (lfc_ctl->num_free_pages == 0 && lfc_ctl->freelist_head != INVALID_OFFSET)
{
uint32 freelist_head;
FreeListChunk *freelist_chunk;
size_t bytes_read;
LWLockRelease(lfc_lock);
LWLockAcquire(lfc_freelist_lock, LW_EXCLUSIVE);
if (!(lfc_ctl->num_free_pages == 0 && lfc_ctl->freelist_head != INVALID_OFFSET))
{
/* someone else did the work for us while we were not holding the lock */
LWLockRelease(lfc_freelist_lock);
return false;
}
freelist_head = lfc_ctl->freelist_head;
freelist_chunk = palloc(BLOCKS_PER_CHUNK * BLCKSZ);
bytes_read = 0;
while (bytes_read < BLOCKS_PER_CHUNK * BLCKSZ)
{
ssize_t rc;
rc = pread(lfc_desc, (char *) freelist_chunk + bytes_read, BLOCKS_PER_CHUNK * BLCKSZ - bytes_read, (off_t) freelist_head * BLOCKS_PER_CHUNK * BLCKSZ + bytes_read);
if (rc < 0)
{
lfc_disable("read freelist page");
return false;
}
bytes_read += rc;
}
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
if (lfc_generation != lfc_ctl->generation)
{
LWLockRelease(lfc_lock);
LWLockRelease(lfc_freelist_lock);
return false;
}
Assert(lfc_ctl->freelist_head == freelist_head);
Assert(lfc_ctl->num_free_pages == 0);
lfc_ctl->freelist_head = freelist_chunk->next;
lfc_ctl->num_free_pages = freelist_chunk->num_free_pages;
memcpy(lfc_ctl->free_pages, freelist_chunk->free_pages, lfc_ctl->num_free_pages * sizeof(uint32));
pfree(freelist_chunk);
LWLockRelease(lfc_lock);
LWLockRelease(lfc_freelist_lock);
return false;
}
return true;
}
/*
* Prerequisites:
* - The caller is holding 'lfc_lock' and 'lfc_freelist_lock'.
*
* Returns 'false' on error.
*/
static bool
freelist_push(uint32 offset)
{
Assert(lfc_ctl->freelist_size == FREELIST_ENTRIES_PER_CHUNK);
if (lfc_ctl->num_free_pages == lfc_ctl->freelist_size)
{
FreeListChunk *freelist_chunk;
struct iovec iov;
ssize_t rc;
freelist_chunk = palloc(BLOCKS_PER_CHUNK * BLCKSZ);
/* write the existing entries to the chunk on disk */
freelist_chunk->next = lfc_ctl->freelist_head;
freelist_chunk->num_free_pages = lfc_ctl->num_free_pages;
memcpy(freelist_chunk->free_pages, lfc_ctl->free_pages, lfc_ctl->num_free_pages * sizeof(uint32));
/* Use the passed-in offset to hold the freelist chunk itself */
iov.iov_base = freelist_chunk;
iov.iov_len = BLOCKS_PER_CHUNK * BLCKSZ;
rc = pg_pwritev_with_retry(lfc_desc, &iov, 1, (off_t) offset * BLOCKS_PER_CHUNK * BLCKSZ);
pfree(freelist_chunk);
if (rc < 0)
return false;
lfc_ctl->freelist_head = offset;
lfc_ctl->num_free_pages = 0;
}
else
{
lfc_ctl->free_pages[lfc_ctl->num_free_pages] = offset;
lfc_ctl->num_free_pages++;
}
return true;
}
static uint32
freelist_pop(void)
{
uint32 result;
/* The caller should've checked that the list is not empty */
Assert(lfc_ctl->num_free_pages > 0);
result = lfc_ctl->free_pages[lfc_ctl->num_free_pages - 1];
lfc_ctl->num_free_pages--;
return result;
}
static bool
freelist_is_empty(void)
{
return lfc_ctl->num_free_pages == 0;
}
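A caller-side sketch of how these primitives combine; this mirrors the retry pattern used by lfc_prefetch and lfc_writev above:

uint32 offset;
retry:
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
if (!freelist_prepare_pop())
	goto retry;                  /* lfc_lock was released; start over */
if (!freelist_is_empty())
	offset = freelist_pop();     /* safe: lfc_lock is still held */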
typedef struct
{
TupleDesc tupdesc;
@@ -1919,7 +2067,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
break;
case 8:
key = "file_cache_chunk_size_pages";
value = lfc_blocks_per_chunk;
value = BLOCKS_PER_CHUNK;
break;
case 9:
key = "file_cache_chunks_pinned";
@@ -1990,7 +2138,6 @@ local_cache_pages(PG_FUNCTION_ARGS)
if (SRF_IS_FIRSTCALL())
{
HASH_SEQ_STATUS status;
FileCacheEntry *entry;
uint32 n_pages = 0;
@@ -2046,15 +2193,16 @@ local_cache_pages(PG_FUNCTION_ARGS)
if (LFC_ENABLED())
{
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
uint32 num_buckets = file_cache_hash_get_num_buckets(lfc_hash);
for (uint32 pos = 0; pos < num_buckets; pos++)
{
/* Skip hole tags */
if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
{
for (int i = 0; i < lfc_blocks_per_chunk; i++)
n_pages += GET_STATE(entry, i) == AVAILABLE;
}
entry = file_cache_hash_get_at_pos(lfc_hash, pos);
if (entry == NULL)
continue;
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
n_pages += GET_STATE(entry, i) == AVAILABLE;
}
}
}
@@ -2076,25 +2224,28 @@ local_cache_pages(PG_FUNCTION_ARGS)
* in the fctx->record structure.
*/
uint32 n = 0;
uint32 num_buckets = file_cache_hash_get_num_buckets(lfc_hash);
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
for (uint32 pos = 0; pos < num_buckets; pos++)
{
for (int i = 0; i < lfc_blocks_per_chunk; i++)
entry = file_cache_hash_get_at_pos(lfc_hash, pos);
if (entry == NULL)
continue;
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
{
if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
const BufferTag *key = file_cache_hash_get_key_for_entry(lfc_hash, entry);
if (GET_STATE(entry, i) == AVAILABLE)
{
if (GET_STATE(entry, i) == AVAILABLE)
{
fctx->record[n].pageoffs = entry->offset * lfc_blocks_per_chunk + i;
fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
fctx->record[n].forknum = entry->key.forkNum;
fctx->record[n].blocknum = entry->key.blockNum + i;
fctx->record[n].accesscount = entry->access_count;
n += 1;
}
fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i;
fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(*key));
fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(*key));
fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(*key));
fctx->record[n].forknum = key->forkNum;
fctx->record[n].blocknum = key->blockNum + i;
fctx->record[n].accesscount = entry->access_count;
n += 1;
}
}
}

View File

@@ -16,6 +16,7 @@
#if PG_MAJORVERSION_NUM >= 15
#include "access/xlogrecovery.h"
#endif
#include "executor/instrument.h"
#include "replication/logical.h"
#include "replication/logicallauncher.h"
#include "replication/slot.h"
@@ -33,6 +34,7 @@
#include "file_cache.h"
#include "neon.h"
#include "neon_lwlsncache.h"
#include "neon_perf_counters.h"
#include "control_plane_connector.h"
#include "logical_replication_monitor.h"
#include "unstable_extensions.h"
@@ -46,6 +48,13 @@ void _PG_init(void);
static int running_xacts_overflow_policy;
static bool monitor_query_exec_time = false;
static ExecutorStart_hook_type prev_ExecutorStart = NULL;
static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
static void neon_ExecutorStart(QueryDesc *queryDesc, int eflags);
static void neon_ExecutorEnd(QueryDesc *queryDesc);
#if PG_MAJORVERSION_NUM >= 16
static shmem_startup_hook_type prev_shmem_startup_hook;
@@ -470,6 +479,16 @@ _PG_init(void)
0,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"neon.monitor_query_exec_time",
"Collect infortmation about query execution time",
NULL,
&monitor_query_exec_time,
false,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomBoolVariable(
"neon.allow_replica_misconfig",
"Allow replica startup when some critical GUCs have smaller value than on primary node",
@@ -508,6 +527,11 @@ _PG_init(void)
EmitWarningsOnPlaceholders("neon");
ReportSearchPath();
prev_ExecutorStart = ExecutorStart_hook;
ExecutorStart_hook = neon_ExecutorStart;
prev_ExecutorEnd = ExecutorEnd_hook;
ExecutorEnd_hook = neon_ExecutorEnd;
}
PG_FUNCTION_INFO_V1(pg_cluster_size);
@@ -581,3 +605,55 @@ neon_shmem_startup_hook(void)
#endif
}
#endif
/*
* ExecutorStart hook: start up tracking if needed
*/
static void
neon_ExecutorStart(QueryDesc *queryDesc, int eflags)
{
if (prev_ExecutorStart)
prev_ExecutorStart(queryDesc, eflags);
else
standard_ExecutorStart(queryDesc, eflags);
if (monitor_query_exec_time)
{
/*
* Set up to track total elapsed time in ExecutorRun. Make sure the
* space is allocated in the per-query context so it will go away at
* ExecutorEnd.
*/
if (queryDesc->totaltime == NULL)
{
MemoryContext oldcxt;
oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_TIMER, false);
MemoryContextSwitchTo(oldcxt);
}
}
}
/*
* ExecutorEnd hook: store results if needed
*/
static void
neon_ExecutorEnd(QueryDesc *queryDesc)
{
if (monitor_query_exec_time && queryDesc->totaltime)
{
/*
* Make sure stats accumulation is done. (Note: it's okay if several
* levels of hook all do this.)
*/
InstrEndLoop(queryDesc->totaltime);
inc_query_time(queryDesc->totaltime->total*1000000); /* convert to usec */
}
if (prev_ExecutorEnd)
prev_ExecutorEnd(queryDesc);
else
standard_ExecutorEnd(queryDesc);
}
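
Taken together, the two hooks time every query in the per-query memory context and flush the total into the shared histogram at ExecutorEnd. A hedged usage sketch (assumes a reachable Postgres with the neon extension at version 1.5 or later and psycopg2 installed; it mirrors the regression test further down):

import psycopg2

with psycopg2.connect("dbname=postgres") as conn:
    with conn.cursor() as cur:
        cur.execute("SET neon.monitor_query_exec_time = on")
        cur.execute("SELECT 1")  # a timed query
        cur.execute(
            "SELECT value FROM neon_backend_perf_counters "
            "WHERE metric = 'query_time_seconds_count' AND pid = pg_backend_pid()"
        )
        print(cur.fetchone())  # number of timed queries in this backend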

View File

@@ -71,6 +71,27 @@ inc_iohist(IOHistogram hist, uint64 latency_us)
hist->wait_us_count++;
}
static inline void
inc_qthist(QTHistogram hist, uint64 elapsed_us)
{
int lo = 0;
int hi = NUM_QT_BUCKETS - 1;
/* Find the right bucket with binary search */
while (lo < hi)
{
int mid = (lo + hi) / 2;
if (elapsed_us < qt_bucket_thresholds[mid])
hi = mid;
else
lo = mid + 1;
}
hist->elapsed_us_bucket[lo]++;
hist->elapsed_us_sum += elapsed_us;
hist->elapsed_us_count++;
}
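
The binary search lands on the first threshold strictly greater than the sample, so bucket i counts samples in [threshold[i-1], threshold[i]), with the UINT64_MAX sentinel catching everything else. A minimal Python model (illustrative; thresholds copied from the header file below):

import bisect

QT_BUCKET_THRESHOLDS = [
    2, 3, 6, 10, 20, 30, 60, 100,
    200, 300, 600, 1000, 2000, 3000, 6000, 10000,
    20000, 30000, 60000, 100000, 200000, 300000, 600000, 1000000,
    2000000, 3000000, 6000000, 10000000,
    20000000, 30000000, 60000000, 100000000,
    2**64 - 1,  # catch-all, reported as +Inf
]

def qt_bucket(elapsed_us: int) -> int:
    # Same loop as inc_qthist: find the first threshold strictly greater
    # than the sample.
    lo, hi = 0, len(QT_BUCKET_THRESHOLDS) - 1
    while lo < hi:
        mid = (lo + hi) // 2
        if elapsed_us < QT_BUCKET_THRESHOLDS[mid]:
            hi = mid
        else:
            lo = mid + 1
    return lo

# The loop is equivalent to bisect_right, capped at the catch-all bucket:
for v in (0, 2, 999, 10_000_000_000):
    assert qt_bucket(v) == min(bisect.bisect_right(QT_BUCKET_THRESHOLDS, v),
                               len(QT_BUCKET_THRESHOLDS) - 1)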
/*
* Count a GetPage wait operation.
*/
@@ -98,6 +119,13 @@ inc_page_cache_write_wait(uint64 latency)
inc_iohist(&MyNeonCounters->file_cache_write_hist, latency);
}
void
inc_query_time(uint64 elapsed)
{
inc_qthist(&MyNeonCounters->query_time_hist, elapsed);
}
/*
* Support functions for the views, neon_backend_perf_counters and
* neon_perf_counters.
@@ -112,11 +140,11 @@ typedef struct
} metric_t;
static int
histogram_to_metrics(IOHistogram histogram,
metric_t *metrics,
const char *count,
const char *sum,
const char *bucket)
io_histogram_to_metrics(IOHistogram histogram,
metric_t *metrics,
const char *count,
const char *sum,
const char *bucket)
{
int i = 0;
uint64 bucket_accum = 0;
@@ -145,10 +173,44 @@ histogram_to_metrics(IOHistogram histogram,
return i;
}
static int
qt_histogram_to_metrics(QTHistogram histogram,
metric_t *metrics,
const char *count,
const char *sum,
const char *bucket)
{
int i = 0;
uint64 bucket_accum = 0;
metrics[i].name = count;
metrics[i].is_bucket = false;
metrics[i].value = (double) histogram->elapsed_us_count;
i++;
metrics[i].name = sum;
metrics[i].is_bucket = false;
metrics[i].value = (double) histogram->elapsed_us_sum / 1000000.0;
i++;
for (int bucketno = 0; bucketno < NUM_QT_BUCKETS; bucketno++)
{
uint64 threshold = qt_bucket_thresholds[bucketno];
bucket_accum += histogram->elapsed_us_bucket[bucketno];
metrics[i].name = bucket;
metrics[i].is_bucket = true;
metrics[i].bucket_le = (threshold == UINT64_MAX) ? INFINITY : ((double) threshold) / 1000000.0;
metrics[i].value = (double) bucket_accum;
i++;
}
return i;
}
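
As with the I/O histograms, per-bucket counts are accumulated on the way out, so each emitted bucket sample is cumulative (Prometheus "le" semantics), and the microsecond thresholds are reported in seconds with UINT64_MAX mapped to +Inf. A hypothetical Python rendering of the same conversion:

import math

def qt_metrics(bucket_counts, count, total_us, thresholds):
    out = [("query_time_seconds_count", None, float(count)),
           ("query_time_seconds_sum", None, total_us / 1e6)]
    accum = 0
    for threshold, n in zip(thresholds, bucket_counts):
        accum += n  # emitted buckets are cumulative
        le = math.inf if threshold == 2**64 - 1 else threshold / 1e6
        out.append(("query_time_seconds_bucket", le, float(accum)))
    return out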
static metric_t *
neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
{
#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 12)
#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + (2 + NUM_QT_BUCKETS) + 12)
metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t));
int i = 0;
@@ -159,10 +221,10 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
i++; \
} while (false)
i += histogram_to_metrics(&counters->getpage_hist, &metrics[i],
"getpage_wait_seconds_count",
"getpage_wait_seconds_sum",
"getpage_wait_seconds_bucket");
i += io_histogram_to_metrics(&counters->getpage_hist, &metrics[i],
"getpage_wait_seconds_count",
"getpage_wait_seconds_sum",
"getpage_wait_seconds_bucket");
APPEND_METRIC(getpage_prefetch_requests_total);
APPEND_METRIC(getpage_sync_requests_total);
@@ -178,14 +240,19 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
APPEND_METRIC(file_cache_hits_total);
i += histogram_to_metrics(&counters->file_cache_read_hist, &metrics[i],
"file_cache_read_wait_seconds_count",
"file_cache_read_wait_seconds_sum",
"file_cache_read_wait_seconds_bucket");
i += histogram_to_metrics(&counters->file_cache_write_hist, &metrics[i],
"file_cache_write_wait_seconds_count",
"file_cache_write_wait_seconds_sum",
"file_cache_write_wait_seconds_bucket");
i += io_histogram_to_metrics(&counters->file_cache_read_hist, &metrics[i],
"file_cache_read_wait_seconds_count",
"file_cache_read_wait_seconds_sum",
"file_cache_read_wait_seconds_bucket");
i += io_histogram_to_metrics(&counters->file_cache_write_hist, &metrics[i],
"file_cache_write_wait_seconds_count",
"file_cache_write_wait_seconds_sum",
"file_cache_write_wait_seconds_bucket");
i += qt_histogram_to_metrics(&counters->query_time_hist, &metrics[i],
"query_time_seconds_count",
"query_time_seconds_sum",
"query_time_seconds_bucket");
Assert(i == NUM_METRICS);
@@ -257,7 +324,7 @@ neon_get_backend_perf_counters(PG_FUNCTION_ARGS)
}
static inline void
histogram_merge_into(IOHistogram into, IOHistogram from)
io_histogram_merge_into(IOHistogram into, IOHistogram from)
{
into->wait_us_count += from->wait_us_count;
into->wait_us_sum += from->wait_us_sum;
@@ -265,6 +332,15 @@ histogram_merge_into(IOHistogram into, IOHistogram from)
into->wait_us_bucket[bucketno] += from->wait_us_bucket[bucketno];
}
static inline void
qt_histogram_merge_into(QTHistogram into, QTHistogram from)
{
into->elapsed_us_count += from->elapsed_us_count;
into->elapsed_us_sum += from->elapsed_us_sum;
for (int bucketno = 0; bucketno < NUM_QT_BUCKETS; bucketno++)
into->elapsed_us_bucket[bucketno] += from->elapsed_us_bucket[bucketno];
}
PG_FUNCTION_INFO_V1(neon_get_perf_counters);
Datum
neon_get_perf_counters(PG_FUNCTION_ARGS)
@@ -283,7 +359,7 @@ neon_get_perf_counters(PG_FUNCTION_ARGS)
{
neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno];
histogram_merge_into(&totals.getpage_hist, &counters->getpage_hist);
io_histogram_merge_into(&totals.getpage_hist, &counters->getpage_hist);
totals.getpage_prefetch_requests_total += counters->getpage_prefetch_requests_total;
totals.getpage_sync_requests_total += counters->getpage_sync_requests_total;
totals.getpage_prefetch_misses_total += counters->getpage_prefetch_misses_total;
@@ -294,13 +370,13 @@ neon_get_perf_counters(PG_FUNCTION_ARGS)
totals.pageserver_open_requests += counters->pageserver_open_requests;
totals.getpage_prefetches_buffered += counters->getpage_prefetches_buffered;
totals.file_cache_hits_total += counters->file_cache_hits_total;
histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist);
histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist);
totals.compute_getpage_stuck_requests_total += counters->compute_getpage_stuck_requests_total;
totals.compute_getpage_max_inflight_stuck_time_ms = Max(
totals.compute_getpage_max_inflight_stuck_time_ms,
counters->compute_getpage_max_inflight_stuck_time_ms);
io_histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist);
io_histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist);
qt_histogram_merge_into(&totals.query_time_hist, &counters->query_time_hist);
}
metrics = neon_perf_counters_to_metrics(&totals);

View File

@@ -36,6 +36,28 @@ typedef struct IOHistogramData
typedef IOHistogramData *IOHistogram;
static const uint64 qt_bucket_thresholds[] = {
2, 3, 6, 10, /* 0 us - 10 us */
20, 30, 60, 100, /* 10 us - 100 us */
200, 300, 600, 1000, /* 100 us - 1 ms */
2000, 3000, 6000, 10000, /* 1 ms - 10 ms */
20000, 30000, 60000, 100000, /* 10 ms - 100 ms */
200000, 300000, 600000, 1000000, /* 100 ms - 1 s */
2000000, 3000000, 6000000, 10000000, /* 1 s - 10 s */
20000000, 30000000, 60000000, 100000000, /* 10 s - 100 s */
UINT64_MAX,
};
#define NUM_QT_BUCKETS (lengthof(qt_bucket_thresholds))
typedef struct QTHistogramData
{
uint64 elapsed_us_count;
uint64 elapsed_us_sum;
uint64 elapsed_us_bucket[NUM_QT_BUCKETS];
} QTHistogramData;
typedef QTHistogramData *QTHistogram;
typedef struct
{
/*
@@ -127,6 +149,11 @@ typedef struct
/* LFC I/O time buckets */
IOHistogramData file_cache_read_hist;
IOHistogramData file_cache_write_hist;
/*
* Histogram of query execution time.
*/
QTHistogramData query_time_hist;
} neon_per_backend_counters;
/* Pointer to the shared memory array of neon_per_backend_counters structs */
@@ -149,6 +176,7 @@ extern neon_per_backend_counters *neon_per_backend_counters_shared;
extern void inc_getpage_wait(uint64 latency);
extern void inc_page_cache_read_wait(uint64 latency);
extern void inc_page_cache_write_wait(uint64 latency);
extern void inc_query_time(uint64 elapsed);
extern Size NeonPerfCountersShmemSize(void);
extern void NeonPerfCountersShmemInit(void);

View File

@@ -5,6 +5,7 @@
#include "funcapi.h"
#include "miscadmin.h"
#include "access/xlog.h"
#include "utils/tuplestore.h"
#include "neon_pgversioncompat.h"
@@ -41,5 +42,12 @@ InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags)
rsinfo->setDesc = stored_tupdesc;
MemoryContextSwitchTo(old_context);
}
TimeLineID GetWALInsertionTimeLine(void)
{
return ThisTimeLineID + 1;
}
#endif

View File

@@ -162,6 +162,7 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
#if PG_MAJORVERSION_NUM < 15
extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags);
extern TimeLineID GetWALInsertionTimeLine(void);
#endif
#endif /* NEON_PGVERSIONCOMPAT_H */

View File

@@ -69,6 +69,7 @@ struct NeonWALReader
WALSegmentContext segcxt;
WALOpenSegment seg;
int wre_errno;
TimeLineID local_active_tlid;
/* Explains failure to read, static for simplicity. */
char err_msg[NEON_WALREADER_ERR_MSG_LEN];
@@ -106,7 +107,7 @@ struct NeonWALReader
/* palloc and initialize NeonWALReader */
NeonWALReader *
NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix)
NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix, TimeLineID tlid)
{
NeonWALReader *reader;
@@ -118,6 +119,7 @@ NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_
MemoryContextAllocZero(TopMemoryContext, sizeof(NeonWALReader));
reader->available_lsn = available_lsn;
reader->local_active_tlid = tlid;
reader->seg.ws_file = -1;
reader->seg.ws_segno = 0;
reader->seg.ws_tli = 0;
@@ -577,6 +579,17 @@ NeonWALReaderIsRemConnEstablished(NeonWALReader *state)
return state->rem_state == RS_ESTABLISHED;
}
/*
* Returns the locally active timeline ID this reader was allocated with.
*/
TimeLineID
NeonWALReaderLocalActiveTimeLineID(NeonWALReader *state)
{
return state->local_active_tlid;
}
/*
* Returns events user should wait on connection socket or 0 if remote
* connection is not active.

View File

@@ -19,9 +19,10 @@ typedef enum
NEON_WALREAD_ERROR,
} NeonWALReadResult;
extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix);
extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix, TimeLineID tlid);
extern void NeonWALReaderFree(NeonWALReader *state);
extern void NeonWALReaderResetRemote(NeonWALReader *state);
extern TimeLineID NeonWALReaderLocalActiveTimeLineID(NeonWALReader *state);
extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
extern pgsocket NeonWALReaderSocket(NeonWALReader *state);
extern uint32 NeonWALReaderEvents(NeonWALReader *state);

View File

@@ -98,6 +98,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
wp = palloc0(sizeof(WalProposer));
wp->config = config;
wp->api = api;
wp->localTimeLineID = config->pgTimeline;
wp->state = WPS_COLLECTING_TERMS;
wp->mconf.generation = INVALID_GENERATION;
wp->mconf.members.len = 0;
@@ -119,6 +120,10 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
{
wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m");
}
if (*endptr != ':')
{
wp_log(FATAL, "failed to parse neon.safekeepers: no colon after generation");
}
/* Skip past : to the first hostname. */
host = endptr + 1;
}
@@ -1380,7 +1385,7 @@ ProcessPropStartPos(WalProposer *wp)
* we must bail out, as clog and other non rel data is inconsistent.
*/
walprop_shared = wp->api.get_shmem_state(wp);
if (!wp->config->syncSafekeepers)
if (!wp->config->syncSafekeepers && !walprop_shared->replica_promote)
{
/*
* Basebackup LSN always points to the beginning of the record (not

View File

@@ -391,6 +391,7 @@ typedef struct WalproposerShmemState
/* last feedback from each shard */
PageserverFeedback shard_ps_feedback[MAX_SHARDS];
int num_shards;
bool replica_promote;
/* aggregated feedback with min LSNs across shards */
PageserverFeedback min_ps_feedback;
@@ -806,6 +807,9 @@ typedef struct WalProposer
/* Safekeepers walproposer is connecting to. */
Safekeeper safekeeper[MAX_SAFEKEEPERS];
/* Current local TimeLineId in use */
TimeLineID localTimeLineID;
/* WAL has been generated up to this point */
XLogRecPtr availableLsn;

View File

@@ -35,6 +35,7 @@
#include "storage/proc.h"
#include "storage/ipc.h"
#include "storage/lwlock.h"
#include "storage/pg_shmem.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "tcop/tcopprot.h"
@@ -159,12 +160,19 @@ WalProposerMain(Datum main_arg)
{
WalProposer *wp;
if (*wal_acceptors_list == '\0')
{
wpg_log(WARNING, "Safekeepers list is empty");
return;
}
init_walprop_config(false);
walprop_pg_init_bgworker();
am_walproposer = true;
walprop_pg_load_libpqwalreceiver();
wp = WalProposerCreate(&walprop_config, walprop_pg);
wp->localTimeLineID = GetWALInsertionTimeLine();
wp->last_reconnect_attempt = walprop_pg_get_current_timestamp(wp);
walprop_pg_init_walsender();
@@ -272,6 +280,30 @@ split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
return n_safekeepers;
}
static char *split_off_safekeepers_generation(char *safekeepers_list, uint32 *generation)
{
char *endptr;
if (strncmp(safekeepers_list, "g#", 2) != 0)
{
return safekeepers_list;
}
else
{
errno = 0;
*generation = strtoul(safekeepers_list + 2, &endptr, 10);
if (errno != 0)
{
wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m");
}
if (*endptr != ':')
{
wp_log(FATAL, "failed to parse neon.safekeepers: no colon after generation");
}
return endptr + 1;
}
}
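
In other words, a safekeepers list may carry an optional "g#<generation>:" prefix; without it the string is returned unchanged and the generation stays invalid. A minimal Python model (illustrative; the concrete value of INVALID_GENERATION is an assumption here):

INVALID_GENERATION = 0  # assumed sentinel, mirroring the C constant

def split_off_generation(safekeepers: str) -> tuple[int, str]:
    if not safekeepers.startswith("g#"):
        return INVALID_GENERATION, safekeepers
    gen_str, sep, rest = safekeepers[2:].partition(":")
    if not sep or not gen_str.isdigit():
        raise ValueError("failed to parse neon.safekeepers generation")
    return int(gen_str), rest

assert split_off_generation("g#7:sk1:5454,sk2:5454") == (7, "sk1:5454,sk2:5454")
assert split_off_generation("sk1:5454") == (INVALID_GENERATION, "sk1:5454")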
/*
* Accept two comma-separated strings with lists of safekeeper host:port addresses.
* Split them into arrays and return false if the two sets do not match, ignoring order.
@@ -283,6 +315,16 @@ safekeepers_cmp(char *old, char *new)
char *safekeepers_new[MAX_SAFEKEEPERS];
int len_old = 0;
int len_new = 0;
uint32 gen_old = INVALID_GENERATION;
uint32 gen_new = INVALID_GENERATION;
old = split_off_safekeepers_generation(old, &gen_old);
new = split_off_safekeepers_generation(new, &gen_new);
if (gen_old != gen_new)
{
return false;
}
len_old = split_safekeepers_list(old, safekeepers_old);
len_new = split_safekeepers_list(new, safekeepers_new);
@@ -316,6 +358,9 @@ assign_neon_safekeepers(const char *newval, void *extra)
char *newval_copy;
char *oldval;
if (newval && *newval != '\0' && UsedShmemSegAddr && walprop_shared && RecoveryInProgress())
walprop_shared->replica_promote = true;
if (!am_walproposer)
return;
@@ -506,16 +551,15 @@ BackpressureThrottlingTime(void)
/*
* Register a background worker proposing WAL to wal acceptors.
* We start the walproposer bgworker even for replicas in order to support replica promotion.
* When pg_promote() is called, the walproposer bgworker, registered with
* BgWorkerStart_RecoveryFinished, is launched automatically once promotion completes.
*/
static void
walprop_register_bgworker(void)
{
BackgroundWorker bgw;
/* If no wal acceptors are specified, don't start the background worker. */
if (*wal_acceptors_list == '\0')
return;
memset(&bgw, 0, sizeof(bgw));
bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
@@ -1292,9 +1336,7 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
#if PG_VERSION_NUM < 150000
if (ThisTimeLineID == 0)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION")));
ThisTimeLineID = 1;
#endif
/*
@@ -1508,7 +1550,7 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk)
snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port);
Assert(!sk->xlogreader);
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propTermStartLsn, log_prefix);
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propTermStartLsn, log_prefix, sk->wp->localTimeLineID);
if (sk->xlogreader == NULL)
wpg_log(FATAL, "failed to allocate xlog reader");
}
@@ -1522,7 +1564,7 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count,
buf,
startptr,
count,
walprop_pg_get_timeline_id());
sk->wp->localTimeLineID);
if (res == NEON_WALREAD_SUCCESS)
{

View File

@@ -111,7 +111,7 @@ NeonWALPageRead(
readBuf,
targetPagePtr,
count,
walprop_pg_get_timeline_id());
NeonWALReaderLocalActiveTimeLineID(wal_reader));
if (res == NEON_WALREAD_SUCCESS)
{
@@ -202,7 +202,7 @@ NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr)
{
elog(ERROR, "unable to start walsender when basebackupLsn is 0");
}
wal_reader = NeonWALReaderAllocate(wal_segment_size, basebackupLsn, "[walsender] ");
wal_reader = NeonWALReaderAllocate(wal_segment_size, basebackupLsn, "[walsender] ", 1);
}
xlr->page_read = NeonWALPageRead;
xlr->segment_open = NeonWALReadSegmentOpen;

View File

@@ -15,9 +15,9 @@ use crate::context::RequestContext;
use crate::control_plane::client::cplane_proxy_v1;
use crate::control_plane::{self, CachedNodeInfo, NodeInfo};
use crate::error::{ReportableError, UserFacingError};
use crate::pglb::connect_compute::ComputeConnectBackend;
use crate::pqproto::BeMessage;
use crate::proxy::NeonOptions;
use crate::proxy::connect_compute::ComputeConnectBackend;
use crate::stream::PqStream;
use crate::types::RoleName;
use crate::{auth, compute, waiters};

View File

@@ -25,9 +25,9 @@ use crate::control_plane::{
RoleAccessControl,
};
use crate::intern::EndpointIdInt;
use crate::pglb::connect_compute::ComputeConnectBackend;
use crate::pqproto::BeMessage;
use crate::proxy::NeonOptions;
use crate::proxy::connect_compute::ComputeConnectBackend;
use crate::rate_limiter::EndpointRateLimiter;
use crate::stream::Stream;
use crate::types::{EndpointCacheKey, EndpointId, RoleName};

View File

@@ -221,8 +221,7 @@ struct ProxyCliArgs {
is_private_access_proxy: bool,
/// Configure whether all incoming requests have a Proxy Protocol V2 packet.
// TODO(conradludgate): switch default to rejected or required once we've updated all deployments
#[clap(value_enum, long, default_value_t = ProxyProtocolV2::Supported)]
#[clap(value_enum, long, default_value_t = ProxyProtocolV2::Rejected)]
proxy_protocol_v2: ProxyProtocolV2,
/// Time the proxy waits for the webauth session to be confirmed by the control plane.

View File

@@ -39,8 +39,6 @@ pub struct ComputeConfig {
pub enum ProxyProtocolV2 {
/// Connection will error if PROXY protocol v2 header is missing
Required,
/// Connection will parse PROXY protocol v2 header, but accept the connection if it's missing.
Supported,
/// Connection will error if PROXY protocol v2 header is provided
Rejected,
}

View File

@@ -11,10 +11,10 @@ use crate::config::{ProxyConfig, ProxyProtocolV2};
use crate::context::RequestContext;
use crate::error::ReportableError;
use crate::metrics::{Metrics, NumClientConnectionsGuard};
use crate::pglb::connect_compute::{TcpMechanism, connect_to_compute};
use crate::pglb::handshake::{HandshakeData, handshake};
use crate::pglb::passthrough::ProxyPassthrough;
use crate::protocol2::{ConnectHeader, ConnectionInfo, read_proxy_protocol};
use crate::proxy::connect_compute::{TcpMechanism, connect_to_compute};
use crate::proxy::handshake::{HandshakeData, handshake};
use crate::proxy::passthrough::ProxyPassthrough;
use crate::proxy::{
ClientRequestError, ErrorSource, prepare_client_connection, run_until_cancelled,
};
@@ -54,30 +54,24 @@ pub async fn task_main(
debug!(protocol = "tcp", %session_id, "accepted new TCP connection");
connections.spawn(async move {
let (socket, peer_addr) = match read_proxy_protocol(socket).await {
Err(e) => {
error!("per-client task finished with an error: {e:#}");
return;
let (socket, conn_info) = match config.proxy_protocol_v2 {
ProxyProtocolV2::Required => {
match read_proxy_protocol(socket).await {
Err(e) => {
error!("per-client task finished with an error: {e:#}");
return;
}
// our load balancers will not send any more data. let's just exit immediately
Ok((_socket, ConnectHeader::Local)) => {
debug!("healthcheck received");
return;
}
Ok((socket, ConnectHeader::Proxy(info))) => (socket, info),
}
}
// our load balancers will not send any more data. let's just exit immediately
Ok((_socket, ConnectHeader::Local)) => {
debug!("healthcheck received");
return;
}
Ok((_socket, ConnectHeader::Missing))
if config.proxy_protocol_v2 == ProxyProtocolV2::Required =>
{
error!("missing required proxy protocol header");
return;
}
Ok((_socket, ConnectHeader::Proxy(_)))
if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected =>
{
error!("proxy protocol header not supported");
return;
}
Ok((socket, ConnectHeader::Proxy(info))) => (socket, info),
Ok((socket, ConnectHeader::Missing)) => (
// ignore the header - it cannot be confused with a postgres or http
// connection, so it will error later.
ProxyProtocolV2::Rejected => (
socket,
ConnectionInfo {
addr: peer_addr,
@@ -86,7 +80,7 @@ pub async fn task_main(
),
};
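
With only Required and Rejected left in the config enum, the dispatch becomes a straight two-way choice: Required parses the header up front (and bails out on errors and healthchecks), while Rejected never reads it and synthesizes the connection info from the TCP peer address. A hedged Python model of the control flow:

from enum import Enum

class ProxyProtocolV2(Enum):
    REQUIRED = "required"
    REJECTED = "rejected"

def connection_info(mode, read_header, peer_addr):
    # read_header() stands in for read_proxy_protocol(); it may raise, or
    # return None for a local healthcheck connection.
    if mode is ProxyProtocolV2.REQUIRED:
        return read_header()
    return {"addr": peer_addr, "extra": None}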
match socket.inner.set_nodelay(true) {
match socket.set_nodelay(true) {
Ok(()) => {}
Err(e) => {
error!(
@@ -98,7 +92,7 @@ pub async fn task_main(
let ctx = RequestContext::new(
session_id,
peer_addr,
conn_info,
crate::metrics::Protocol::Tcp,
&config.region,
);

View File

@@ -2,7 +2,6 @@ use async_trait::async_trait;
use tokio::time;
use tracing::{debug, info, warn};
use super::retry::ShouldRetryWakeCompute;
use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo};
use crate::compute::{self, COULD_NOT_CONNECT, PostgresConnection};
use crate::config::{ComputeConfig, RetryConfig};
@@ -15,7 +14,7 @@ use crate::metrics::{
ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType,
};
use crate::pqproto::StartupMessageParams;
use crate::proxy::retry::{CouldRetry, retry_after, should_retry};
use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute, retry_after, should_retry};
use crate::proxy::wake_compute::wake_compute;
use crate::types::Host;

View File

@@ -1 +1,5 @@
pub mod connect_compute;
pub mod copy_bidirectional;
pub mod handshake;
pub mod inprocess;
pub mod passthrough;

View File

@@ -53,7 +53,7 @@ pub(crate) async fn proxy_pass(
// Starting from here we only proxy the client's traffic.
debug!("performing the proxy pass...");
let _ = crate::proxy::copy_bidirectional::copy_bidirectional_client_compute(
let _ = crate::pglb::copy_bidirectional::copy_bidirectional_client_compute(
&mut client,
&mut compute,
)

View File

@@ -4,60 +4,13 @@
use core::fmt;
use std::io;
use std::net::{Ipv4Addr, Ipv6Addr, SocketAddr};
use std::pin::Pin;
use std::task::{Context, Poll};
use bytes::{Buf, Bytes, BytesMut};
use pin_project_lite::pin_project;
use bytes::Buf;
use smol_str::SmolStr;
use strum_macros::FromRepr;
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf};
use tokio::io::{AsyncRead, AsyncReadExt};
use zerocopy::{FromBytes, Immutable, KnownLayout, Unaligned, network_endian};
pin_project! {
/// A chained [`AsyncRead`] with [`AsyncWrite`] passthrough
pub(crate) struct ChainRW<T> {
#[pin]
pub(crate) inner: T,
buf: BytesMut,
}
}
impl<T: AsyncWrite> AsyncWrite for ChainRW<T> {
#[inline]
fn poll_write(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
buf: &[u8],
) -> Poll<Result<usize, io::Error>> {
self.project().inner.poll_write(cx, buf)
}
#[inline]
fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
self.project().inner.poll_flush(cx)
}
#[inline]
fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
self.project().inner.poll_shutdown(cx)
}
#[inline]
fn poll_write_vectored(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
bufs: &[io::IoSlice<'_>],
) -> Poll<Result<usize, io::Error>> {
self.project().inner.poll_write_vectored(cx, bufs)
}
#[inline]
fn is_write_vectored(&self) -> bool {
self.inner.is_write_vectored()
}
}
/// Proxy Protocol Version 2 Header
const SIGNATURE: [u8; 12] = [
0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A,
@@ -79,7 +32,6 @@ pub struct ConnectionInfo {
#[derive(PartialEq, Eq, Clone, Debug)]
pub enum ConnectHeader {
Missing,
Local,
Proxy(ConnectionInfo),
}
@@ -106,47 +58,24 @@ pub enum ConnectionInfoExtra {
pub(crate) async fn read_proxy_protocol<T: AsyncRead + Unpin>(
mut read: T,
) -> std::io::Result<(ChainRW<T>, ConnectHeader)> {
let mut buf = BytesMut::with_capacity(128);
let header = loop {
let bytes_read = read.read_buf(&mut buf).await?;
// exit for bad header signature
let len = usize::min(buf.len(), SIGNATURE.len());
if buf[..len] != SIGNATURE[..len] {
return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing));
}
// if no more bytes available then exit
if bytes_read == 0 {
return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing));
}
// check if we have enough bytes to continue
if let Some(header) = buf.try_get::<ProxyProtocolV2Header>() {
break header;
}
};
let remaining_length = usize::from(header.len.get());
while buf.len() < remaining_length {
if read.read_buf(&mut buf).await? == 0 {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
"stream closed while waiting for proxy protocol addresses",
));
}
) -> std::io::Result<(T, ConnectHeader)> {
let mut header = [0; size_of::<ProxyProtocolV2Header>()];
read.read_exact(&mut header).await?;
let header: ProxyProtocolV2Header = zerocopy::transmute!(header);
if header.signature != SIGNATURE {
return Err(std::io::Error::other("invalid proxy protocol header"));
}
let payload = buf.split_to(remaining_length);
let res = process_proxy_payload(header, payload)?;
Ok((ChainRW { inner: read, buf }, res))
let mut payload = vec![0; usize::from(header.len.get())];
read.read_exact(&mut payload).await?;
let res = process_proxy_payload(header, &payload)?;
Ok((read, res))
}
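
The rewrite drops the buffered ChainRW wrapper: since a header is now mandatory whenever this path runs, two read_exact calls suffice, one for the fixed 16-byte header and one for the variable-length payload. A hedged Python sketch of the same fixed-header read (field layout per the PROXY protocol v2 spec; the names are illustrative):

import struct

PP2_SIGNATURE = bytes([0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A,
                       0x51, 0x55, 0x49, 0x54, 0x0A])

def read_ppv2_header(read_exact):
    # read_exact(n) must return exactly n bytes, like tokio's read_exact.
    header = read_exact(16)
    signature, ver_cmd, family, length = struct.unpack("!12sBBH", header)
    if signature != PP2_SIGNATURE:
        raise ValueError("invalid proxy protocol header")
    payload = read_exact(length)  # addresses + TLVs
    return ver_cmd, family, payload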
fn process_proxy_payload(
header: ProxyProtocolV2Header,
mut payload: BytesMut,
mut payload: &[u8],
) -> std::io::Result<ConnectHeader> {
match header.version_and_command {
// the connection was established on purpose by the proxy
@@ -162,13 +91,12 @@ fn process_proxy_payload(
PROXY_V2 => {}
// other values are unassigned and must not be emitted by senders. Receivers
// must drop connections presenting unexpected values here.
#[rustfmt::skip] // https://github.com/rust-lang/rustfmt/issues/6384
_ => return Err(io::Error::other(
format!(
_ => {
return Err(io::Error::other(format!(
"invalid proxy protocol command 0x{:02X}. expected local (0x20) or proxy (0x21)",
header.version_and_command
),
)),
)));
}
}
let size_err =
@@ -206,7 +134,7 @@ fn process_proxy_payload(
}
let subtype = tlv.value.get_u8();
match Pp2AwsType::from_repr(subtype) {
Some(Pp2AwsType::VpceId) => match std::str::from_utf8(&tlv.value) {
Some(Pp2AwsType::VpceId) => match std::str::from_utf8(tlv.value) {
Ok(s) => {
extra = Some(ConnectionInfoExtra::Aws { vpce_id: s.into() });
}
@@ -282,65 +210,28 @@ enum Pp2AzureType {
PrivateEndpointLinkId = 0x01,
}
impl<T: AsyncRead> AsyncRead for ChainRW<T> {
#[inline]
fn poll_read(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
buf: &mut ReadBuf<'_>,
) -> Poll<io::Result<()>> {
if self.buf.is_empty() {
self.project().inner.poll_read(cx, buf)
} else {
self.read_from_buf(buf)
}
}
}
impl<T: AsyncRead> ChainRW<T> {
#[cold]
fn read_from_buf(self: Pin<&mut Self>, buf: &mut ReadBuf<'_>) -> Poll<io::Result<()>> {
debug_assert!(!self.buf.is_empty());
let this = self.project();
let write = usize::min(this.buf.len(), buf.remaining());
let slice = this.buf.split_to(write).freeze();
buf.put_slice(&slice);
// reset the allocation so it can be freed
if this.buf.is_empty() {
*this.buf = BytesMut::new();
}
Poll::Ready(Ok(()))
}
}
#[derive(Debug)]
struct Tlv {
struct Tlv<'a> {
kind: u8,
value: Bytes,
value: &'a [u8],
}
fn read_tlv(b: &mut BytesMut) -> Option<Tlv> {
fn read_tlv<'a>(b: &mut &'a [u8]) -> Option<Tlv<'a>> {
let tlv_header = b.try_get::<TlvHeader>()?;
let len = usize::from(tlv_header.len.get());
if b.len() < len {
return None;
}
Some(Tlv {
kind: tlv_header.kind,
value: b.split_to(len).freeze(),
value: b.split_off(..len)?,
})
}
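
The TLV reader now borrows from the payload slice instead of copying into Bytes: each TLV is a one-byte kind, a big-endian two-byte length, then that many bytes of value, and parsing stops when the buffer runs short. An illustrative Python equivalent:

def read_tlv(buf: memoryview):
    # kind: u8, len: u16 network-endian, then `len` bytes of value.
    if len(buf) < 3:
        return None
    kind = buf[0]
    length = int.from_bytes(buf[1:3], "big")
    if len(buf) < 3 + length:
        return None
    return kind, bytes(buf[3:3 + length]), buf[3 + length:]

kind, value, rest = read_tlv(memoryview(b"\x01\x00\x03abcXYZ"))
assert (kind, value, bytes(rest)) == (1, b"abc", b"XYZ")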
trait BufExt: Sized {
fn try_get<T: FromBytes>(&mut self) -> Option<T>;
}
impl BufExt for BytesMut {
impl BufExt for &[u8] {
fn try_get<T: FromBytes>(&mut self) -> Option<T> {
let (res, _) = T::read_from_prefix(self).ok()?;
self.advance(size_of::<T>());
let (res, rest) = T::read_from_prefix(self).ok()?;
*self = rest;
Some(res)
}
}
@@ -481,27 +372,19 @@ mod tests {
}
#[tokio::test]
#[should_panic = "invalid proxy protocol header"]
async fn test_invalid() {
let data = [0x55; 256];
let (mut read, info) = read_proxy_protocol(data.as_slice()).await.unwrap();
let mut bytes = vec![];
read.read_to_end(&mut bytes).await.unwrap();
assert_eq!(bytes, data);
assert_eq!(info, ConnectHeader::Missing);
read_proxy_protocol(data.as_slice()).await.unwrap();
}
#[tokio::test]
#[should_panic = "early eof"]
async fn test_short() {
let data = [0x55; 10];
let (mut read, info) = read_proxy_protocol(data.as_slice()).await.unwrap();
let mut bytes = vec![];
read.read_to_end(&mut bytes).await.unwrap();
assert_eq!(bytes, data);
assert_eq!(info, ConnectHeader::Missing);
read_proxy_protocol(data.as_slice()).await.unwrap();
}
#[tokio::test]

View File

@@ -1,15 +1,10 @@
#[cfg(test)]
mod tests;
pub(crate) mod connect_compute;
mod copy_bidirectional;
pub(crate) mod handshake;
pub(crate) mod passthrough;
pub(crate) mod retry;
pub(crate) mod wake_compute;
use std::sync::Arc;
pub use copy_bidirectional::{ErrorSource, copy_bidirectional_client_compute};
use futures::FutureExt;
use itertools::Itertools;
use once_cell::sync::OnceCell;
@@ -21,16 +16,17 @@ use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, debug, error, info, warn};
use self::connect_compute::{TcpMechanism, connect_to_compute};
use self::passthrough::ProxyPassthrough;
use crate::cancellation::{self, CancellationHandler};
use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig};
use crate::context::RequestContext;
use crate::error::{ReportableError, UserFacingError};
use crate::metrics::{Metrics, NumClientConnectionsGuard};
use crate::pglb::connect_compute::{TcpMechanism, connect_to_compute};
pub use crate::pglb::copy_bidirectional::{ErrorSource, copy_bidirectional_client_compute};
use crate::pglb::handshake::{HandshakeData, HandshakeError, handshake};
use crate::pglb::passthrough::ProxyPassthrough;
use crate::pqproto::{BeMessage, CancelKeyData, StartupMessageParams};
use crate::protocol2::{ConnectHeader, ConnectionInfo, ConnectionInfoExtra, read_proxy_protocol};
use crate::proxy::handshake::{HandshakeData, handshake};
use crate::rate_limiter::EndpointRateLimiter;
use crate::stream::{PqStream, Stream};
use crate::types::EndpointCacheKey;
@@ -102,30 +98,24 @@ pub async fn task_main(
let endpoint_rate_limiter2 = endpoint_rate_limiter.clone();
connections.spawn(async move {
let (socket, conn_info) = match read_proxy_protocol(socket).await {
Err(e) => {
warn!("per-client task finished with an error: {e:#}");
return;
let (socket, conn_info) = match config.proxy_protocol_v2 {
ProxyProtocolV2::Required => {
match read_proxy_protocol(socket).await {
Err(e) => {
warn!("per-client task finished with an error: {e:#}");
return;
}
// our load balancers will not send any more data. let's just exit immediately
Ok((_socket, ConnectHeader::Local)) => {
debug!("healthcheck received");
return;
}
Ok((socket, ConnectHeader::Proxy(info))) => (socket, info),
}
}
// our load balancers will not send any more data. let's just exit immediately
Ok((_socket, ConnectHeader::Local)) => {
debug!("healthcheck received");
return;
}
Ok((_socket, ConnectHeader::Missing))
if config.proxy_protocol_v2 == ProxyProtocolV2::Required =>
{
warn!("missing required proxy protocol header");
return;
}
Ok((_socket, ConnectHeader::Proxy(_)))
if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected =>
{
warn!("proxy protocol header not supported");
return;
}
Ok((socket, ConnectHeader::Proxy(info))) => (socket, info),
Ok((socket, ConnectHeader::Missing)) => (
// ignore the header - it cannot be confused with a postgres or http
// connection, so it will error later.
ProxyProtocolV2::Rejected => (
socket,
ConnectionInfo {
addr: peer_addr,
@@ -134,7 +124,7 @@ pub async fn task_main(
),
};
match socket.inner.set_nodelay(true) {
match socket.set_nodelay(true) {
Ok(()) => {}
Err(e) => {
error!(
@@ -248,7 +238,7 @@ pub(crate) enum ClientRequestError {
#[error("{0}")]
Cancellation(#[from] cancellation::CancelError),
#[error("{0}")]
Handshake(#[from] handshake::HandshakeError),
Handshake(#[from] HandshakeError),
#[error("{0}")]
HandshakeTimeout(#[from] tokio::time::error::Elapsed),
#[error("{0}")]

View File

@@ -17,7 +17,6 @@ use rustls::pki_types;
use tokio::io::DuplexStream;
use tracing_test::traced_test;
use super::connect_compute::ConnectMechanism;
use super::retry::CouldRetry;
use super::*;
use crate::auth::backend::{
@@ -28,6 +27,7 @@ use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient};
use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status};
use crate::control_plane::{self, CachedNodeInfo, NodeInfo, NodeInfoCache};
use crate::error::ErrorKind;
use crate::pglb::connect_compute::ConnectMechanism;
use crate::tls::client_config::compute_client_config_with_certs;
use crate::tls::postgres_rustls::MakeRustlsConnect;
use crate::tls::server_config::CertResolver;
@@ -173,7 +173,6 @@ async fn dummy_proxy(
tls: Option<TlsConfig>,
auth: impl TestAuth + Send,
) -> anyhow::Result<()> {
let (client, _) = read_proxy_protocol(client).await?;
let mut stream = match handshake(&RequestContext::test(), client, tls.as_ref(), false).await? {
HandshakeData::Startup(stream, _) => stream,
HandshakeData::Cancel(_) => bail!("cancellation not supported"),

View File

@@ -1,6 +1,5 @@
use tracing::{error, info};
use super::connect_compute::ComputeConnectBackend;
use crate::config::RetryConfig;
use crate::context::RequestContext;
use crate::control_plane::CachedNodeInfo;
@@ -9,6 +8,7 @@ use crate::error::ReportableError;
use crate::metrics::{
ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType,
};
use crate::pglb::connect_compute::ComputeConnectBackend;
use crate::proxy::retry::{retry_after, should_retry};
// Use macro to retain original callsite.

View File

@@ -35,7 +35,7 @@ use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError};
use crate::control_plane::locks::ApiLocks;
use crate::error::{ErrorKind, ReportableError, UserFacingError};
use crate::intern::EndpointIdInt;
use crate::proxy::connect_compute::ConnectMechanism;
use crate::pglb::connect_compute::ConnectMechanism;
use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute};
use crate::rate_limiter::EndpointRateLimiter;
use crate::types::{EndpointId, Host, LOCAL_PROXY_SUFFIX};
@@ -182,7 +182,7 @@ impl PoolingBackend {
tracing::Span::current().record("conn_id", display(conn_id));
info!(%conn_id, "pool: opening a new connection '{conn_info}'");
let backend = self.auth_backend.as_ref().map(|()| keys);
crate::proxy::connect_compute::connect_to_compute(
crate::pglb::connect_compute::connect_to_compute(
ctx,
&TokioMechanism {
conn_id,
@@ -226,7 +226,7 @@ impl PoolingBackend {
},
keys: crate::auth::backend::ComputeCredentialKeys::None,
});
crate::proxy::connect_compute::connect_to_compute(
crate::pglb::connect_compute::connect_to_compute(
ctx,
&HyperMechanism {
conn_id,

View File

@@ -49,7 +49,7 @@ use crate::config::{ProxyConfig, ProxyProtocolV2};
use crate::context::RequestContext;
use crate::ext::TaskExt;
use crate::metrics::Metrics;
use crate::protocol2::{ChainRW, ConnectHeader, ConnectionInfo, read_proxy_protocol};
use crate::protocol2::{ConnectHeader, ConnectionInfo, read_proxy_protocol};
use crate::proxy::run_until_cancelled;
use crate::rate_limiter::EndpointRateLimiter;
use crate::serverless::backend::PoolingBackend;
@@ -207,12 +207,12 @@ pub(crate) type AsyncRW = Pin<Box<dyn AsyncReadWrite>>;
#[async_trait]
trait MaybeTlsAcceptor: Send + Sync + 'static {
async fn accept(&self, conn: ChainRW<TcpStream>) -> std::io::Result<AsyncRW>;
async fn accept(&self, conn: TcpStream) -> std::io::Result<AsyncRW>;
}
#[async_trait]
impl MaybeTlsAcceptor for &'static ArcSwapOption<crate::config::TlsConfig> {
async fn accept(&self, conn: ChainRW<TcpStream>) -> std::io::Result<AsyncRW> {
async fn accept(&self, conn: TcpStream) -> std::io::Result<AsyncRW> {
match &*self.load() {
Some(config) => Ok(Box::pin(
TlsAcceptor::from(config.http_config.clone())
@@ -235,33 +235,30 @@ async fn connection_startup(
peer_addr: SocketAddr,
) -> Option<(AsyncRW, ConnectionInfo)> {
// handle PROXY protocol
let (conn, peer) = match read_proxy_protocol(conn).await {
Ok(c) => c,
Err(e) => {
tracing::warn!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}");
return None;
let (conn, conn_info) = match config.proxy_protocol_v2 {
ProxyProtocolV2::Required => {
match read_proxy_protocol(conn).await {
Err(e) => {
warn!("per-client task finished with an error: {e:#}");
return None;
}
// our load balancers will not send any more data. let's just exit immediately
Ok((_conn, ConnectHeader::Local)) => {
tracing::debug!("healthcheck received");
return None;
}
Ok((conn, ConnectHeader::Proxy(info))) => (conn, info),
}
}
};
let conn_info = match peer {
// our load balancers will not send any more data. let's just exit immediately
ConnectHeader::Local => {
tracing::debug!("healthcheck received");
return None;
}
ConnectHeader::Missing if config.proxy_protocol_v2 == ProxyProtocolV2::Required => {
tracing::warn!("missing required proxy protocol header");
return None;
}
ConnectHeader::Proxy(_) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => {
tracing::warn!("proxy protocol header not supported");
return None;
}
ConnectHeader::Proxy(info) => info,
ConnectHeader::Missing => ConnectionInfo {
addr: peer_addr,
extra: None,
},
// ignore the header - it cannot be confused with a postgres or http
// connection, so it will error later.
ProxyProtocolV2::Rejected => (
conn,
ConnectionInfo {
addr: peer_addr,
extra: None,
},
),
};
let has_private_peer_addr = match conn_info.addr.ip() {

View File

@@ -357,31 +357,6 @@ class PgProtocol:
return TimelineId(cast("str", self.safe_psql("show neon.timeline_id")[0][0]))
class PageserverWalReceiverProtocol(StrEnum):
VANILLA = "vanilla"
INTERPRETED = "interpreted"
@staticmethod
def to_config_key_value(proto) -> tuple[str, dict[str, Any]]:
if proto == PageserverWalReceiverProtocol.VANILLA:
return (
"wal_receiver_protocol",
{
"type": "vanilla",
},
)
elif proto == PageserverWalReceiverProtocol.INTERPRETED:
return (
"wal_receiver_protocol",
{
"type": "interpreted",
"args": {"format": "protobuf", "compression": {"zstd": {"level": 1}}},
},
)
else:
raise ValueError(f"Unknown protocol type: {proto}")
@dataclass
class PageserverTracingConfig:
sampling_ratio: tuple[int, int]
@@ -423,6 +398,7 @@ class PageserverImportConfig:
"import_job_concurrency": self.import_job_concurrency,
"import_job_soft_size_limit": self.import_job_soft_size_limit,
"import_job_checkpoint_threshold": self.import_job_checkpoint_threshold,
"import_job_max_byte_range_size": 4 * 1024 * 1024, # Pageserver default
}
return ("timeline_import_config", value)
@@ -474,7 +450,6 @@ class NeonEnvBuilder:
safekeeper_extra_opts: list[str] | None = None,
storage_controller_port_override: int | None = None,
pageserver_virtual_file_io_mode: str | None = None,
pageserver_wal_receiver_protocol: PageserverWalReceiverProtocol | None = None,
pageserver_get_vectored_concurrent_io: str | None = None,
pageserver_tracing_config: PageserverTracingConfig | None = None,
pageserver_import_config: PageserverImportConfig | None = None,
@@ -551,11 +526,6 @@ class NeonEnvBuilder:
self.pageserver_virtual_file_io_mode = pageserver_virtual_file_io_mode
if pageserver_wal_receiver_protocol is not None:
self.pageserver_wal_receiver_protocol = pageserver_wal_receiver_protocol
else:
self.pageserver_wal_receiver_protocol = PageserverWalReceiverProtocol.INTERPRETED
assert test_name.startswith("test_"), (
"Unexpectedly instantiated from outside a test function"
)
@@ -1201,7 +1171,6 @@ class NeonEnv:
self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode
self.pageserver_wal_receiver_protocol = config.pageserver_wal_receiver_protocol
self.pageserver_get_vectored_concurrent_io = config.pageserver_get_vectored_concurrent_io
self.pageserver_tracing_config = config.pageserver_tracing_config
if config.pageserver_import_config is None:
@@ -1333,13 +1302,6 @@ class NeonEnv:
for key, value in override.items():
ps_cfg[key] = value
if self.pageserver_wal_receiver_protocol is not None:
key, value = PageserverWalReceiverProtocol.to_config_key_value(
self.pageserver_wal_receiver_protocol
)
if key not in ps_cfg:
ps_cfg[key] = value
if self.pageserver_tracing_config is not None:
key, value = self.pageserver_tracing_config.to_config_key_value()
@@ -4710,7 +4672,7 @@ class EndpointFactory:
origin: Endpoint,
endpoint_id: str | None = None,
config_lines: list[str] | None = None,
):
) -> Endpoint:
branch_name = origin.branch_name
assert origin in self.endpoints
assert branch_name is not None
@@ -4729,7 +4691,7 @@ class EndpointFactory:
origin: Endpoint,
endpoint_id: str | None = None,
config_lines: list[str] | None = None,
):
) -> Endpoint:
branch_name = origin.branch_name
assert origin in self.endpoints
assert branch_name is not None

View File

@@ -15,19 +15,10 @@ from fixtures.neon_fixtures import (
@pytest.mark.timeout(1200)
@pytest.mark.parametrize("shard_count", [1, 8, 32])
@pytest.mark.parametrize(
"wal_receiver_protocol",
[
"vanilla",
"interpreted-bincode-compressed",
"interpreted-protobuf-compressed",
],
)
def test_sharded_ingest(
neon_env_builder: NeonEnvBuilder,
zenbenchmark: NeonBenchmarker,
shard_count: int,
wal_receiver_protocol: str,
):
"""
Benchmarks sharded ingestion throughput, by ingesting a large amount of WAL into a Safekeeper
@@ -39,36 +30,6 @@ def test_sharded_ingest(
neon_env_builder.num_pageservers = shard_count
env = neon_env_builder.init_configs()
for ps in env.pageservers:
if wal_receiver_protocol == "vanilla":
ps.patch_config_toml_nonrecursive(
{
"wal_receiver_protocol": {
"type": "vanilla",
}
}
)
elif wal_receiver_protocol == "interpreted-bincode-compressed":
ps.patch_config_toml_nonrecursive(
{
"wal_receiver_protocol": {
"type": "interpreted",
"args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
}
}
)
elif wal_receiver_protocol == "interpreted-protobuf-compressed":
ps.patch_config_toml_nonrecursive(
{
"wal_receiver_protocol": {
"type": "interpreted",
"args": {"format": "protobuf", "compression": {"zstd": {"level": 1}}},
}
}
)
else:
raise AssertionError("Test must use explicit wal receiver protocol config")
env.start()
# Create a sharded tenant and timeline, and migrate it to the respective pageservers. Ensure

View File

@@ -182,10 +182,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
"lsn_lease_length": "1m",
"lsn_lease_length_for_ts": "5s",
"timeline_offloading": False,
"wal_receiver_protocol_override": {
"type": "interpreted",
"args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
},
"rel_size_v2_enabled": True,
"relsize_snapshot_cache_capacity": 10000,
"gc_compaction_enabled": True,

View File

@@ -26,6 +26,10 @@ def test_basebackup_cache(neon_env_builder: NeonEnvBuilder):
ps = env.pageserver
ps_http = ps.http_client()
storcon_managed_timelines = (env.storage_controller_config or {}).get(
"timelines_onto_safekeepers", False
)
# 1. Check that we always hit the cache after compute restart.
for i in range(3):
ep.start()
@@ -33,15 +37,26 @@ def test_basebackup_cache(neon_env_builder: NeonEnvBuilder):
def check_metrics(i=i):
metrics = ps_http.get_metrics()
# Never miss.
# The first time, compute_ctl sends `get_basebackup` with lsn=None; we do not cache such requests.
# All other requests should be hits.
assert (
metrics.query_one(
"pageserver_basebackup_cache_read_total", {"result": "miss"}
).value
== 0
)
if storcon_managed_timelines:
# We do not cache the initial basebackup yet,
# so the first compute startup should be a miss.
assert (
metrics.query_one(
"pageserver_basebackup_cache_read_total", {"result": "miss"}
).value
== 1
)
else:
# If the timeline is not initialized on safekeepers,
# compute_ctl sends `get_basebackup` with lsn=None for the first startup.
# We do not use the cache for such requests, so it's neither a hit nor a miss.
assert (
metrics.query_one(
"pageserver_basebackup_cache_read_total", {"result": "miss"}
).value
== 0
)
# All but the first requests are hits.
assert (
metrics.query_one("pageserver_basebackup_cache_read_total", {"result": "hit"}).value

View File

@@ -10,7 +10,6 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
PageserverWalReceiverProtocol,
generate_uploads_and_deletions,
)
from fixtures.pageserver.http import PageserverApiException
@@ -68,14 +67,9 @@ PREEMPT_GC_COMPACTION_TENANT_CONF = {
@skip_in_debug_build("only run with release build")
@pytest.mark.parametrize(
"wal_receiver_protocol",
[PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED],
)
@pytest.mark.timeout(900)
def test_pageserver_compaction_smoke(
neon_env_builder: NeonEnvBuilder,
wal_receiver_protocol: PageserverWalReceiverProtocol,
):
"""
This is a smoke test that compaction kicks in. The workload repeatedly churns
@@ -85,8 +79,6 @@ def test_pageserver_compaction_smoke(
observed bounds.
"""
neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol
# Effectively disable the page cache to rely only on image layers
# to shorten reads.
neon_env_builder.pageserver_config_override = """

View File

@@ -466,8 +466,13 @@ def test_perf_counters(neon_simple_env: NeonEnv):
#
# 1.5 is the minimum version to contain these views.
cur.execute("CREATE EXTENSION neon VERSION '1.5'")
cur.execute("set neon.monitor_query_exec_time = on")
cur.execute("SELECT * FROM neon_perf_counters")
cur.execute("SELECT * FROM neon_backend_perf_counters")
cur.execute(
"select value from neon_backend_perf_counters where metric='query_time_seconds_count' and pid=pg_backend_pid()"
)
assert cur.fetchall()[0][0] == 2
def collect_metric(

View File

@@ -1,9 +1,13 @@
from __future__ import annotations
from typing import TYPE_CHECKING
import pytest
from fixtures.log_helper import log
from fixtures.neon_cli import WalCraft
from fixtures.neon_fixtures import NeonEnvBuilder, PageserverWalReceiverProtocol
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnvBuilder
# Restart nodes with WAL end having specially crafted shape, like last record
# crossing segment boundary, to test decoding issues.
@@ -19,17 +23,10 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PageserverWalReceiverProtocol
"wal_record_crossing_segment_followed_by_small_one",
],
)
@pytest.mark.parametrize(
"wal_receiver_protocol",
[PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED],
)
def test_crafted_wal_end(
neon_env_builder: NeonEnvBuilder,
wal_type: str,
wal_receiver_protocol: PageserverWalReceiverProtocol,
):
neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol
env = neon_env_builder.init_start()
env.create_branch("test_crafted_wal_end")
env.pageserver.allowed_errors.extend(

View File

@@ -159,7 +159,8 @@ def test_remote_extensions(
# Setup a mock nginx S3 gateway which will return our test extension.
(host, port) = httpserver_listen_address
extensions_endpoint = f"http://{host}:{port}/pg-ext-s3-gateway"
remote_ext_base_url = f"http://{host}:{port}/pg-ext-s3-gateway"
log.info(f"remote extensions base URL: {remote_ext_base_url}")
extension.build(pg_config, test_output_dir)
tarball = extension.package(test_output_dir)
@@ -221,7 +222,7 @@ def test_remote_extensions(
endpoint.create_remote_extension_spec(spec)
endpoint.start(remote_ext_base_url=extensions_endpoint)
endpoint.start(remote_ext_base_url=remote_ext_base_url)
with endpoint.connect() as conn:
with conn.cursor() as cur:
@@ -249,7 +250,7 @@ def test_remote_extensions(
# Remove the extension files to force a redownload of the extension.
extension.remove(test_output_dir, pg_version)
endpoint.start(remote_ext_base_url=extensions_endpoint)
endpoint.start(remote_ext_base_url=remote_ext_base_url)
# Test that ALTER EXTENSION UPDATE statements also fetch remote extensions.
with endpoint.connect() as conn:

View File

@@ -74,8 +74,9 @@ def test_hot_standby(neon_simple_env: NeonEnv):
for query in queries:
with s_con.cursor() as secondary_cursor:
secondary_cursor.execute(query)
response = secondary_cursor.fetchone()
assert response is not None
res = secondary_cursor.fetchone()
assert res is not None
response = res
assert response == responses[query]
# Check for corrupted WAL messages which might otherwise go unnoticed if
@@ -164,7 +165,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
s_cur.execute("SELECT COUNT(*) FROM test")
res = s_cur.fetchone()
assert res[0] == 10000
assert res == (10000,)
# Clear the cache in the standby, so that when we
# re-execute the query, it will make GetPage
@@ -195,7 +196,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
s_cur.execute("SELECT COUNT(*) FROM test")
log_replica_lag(primary, secondary)
res = s_cur.fetchone()
assert res[0] == 10000
assert res == (10000,)
def run_pgbench(connstr: str, pg_bin: PgBin):

View File

@@ -0,0 +1,133 @@
"""
Tests for secondary->primary promotion.
So far, this only checks that promotion doesn't break anything and that the data is persisted.
"""
import psycopg2
from fixtures.log_helper import log
from fixtures.neon_fixtures import Endpoint, NeonEnv, wait_replica_caughtup
from fixtures.pg_version import PgVersion
from pytest import raises
def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
"""
Test that a replica safely promotes, and can commit data updates which
show up when the primary boots up after the promoted secondary endpoint
shut down.
"""
# Initialize the primary, a test table, and a helper function to create lots
# of subtransactions.
env: NeonEnv = neon_simple_env
primary: Endpoint = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")
with primary.connect() as primary_conn:
primary_cur = primary_conn.cursor()
primary_cur.execute(
"create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)"
)
primary_cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
primary_cur.execute(
"""
SELECT pg_current_wal_insert_lsn(),
pg_current_wal_lsn(),
pg_current_wal_flush_lsn()
"""
)
log.info(f"Primary: Current LSN after workload is {primary_cur.fetchone()}")
primary_cur.execute("show neon.safekeepers")
safekeepers = primary_cur.fetchall()[0][0]
wait_replica_caughtup(primary, secondary)
with secondary.connect() as secondary_conn:
secondary_cur = secondary_conn.cursor()
secondary_cur.execute("select count(*) from t")
assert secondary_cur.fetchone() == (100,)
with raises(psycopg2.Error):
secondary_cur.execute("INSERT INTO t (payload) SELECT generate_series(101, 200)")
secondary_conn.commit()
secondary_conn.rollback()
secondary_cur.execute("select count(*) from t")
assert secondary_cur.fetchone() == (100,)
primary.stop_and_destroy(mode="immediate")
# Reconnect to the secondary to make sure we get a read-write connection
promo_conn = secondary.connect()
promo_cur = promo_conn.cursor()
promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'")
promo_cur.execute("select pg_reload_conf()")
promo_cur.execute("SELECT * FROM pg_promote()")
assert promo_cur.fetchone() == (True,)
promo_cur.execute(
"""
SELECT pg_current_wal_insert_lsn(),
pg_current_wal_lsn(),
pg_current_wal_flush_lsn()
"""
)
log.info(f"Secondary: LSN after promotion is {promo_cur.fetchone()}")
# Reconnect to the secondary to make sure we get a read-write connection
with secondary.connect() as new_primary_conn:
new_primary_cur = new_primary_conn.cursor()
new_primary_cur.execute("select count(*) from t")
assert new_primary_cur.fetchone() == (100,)
new_primary_cur.execute(
"INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload"
)
assert new_primary_cur.fetchall() == [(it,) for it in range(101, 201)]
new_primary_cur = new_primary_conn.cursor()
new_primary_cur.execute("select payload from t")
assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]
new_primary_cur.execute("select count(*) from t")
assert new_primary_cur.fetchone() == (200,)
new_primary_cur.execute(
"""
SELECT pg_current_wal_insert_lsn(),
pg_current_wal_lsn(),
pg_current_wal_flush_lsn()
"""
)
log.info(f"Secondary: LSN after workload is {new_primary_cur.fetchone()}")
with secondary.connect() as second_viewpoint_conn:
new_primary_cur = second_viewpoint_conn.cursor()
new_primary_cur.execute("select payload from t")
assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]
# wait_for_last_flush_lsn(env, secondary, env.initial_tenant, env.initial_timeline)
secondary.stop_and_destroy()
primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
with primary.connect() as new_primary:
new_primary_cur = new_primary.cursor()
new_primary_cur.execute(
"""
SELECT pg_current_wal_insert_lsn(),
pg_current_wal_lsn(),
pg_current_wal_flush_lsn()
"""
)
log.info(f"New primary: Boot LSN is {new_primary_cur.fetchone()}")
new_primary_cur.execute("select count(*) from t")
assert new_primary_cur.fetchone() == (200,)
new_primary_cur.execute("INSERT INTO t (payload) SELECT generate_series(201, 300)")
new_primary_cur.execute("select count(*) from t")
assert new_primary_cur.fetchone() == (300,)
primary.stop(mode="immediate")

View File

@@ -1388,10 +1388,12 @@ def test_sharding_split_failures(
with pytest.raises(failure.expect_exception()):
env.storage_controller.tenant_shard_split(tenant_id, shard_count=4)
def assert_shard_count(shard_count: int, exclude_ps_id: int | None = None) -> None:
# We expect that the overall operation will fail, but some split requests
# will have succeeded: the net result should be to return to a clean state, including
# detaching any child shards.
def assert_rolled_back(exclude_ps_id=None) -> None:
secondary_count = 0
attached_count = 0
log.info(f"Iterating over {len(env.pageservers)} pageservers to check shard count")
for ps in env.pageservers:
if exclude_ps_id is not None and ps.id == exclude_ps_id:
continue
@@ -1402,23 +1404,35 @@ def test_sharding_split_failures(
if tenant_shard_id.tenant_id != tenant_id:
continue # skip bystanders
log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
assert tenant_shard_id.shard_count == shard_count
assert tenant_shard_id.shard_count == initial_shard_count
if loc[1]["mode"] == "Secondary":
secondary_count += 1
else:
attached_count += 1
assert secondary_count == shard_count
assert attached_count == shard_count
# We expect that the overall operation will fail, but some split requests
# will have succeeded: the net result should be to return to a clean state, including
# detaching any child shards.
def assert_rolled_back(exclude_ps_id: int | None = None) -> None:
assert_shard_count(initial_shard_count, exclude_ps_id)
assert secondary_count == initial_shard_count
assert attached_count == initial_shard_count
def assert_split_done(exclude_ps_id: int | None = None) -> None:
assert_shard_count(split_shard_count, exclude_ps_id)
secondary_count = 0
attached_count = 0
for ps in env.pageservers:
if exclude_ps_id is not None and ps.id == exclude_ps_id:
continue
locations = ps.http_client().tenant_list_locations()["tenant_shards"]
for loc in locations:
tenant_shard_id = TenantShardId.parse(loc[0])
if tenant_shard_id.tenant_id != tenant_id:
continue # skip bystanders
log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
assert tenant_shard_id.shard_count == split_shard_count
if loc[1]["mode"] == "Secondary":
secondary_count += 1
else:
attached_count += 1
assert attached_count == split_shard_count
assert secondary_count == split_shard_count
def finish_split():
# Having failed+rolled back, we should be able to split again
@@ -1454,7 +1468,6 @@ def test_sharding_split_failures(
# The split should appear to be rolled back from the point of view of all pageservers
# apart from the one that is offline
env.storage_controller.reconcile_until_idle()
wait_until(lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id))
finish_split()
@@ -1469,7 +1482,6 @@ def test_sharding_split_failures(
log.info("Clearing failure...")
failure.clear(env)
env.storage_controller.reconcile_until_idle()
wait_until(assert_rolled_back)
# Having rolled back, the tenant should be working

Some files were not shown because too many files have changed in this diff.