mirror of
https://github.com/neondatabase/neon.git
synced 2026-03-11 20:30:37 +00:00
Compare commits
311 Commits
test-proxy
...
communicat
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cd14f6ca94 | ||
|
|
8ed56decfb | ||
|
|
e466cd1eb2 | ||
|
|
4a031b9467 | ||
|
|
26bd994852 | ||
|
|
b78cdfe3ea | ||
|
|
50302499f5 | ||
|
|
ede37c5346 | ||
|
|
b72f410b6e | ||
|
|
e1c7d79e2a | ||
|
|
bb1f50bf09 | ||
|
|
9871a3f9e7 | ||
|
|
e1df05448c | ||
|
|
b4a63e0a34 | ||
|
|
17cd611ccc | ||
|
|
c509d53cd1 | ||
|
|
84f4dcd2be | ||
|
|
b4808a4e5c | ||
|
|
5e2a19ce73 | ||
|
|
8a4f16a471 | ||
|
|
f8fc0bf3c0 | ||
|
|
8fe7596120 | ||
|
|
f3ee6e818d | ||
|
|
edd60730c8 | ||
|
|
975b95f4cd | ||
|
|
0428164058 | ||
|
|
c8042f9e31 | ||
|
|
4016808dff | ||
|
|
01c39f378e | ||
|
|
4d3b28bd2e | ||
|
|
c8b875c93b | ||
|
|
768fc101cc | ||
|
|
81ddd10be6 | ||
|
|
3dfa2fc3e4 | ||
|
|
e470997627 | ||
|
|
49204b6a59 | ||
|
|
c0360644a7 | ||
|
|
688990e7ec | ||
|
|
af5e3da381 | ||
|
|
eb2741758b | ||
|
|
f3a0e4f255 | ||
|
|
842a5091d5 | ||
|
|
fca52af7e3 | ||
|
|
95ef69ca95 | ||
|
|
9e250e382a | ||
|
|
80d052f262 | ||
|
|
349a5c6724 | ||
|
|
aad301e083 | ||
|
|
e0db31456b | ||
|
|
b6b3911063 | ||
|
|
0e0aff7b8c | ||
|
|
5a5ea9cb9f | ||
|
|
3d209dcaae | ||
|
|
f939691f6a | ||
|
|
f96c8f63c2 | ||
|
|
c8cdd25da4 | ||
|
|
90242416a6 | ||
|
|
6d8b1cc754 | ||
|
|
35da660200 | ||
|
|
bfdd37b54e | ||
|
|
6cd1295d9f | ||
|
|
eaec6e2fb4 | ||
|
|
f7e403eea1 | ||
|
|
464ed0cbc7 | ||
|
|
f55ccd2c17 | ||
|
|
c9758dc46b | ||
|
|
78c5d70b4c | ||
|
|
fc35be0397 | ||
|
|
a7a6df3d6f | ||
|
|
bfb4b0991d | ||
|
|
c18f4a52f8 | ||
|
|
48535798ba | ||
|
|
dc35bda074 | ||
|
|
e2c3c2eccb | ||
|
|
cb50291dcd | ||
|
|
10a7d49726 | ||
|
|
f765bd3677 | ||
|
|
edcdd6ca9c | ||
|
|
62af2a14e2 | ||
|
|
367d96e25b | ||
|
|
87f01a25ab | ||
|
|
56eb511618 | ||
|
|
ddeb3f3ed3 | ||
|
|
69dbad700c | ||
|
|
0d5f4dd979 | ||
|
|
1637fbce25 | ||
|
|
8cd5370c00 | ||
|
|
bceafc6c32 | ||
|
|
dcf8e0565f | ||
|
|
c14cf15b52 | ||
|
|
5da06d4129 | ||
|
|
f30c59bec9 | ||
|
|
47c099a0fb | ||
|
|
b67e8f2edc | ||
|
|
b5b1db29bb | ||
|
|
ed4652b65b | ||
|
|
60d87966b8 | ||
|
|
8db138ef64 | ||
|
|
1ee24602d5 | ||
|
|
732bd26e70 | ||
|
|
08399672be | ||
|
|
d63f1d259a | ||
|
|
4053092408 | ||
|
|
ccf88e9375 | ||
|
|
a79fd3bda7 | ||
|
|
e1b58d5d69 | ||
|
|
9ae004f3bc | ||
|
|
341c5f53d8 | ||
|
|
4b06b547c1 | ||
|
|
74e0d85a04 | ||
|
|
23ba42446b | ||
|
|
71a83daac2 | ||
|
|
1b8355a9f9 | ||
|
|
e14bb4be39 | ||
|
|
f3a6c0d8ff | ||
|
|
17ec37aab2 | ||
|
|
d6ec1f1a1c | ||
|
|
6f3fb4433f | ||
|
|
d7678df445 | ||
|
|
03d9f0ec41 | ||
|
|
56845f2da2 | ||
|
|
9a37bfdf63 | ||
|
|
4c916552e8 | ||
|
|
50fbf4ac53 | ||
|
|
cb698a3951 | ||
|
|
f6cc5cbd0c | ||
|
|
00affada26 | ||
|
|
90d3c09c24 | ||
|
|
6c398aeae7 | ||
|
|
1856bbbb9f | ||
|
|
bd46dd60a0 | ||
|
|
5f2d476a58 | ||
|
|
3231cb6138 | ||
|
|
e558e0da5c | ||
|
|
70bf2e088d | ||
|
|
da3f9ee72d | ||
|
|
88d1127bf4 | ||
|
|
794bb7a9e8 | ||
|
|
42e4e5a418 | ||
|
|
96a817fa2b | ||
|
|
e7b057f2e8 | ||
|
|
956c2f4378 | ||
|
|
3293e4685e | ||
|
|
6f8650782f | ||
|
|
14214eb853 | ||
|
|
d4b4724921 | ||
|
|
9aba9550dd | ||
|
|
375e8e5592 | ||
|
|
52c586f678 | ||
|
|
de97b73d6e | ||
|
|
d8556616c9 | ||
|
|
d8296e60e6 | ||
|
|
7263d6e2e5 | ||
|
|
86fb7b966a | ||
|
|
0c099b0944 | ||
|
|
2fe27f510d | ||
|
|
19b5618578 | ||
|
|
12dade35fa | ||
|
|
1ec63bd6bc | ||
|
|
7012b4aa90 | ||
|
|
2cc28c75be | ||
|
|
bf01145ae4 | ||
|
|
8ab8fc11a3 | ||
|
|
6f0af96a54 | ||
|
|
9913d2668a | ||
|
|
2fefece77d | ||
|
|
471191e64e | ||
|
|
f6761760a2 | ||
|
|
0bce818d5e | ||
|
|
48be1da6ef | ||
|
|
d2efc80e40 | ||
|
|
958c2577f5 | ||
|
|
175c2e11e3 | ||
|
|
efdb07e7b6 | ||
|
|
b0970b415c | ||
|
|
9d3e07ef2c | ||
|
|
7429dd711c | ||
|
|
88ac1e356b | ||
|
|
c3cb1ab98d | ||
|
|
81ac4ef43a | ||
|
|
a5b0fc560c | ||
|
|
67b04f8ab3 | ||
|
|
9d9e3cd08a | ||
|
|
97a8f4ef85 | ||
|
|
39f31957e3 | ||
|
|
924c6a6fdf | ||
|
|
7020476bf5 | ||
|
|
80e948db93 | ||
|
|
bfb30d434c | ||
|
|
f3ba201800 | ||
|
|
8b7796cbfa | ||
|
|
fdc7e9c2a4 | ||
|
|
a352d290eb | ||
|
|
8c122a1c98 | ||
|
|
74330920ee | ||
|
|
c3c136ef3a | ||
|
|
78b6da270b | ||
|
|
47664e40d4 | ||
|
|
b1e3161d4e | ||
|
|
4713715c59 | ||
|
|
1e74b52f7e | ||
|
|
e3ecdfbecc | ||
|
|
d08e553835 | ||
|
|
7fffb5b4df | ||
|
|
1fb3639170 | ||
|
|
00dfaa2eb4 | ||
|
|
ae740ca1bb | ||
|
|
24e6c68772 | ||
|
|
93a45708ff | ||
|
|
46b5c0be0b | ||
|
|
2d913ff125 | ||
|
|
e90be06d46 | ||
|
|
356ba67607 | ||
|
|
610ea22c46 | ||
|
|
1847f4de54 | ||
|
|
477648b8cd | ||
|
|
e8af3a2811 | ||
|
|
b603e3dddb | ||
|
|
83007782fd | ||
|
|
bb1e359872 | ||
|
|
ac87544e79 | ||
|
|
b6b122e07b | ||
|
|
782062014e | ||
|
|
d0b3629412 | ||
|
|
16d6898e44 | ||
|
|
f4d51c0f5c | ||
|
|
ec17ae0658 | ||
|
|
9ecce60ded | ||
|
|
e74a957045 | ||
|
|
396a16a3b2 | ||
|
|
7140a50225 | ||
|
|
68f18ccacf | ||
|
|
786888d93f | ||
|
|
255537dda1 | ||
|
|
8b494f6a24 | ||
|
|
28a61741b3 | ||
|
|
10b936bf03 | ||
|
|
2fb6164bf8 | ||
|
|
328f28dfe5 | ||
|
|
95838056da | ||
|
|
6145cfd1c2 | ||
|
|
6d451654f1 | ||
|
|
96b4de1de6 | ||
|
|
9fdf5fbb7e | ||
|
|
37c58522a2 | ||
|
|
4b6f02e47d | ||
|
|
8202c6172f | ||
|
|
69a47d789d | ||
|
|
b36f880710 | ||
|
|
745b750f33 | ||
|
|
f06bb2bbd8 | ||
|
|
b3c25418a6 | ||
|
|
33549bad1d | ||
|
|
009168d711 | ||
|
|
7c9bd542a6 | ||
|
|
014823b305 | ||
|
|
af9379ccf6 | ||
|
|
bb28109ffa | ||
|
|
60a0bec1c0 | ||
|
|
31fa7a545d | ||
|
|
ac464c5f2c | ||
|
|
0dddb1e373 | ||
|
|
3acb263e62 | ||
|
|
1e83398cdd | ||
|
|
be8ed81532 | ||
|
|
12b08c4b82 | ||
|
|
827358dd03 | ||
|
|
d367273000 | ||
|
|
e2bad5d9e9 | ||
|
|
5623e4665b | ||
|
|
8abb4dab6d | ||
|
|
731667ac37 | ||
|
|
6a1374d106 | ||
|
|
f7c908f2f0 | ||
|
|
86671e3a0b | ||
|
|
319cd74f73 | ||
|
|
0efefbf77c | ||
|
|
e6a4171fa1 | ||
|
|
0c25ea9e31 | ||
|
|
6692321026 | ||
|
|
791df28755 | ||
|
|
d20da994f4 | ||
|
|
6dbbdaae73 | ||
|
|
977bc09d2a | ||
|
|
44269fcd5e | ||
|
|
44cc648dc8 | ||
|
|
884e028a4a | ||
|
|
42df3e5453 | ||
|
|
fc743e284f | ||
|
|
d02f9a2139 | ||
|
|
083118e98e | ||
|
|
54cd2272f1 | ||
|
|
e40193e3c8 | ||
|
|
ce9f7bacc1 | ||
|
|
b7891f8fe8 | ||
|
|
5f2adaa9ad | ||
|
|
3e5e396c8d | ||
|
|
9d781c6fda | ||
|
|
cf5d038472 | ||
|
|
d785100c02 | ||
|
|
2c0d930e3d | ||
|
|
66171a117b | ||
|
|
df2806e7a0 | ||
|
|
07631692db | ||
|
|
4c77397943 | ||
|
|
7bb58be546 | ||
|
|
b5373de208 | ||
|
|
b86c610f42 | ||
|
|
0f520d79ab | ||
|
|
93eb7bb6b8 | ||
|
|
e58d0fece1 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -15,6 +15,7 @@ neon.iml
|
|||||||
/.neon
|
/.neon
|
||||||
/integration_tests/.neon
|
/integration_tests/.neon
|
||||||
compaction-suite-results.*
|
compaction-suite-results.*
|
||||||
|
pgxn/neon/communicator/communicator_bindings.h
|
||||||
docker-compose/docker-compose-parallel.yml
|
docker-compose/docker-compose-parallel.yml
|
||||||
|
|
||||||
# Coverage
|
# Coverage
|
||||||
|
|||||||
107
Cargo.lock
generated
107
Cargo.lock
generated
@@ -259,6 +259,17 @@ version = "1.1.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
|
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "atomic_enum"
|
||||||
|
version = "0.3.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "99e1aca718ea7b89985790c94aad72d77533063fe00bc497bb79a7c2dae6a661"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn 2.0.100",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "autocfg"
|
name = "autocfg"
|
||||||
version = "1.1.0"
|
version = "1.1.0"
|
||||||
@@ -1296,13 +1307,29 @@ dependencies = [
|
|||||||
name = "communicator"
|
name = "communicator"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"atomic_enum",
|
||||||
"axum",
|
"axum",
|
||||||
|
"bytes",
|
||||||
"cbindgen",
|
"cbindgen",
|
||||||
|
"clashmap",
|
||||||
"http 1.3.1",
|
"http 1.3.1",
|
||||||
|
"libc",
|
||||||
"measured",
|
"measured",
|
||||||
|
"neon-shmem",
|
||||||
|
"nix 0.30.1",
|
||||||
|
"pageserver_api",
|
||||||
|
"pageserver_client_grpc",
|
||||||
|
"pageserver_page_api",
|
||||||
|
"prometheus",
|
||||||
|
"prost 0.13.5",
|
||||||
|
"strum_macros",
|
||||||
|
"thiserror 1.0.69",
|
||||||
"tokio",
|
"tokio",
|
||||||
|
"tokio-pipe",
|
||||||
|
"tonic",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
|
"uring-common",
|
||||||
"utils",
|
"utils",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
@@ -1643,9 +1670,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "crossbeam-utils"
|
name = "crossbeam-utils"
|
||||||
version = "0.8.19"
|
version = "0.8.21"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
|
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "crossterm"
|
name = "crossterm"
|
||||||
@@ -2361,6 +2388,12 @@ version = "1.0.7"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "foldhash"
|
||||||
|
version = "0.1.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "form_urlencoded"
|
name = "form_urlencoded"
|
||||||
version = "1.2.1"
|
version = "1.2.1"
|
||||||
@@ -2742,6 +2775,16 @@ version = "0.15.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
|
checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hashbrown"
|
||||||
|
version = "0.15.4"
|
||||||
|
source = "git+https://github.com/quantumish/hashbrown.git?rev=6610e6d#6610e6d2b1f288ef7b0709a3efefbc846395dc5e"
|
||||||
|
dependencies = [
|
||||||
|
"allocator-api2",
|
||||||
|
"equivalent",
|
||||||
|
"foldhash",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "hashlink"
|
name = "hashlink"
|
||||||
version = "0.9.1"
|
version = "0.9.1"
|
||||||
@@ -3822,7 +3865,7 @@ dependencies = [
|
|||||||
"prometheus",
|
"prometheus",
|
||||||
"rand 0.9.1",
|
"rand 0.9.1",
|
||||||
"rand_distr",
|
"rand_distr",
|
||||||
"twox-hash",
|
"twox-hash 1.6.3",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -3928,15 +3971,21 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
|
|||||||
name = "neon-shmem"
|
name = "neon-shmem"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"ahash",
|
||||||
|
"criterion",
|
||||||
|
"hashbrown 0.15.4",
|
||||||
"libc",
|
"libc",
|
||||||
"lock_api",
|
"lock_api",
|
||||||
"nix 0.30.1",
|
"nix 0.30.1",
|
||||||
"rand 0.9.1",
|
"rand 0.9.1",
|
||||||
"rand_distr",
|
"rand_distr",
|
||||||
"rustc-hash 2.1.1",
|
"rustc-hash 2.1.1",
|
||||||
|
"seahash",
|
||||||
"tempfile",
|
"tempfile",
|
||||||
"thiserror 1.0.69",
|
"thiserror 1.0.69",
|
||||||
|
"twox-hash 2.1.1",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
|
"xxhash-rust",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -4391,13 +4440,16 @@ version = "0.1.0"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"async-trait",
|
"async-trait",
|
||||||
|
"axum",
|
||||||
"bytes",
|
"bytes",
|
||||||
"camino",
|
"camino",
|
||||||
"clap",
|
"clap",
|
||||||
"futures",
|
"futures",
|
||||||
"hdrhistogram",
|
"hdrhistogram",
|
||||||
|
"http 1.3.1",
|
||||||
"humantime",
|
"humantime",
|
||||||
"humantime-serde",
|
"humantime-serde",
|
||||||
|
"metrics",
|
||||||
"pageserver_api",
|
"pageserver_api",
|
||||||
"pageserver_client",
|
"pageserver_client",
|
||||||
"pageserver_client_grpc",
|
"pageserver_client_grpc",
|
||||||
@@ -4487,6 +4539,7 @@ dependencies = [
|
|||||||
"pageserver_client",
|
"pageserver_client",
|
||||||
"pageserver_compaction",
|
"pageserver_compaction",
|
||||||
"pageserver_page_api",
|
"pageserver_page_api",
|
||||||
|
"peekable",
|
||||||
"pem",
|
"pem",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"postgres-protocol",
|
"postgres-protocol",
|
||||||
@@ -4500,6 +4553,7 @@ dependencies = [
|
|||||||
"pprof",
|
"pprof",
|
||||||
"pq_proto",
|
"pq_proto",
|
||||||
"procfs",
|
"procfs",
|
||||||
|
"prost 0.13.5",
|
||||||
"rand 0.9.1",
|
"rand 0.9.1",
|
||||||
"range-set-blaze",
|
"range-set-blaze",
|
||||||
"regex",
|
"regex",
|
||||||
@@ -4536,7 +4590,7 @@ dependencies = [
|
|||||||
"tower 0.5.2",
|
"tower 0.5.2",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-utils",
|
"tracing-utils",
|
||||||
"twox-hash",
|
"twox-hash 1.6.3",
|
||||||
"url",
|
"url",
|
||||||
"utils",
|
"utils",
|
||||||
"uuid",
|
"uuid",
|
||||||
@@ -4748,7 +4802,7 @@ dependencies = [
|
|||||||
"paste",
|
"paste",
|
||||||
"seq-macro",
|
"seq-macro",
|
||||||
"thrift",
|
"thrift",
|
||||||
"twox-hash",
|
"twox-hash 1.6.3",
|
||||||
"zstd",
|
"zstd",
|
||||||
"zstd-sys",
|
"zstd-sys",
|
||||||
]
|
]
|
||||||
@@ -4794,6 +4848,15 @@ dependencies = [
|
|||||||
"sha2",
|
"sha2",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "peekable"
|
||||||
|
version = "0.3.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "225f9651e475709164f871dc2f5724956be59cb9edb055372ffeeab01ec2d20b"
|
||||||
|
dependencies = [
|
||||||
|
"smallvec",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pem"
|
name = "pem"
|
||||||
version = "3.0.3"
|
version = "3.0.3"
|
||||||
@@ -6493,6 +6556,12 @@ version = "0.4.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "621e3680f3e07db4c9c2c3fb07c6223ab2fab2e54bd3c04c3ae037990f428c32"
|
checksum = "621e3680f3e07db4c9c2c3fb07c6223ab2fab2e54bd3c04c3ae037990f428c32"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "seahash"
|
||||||
|
version = "4.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "sec1"
|
name = "sec1"
|
||||||
version = "0.3.0"
|
version = "0.3.0"
|
||||||
@@ -7646,6 +7715,16 @@ dependencies = [
|
|||||||
"syn 2.0.100",
|
"syn 2.0.100",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tokio-pipe"
|
||||||
|
version = "0.2.12"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f213a84bffbd61b8fa0ba8a044b4bbe35d471d0b518867181e82bd5c15542784"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"tokio",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tokio-postgres"
|
name = "tokio-postgres"
|
||||||
version = "0.7.10"
|
version = "0.7.10"
|
||||||
@@ -8183,6 +8262,15 @@ dependencies = [
|
|||||||
"static_assertions",
|
"static_assertions",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "twox-hash"
|
||||||
|
version = "2.1.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8b907da542cbced5261bd3256de1b3a1bf340a3d37f93425a07362a1d687de56"
|
||||||
|
dependencies = [
|
||||||
|
"rand 0.9.1",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "typed-json"
|
name = "typed-json"
|
||||||
version = "0.1.1"
|
version = "0.1.1"
|
||||||
@@ -9013,8 +9101,8 @@ dependencies = [
|
|||||||
"clap",
|
"clap",
|
||||||
"clap_builder",
|
"clap_builder",
|
||||||
"const-oid",
|
"const-oid",
|
||||||
|
"criterion",
|
||||||
"crossbeam-epoch",
|
"crossbeam-epoch",
|
||||||
"crossbeam-utils",
|
|
||||||
"crypto-bigint 0.5.5",
|
"crypto-bigint 0.5.5",
|
||||||
"der 0.7.8",
|
"der 0.7.8",
|
||||||
"deranged",
|
"deranged",
|
||||||
@@ -9057,7 +9145,6 @@ dependencies = [
|
|||||||
"num-iter",
|
"num-iter",
|
||||||
"num-rational",
|
"num-rational",
|
||||||
"num-traits",
|
"num-traits",
|
||||||
"once_cell",
|
|
||||||
"p256 0.13.2",
|
"p256 0.13.2",
|
||||||
"parquet",
|
"parquet",
|
||||||
"portable-atomic",
|
"portable-atomic",
|
||||||
@@ -9166,6 +9253,12 @@ version = "0.13.5"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd"
|
checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "xxhash-rust"
|
||||||
|
version = "0.8.15"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "yansi"
|
name = "yansi"
|
||||||
version = "1.0.1"
|
version = "1.0.1"
|
||||||
|
|||||||
@@ -93,6 +93,7 @@ clap = { version = "4.0", features = ["derive", "env"] }
|
|||||||
clashmap = { version = "1.0", features = ["raw-api"] }
|
clashmap = { version = "1.0", features = ["raw-api"] }
|
||||||
comfy-table = "7.1"
|
comfy-table = "7.1"
|
||||||
const_format = "0.2"
|
const_format = "0.2"
|
||||||
|
crossbeam-utils = "0.8.21"
|
||||||
crc32c = "0.6"
|
crc32c = "0.6"
|
||||||
diatomic-waker = { version = "0.2.3" }
|
diatomic-waker = { version = "0.2.3" }
|
||||||
either = "1.8"
|
either = "1.8"
|
||||||
@@ -152,6 +153,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
|
|||||||
parquet_derive = "53"
|
parquet_derive = "53"
|
||||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||||
pem = "3.0.3"
|
pem = "3.0.3"
|
||||||
|
peekable = "0.3.0"
|
||||||
pin-project-lite = "0.2"
|
pin-project-lite = "0.2"
|
||||||
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
|
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
|
||||||
procfs = "0.16"
|
procfs = "0.16"
|
||||||
@@ -190,6 +192,7 @@ smallvec = "1.11"
|
|||||||
smol_str = { version = "0.2.0", features = ["serde"] }
|
smol_str = { version = "0.2.0", features = ["serde"] }
|
||||||
socket2 = "0.5"
|
socket2 = "0.5"
|
||||||
spki = "0.7.3"
|
spki = "0.7.3"
|
||||||
|
spin = "0.9.8"
|
||||||
strum = "0.26"
|
strum = "0.26"
|
||||||
strum_macros = "0.26"
|
strum_macros = "0.26"
|
||||||
"subtle" = "2.5.0"
|
"subtle" = "2.5.0"
|
||||||
@@ -201,7 +204,6 @@ thiserror = "1.0"
|
|||||||
tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
|
tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
|
||||||
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
|
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
|
||||||
tokio = { version = "1.43.1", features = ["macros"] }
|
tokio = { version = "1.43.1", features = ["macros"] }
|
||||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
|
||||||
tokio-io-timeout = "1.2.0"
|
tokio-io-timeout = "1.2.0"
|
||||||
tokio-postgres-rustls = "0.12.0"
|
tokio-postgres-rustls = "0.12.0"
|
||||||
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
|
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
|
||||||
@@ -242,6 +244,9 @@ zeroize = "1.8"
|
|||||||
env_logger = "0.11"
|
env_logger = "0.11"
|
||||||
log = "0.4"
|
log = "0.4"
|
||||||
|
|
||||||
|
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||||
|
uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||||
|
|
||||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
||||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
||||||
|
|||||||
@@ -1 +1 @@
|
|||||||
SELECT num_requested AS checkpoints_req FROM pg_stat_checkpointer;
|
SELECT num_requested AS checkpoints_req FROM pg_catalog.pg_stat_checkpointer;
|
||||||
|
|||||||
@@ -1 +1 @@
|
|||||||
SELECT checkpoints_req FROM pg_stat_bgwriter;
|
SELECT checkpoints_req FROM pg_catalog.pg_stat_bgwriter;
|
||||||
|
|||||||
@@ -1 +1 @@
|
|||||||
SELECT checkpoints_timed FROM pg_stat_bgwriter;
|
SELECT checkpoints_timed FROM pg_catalog.pg_stat_bgwriter;
|
||||||
|
|||||||
@@ -1 +1 @@
|
|||||||
SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled;
|
SELECT (neon.backpressure_throttling_time()::pg_catalog.float8 / 1000000) AS throttled;
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
SELECT CASE
|
SELECT CASE
|
||||||
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
|
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_replay_lsn() - '0/0')::pg_catalog.FLOAT8
|
||||||
ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8
|
ELSE (pg_catalog.pg_current_wal_lsn() - '0/0')::pg_catalog.FLOAT8
|
||||||
END AS lsn;
|
END AS lsn;
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
SELECT
|
SELECT
|
||||||
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
|
(SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
|
||||||
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
|
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
|
||||||
-- These temporary snapshot files are renamed to the actual snapshot files
|
-- These temporary snapshot files are renamed to the actual snapshot files
|
||||||
-- after they are completely built. We only WAL-log the completely built
|
-- after they are completely built. We only WAL-log the completely built
|
||||||
-- snapshot files
|
-- snapshot files
|
||||||
(SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
|
(SELECT COUNT(*) FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
SELECT
|
SELECT
|
||||||
(SELECT current_setting('neon.timeline_id')) AS timeline_id,
|
(SELECT pg_catalog.current_setting('neon.timeline_id')) AS timeline_id,
|
||||||
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
|
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
|
||||||
-- These temporary snapshot files are renamed to the actual snapshot files
|
-- These temporary snapshot files are renamed to the actual snapshot files
|
||||||
-- after they are completely built. We only WAL-log the completely built
|
-- after they are completely built. We only WAL-log the completely built
|
||||||
-- snapshot files
|
-- snapshot files
|
||||||
(SELECT COALESCE(sum(size), 0) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes;
|
(SELECT COALESCE(pg_catalog.sum(size), 0) FROM pg_catalog.pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes;
|
||||||
|
|||||||
@@ -1,9 +1,9 @@
|
|||||||
SELECT
|
SELECT
|
||||||
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
|
(SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
|
||||||
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
|
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
|
||||||
-- These temporary snapshot files are renamed to the actual snapshot files
|
-- These temporary snapshot files are renamed to the actual snapshot files
|
||||||
-- after they are completely built. We only WAL-log the completely built
|
-- after they are completely built. We only WAL-log the completely built
|
||||||
-- snapshot files
|
-- snapshot files
|
||||||
(SELECT COALESCE(sum((pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0)
|
(SELECT COALESCE(pg_catalog.sum((pg_catalog.pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0)
|
||||||
FROM (SELECT * FROM pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name
|
FROM (SELECT * FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name
|
||||||
) AS logical_snapshots_bytes;
|
) AS logical_snapshots_bytes;
|
||||||
|
|||||||
@@ -1 +1 @@
|
|||||||
SELECT current_setting('max_connections') as max_connections;
|
SELECT pg_catalog.current_setting('max_connections') AS max_connections;
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
SELECT datname database_name,
|
SELECT datname database_name,
|
||||||
age(datfrozenxid) frozen_xid_age
|
pg_catalog.age(datfrozenxid) frozen_xid_age
|
||||||
FROM pg_database
|
FROM pg_catalog.pg_database
|
||||||
ORDER BY frozen_xid_age DESC LIMIT 10;
|
ORDER BY frozen_xid_age DESC LIMIT 10;
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
SELECT datname database_name,
|
SELECT datname database_name,
|
||||||
mxid_age(datminmxid) min_mxid_age
|
pg_catalog.mxid_age(datminmxid) min_mxid_age
|
||||||
FROM pg_database
|
FROM pg_catalog.pg_database
|
||||||
ORDER BY min_mxid_age DESC LIMIT 10;
|
ORDER BY min_mxid_age DESC LIMIT 10;
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
SELECT CASE
|
SELECT CASE
|
||||||
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
|
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_receive_lsn() - '0/0')::pg_catalog.FLOAT8
|
||||||
ELSE 0
|
ELSE 0
|
||||||
END AS lsn;
|
END AS lsn;
|
||||||
|
|||||||
@@ -1 +1 @@
|
|||||||
SELECT subenabled::text AS enabled, count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled;
|
SELECT subenabled::pg_catalog.text AS enabled, pg_catalog.count(*) AS subscriptions_count FROM pg_catalog.pg_subscription GROUP BY subenabled;
|
||||||
|
|||||||
@@ -1 +1 @@
|
|||||||
SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state;
|
SELECT datname, state, pg_catalog.count(*) AS count FROM pg_catalog.pg_stat_activity WHERE state <> '' GROUP BY datname, state;
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
SELECT sum(pg_database_size(datname)) AS total
|
SELECT pg_catalog.sum(pg_catalog.pg_database_size(datname)) AS total
|
||||||
FROM pg_database
|
FROM pg_catalog.pg_database
|
||||||
-- Ignore invalid databases, as we will likely have problems with
|
-- Ignore invalid databases, as we will likely have problems with
|
||||||
-- getting their size from the Pageserver.
|
-- getting their size from the Pageserver.
|
||||||
WHERE datconnlimit != -2;
|
WHERE datconnlimit != -2;
|
||||||
|
|||||||
@@ -3,6 +3,6 @@
|
|||||||
-- minutes.
|
-- minutes.
|
||||||
|
|
||||||
SELECT
|
SELECT
|
||||||
x::text as duration_seconds,
|
x::pg_catalog.text AS duration_seconds,
|
||||||
neon.approximate_working_set_size_seconds(x) AS size
|
neon.approximate_working_set_size_seconds(x) AS size
|
||||||
FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x);
|
FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x);
|
||||||
|
|||||||
@@ -3,6 +3,6 @@
|
|||||||
|
|
||||||
SELECT
|
SELECT
|
||||||
x AS duration,
|
x AS duration,
|
||||||
neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size FROM (
|
neon.approximate_working_set_size_seconds(extract('epoch' FROM x::pg_catalog.interval)::pg_catalog.int4) AS size FROM (
|
||||||
VALUES ('5m'), ('15m'), ('1h')
|
VALUES ('5m'), ('15m'), ('1h')
|
||||||
) AS t (x);
|
) AS t (x);
|
||||||
|
|||||||
@@ -1 +1 @@
|
|||||||
SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit;
|
SELECT pg_catalog.pg_size_bytes(pg_catalog.current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit;
|
||||||
|
|||||||
@@ -1,3 +1,3 @@
|
|||||||
SELECT slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
|
SELECT slot_name, (restart_lsn - '0/0')::pg_catalog.FLOAT8 AS restart_lsn
|
||||||
FROM pg_replication_slots
|
FROM pg_catalog.pg_replication_slots
|
||||||
WHERE slot_type = 'logical';
|
WHERE slot_type = 'logical';
|
||||||
|
|||||||
@@ -1 +1 @@
|
|||||||
SELECT setting::int AS max_cluster_size FROM pg_settings WHERE name = 'neon.max_cluster_size';
|
SELECT setting::pg_catalog.int4 AS max_cluster_size FROM pg_catalog.pg_settings WHERE name = 'neon.max_cluster_size';
|
||||||
|
|||||||
@@ -1,13 +1,13 @@
|
|||||||
-- We export stats for 10 non-system databases. Without this limit it is too
|
-- We export stats for 10 non-system databases. Without this limit it is too
|
||||||
-- easy to abuse the system by creating lots of databases.
|
-- easy to abuse the system by creating lots of databases.
|
||||||
|
|
||||||
SELECT pg_database_size(datname) AS db_size,
|
SELECT pg_catalog.pg_database_size(datname) AS db_size,
|
||||||
deadlocks,
|
deadlocks,
|
||||||
tup_inserted AS inserted,
|
tup_inserted AS inserted,
|
||||||
tup_updated AS updated,
|
tup_updated AS updated,
|
||||||
tup_deleted AS deleted,
|
tup_deleted AS deleted,
|
||||||
datname
|
datname
|
||||||
FROM pg_stat_database
|
FROM pg_catalog.pg_stat_database
|
||||||
WHERE datname IN (
|
WHERE datname IN (
|
||||||
SELECT datname FROM pg_database
|
SELECT datname FROM pg_database
|
||||||
-- Ignore invalid databases, as we will likely have problems with
|
-- Ignore invalid databases, as we will likely have problems with
|
||||||
|
|||||||
@@ -3,4 +3,4 @@
|
|||||||
-- replay LSN may have advanced past the receive LSN we are using for the
|
-- replay LSN may have advanced past the receive LSN we are using for the
|
||||||
-- calculation.
|
-- calculation.
|
||||||
|
|
||||||
SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;
|
SELECT GREATEST(0, pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_last_wal_receive_lsn(), pg_catalog.pg_last_wal_replay_lsn())) AS replication_delay_bytes;
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
SELECT
|
SELECT
|
||||||
CASE
|
CASE
|
||||||
WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
|
WHEN pg_catalog.pg_last_wal_receive_lsn() = pg_catalog.pg_last_wal_replay_lsn() THEN 0
|
||||||
ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
|
ELSE GREATEST(0, EXTRACT (EPOCH FROM pg_catalog.now() - pg_catalog.pg_last_xact_replay_timestamp()))
|
||||||
END AS replication_delay_seconds;
|
END AS replication_delay_seconds;
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
SELECT
|
SELECT
|
||||||
slot_name,
|
slot_name,
|
||||||
pg_wal_lsn_diff(
|
pg_catalog.pg_wal_lsn_diff(
|
||||||
CASE
|
CASE
|
||||||
WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn()
|
WHEN pg_catalog.pg_is_in_recovery() THEN pg_catalog.pg_last_wal_replay_lsn()
|
||||||
ELSE pg_current_wal_lsn()
|
ELSE pg_catalog.pg_current_wal_lsn()
|
||||||
END,
|
END,
|
||||||
restart_lsn)::FLOAT8 AS retained_wal
|
restart_lsn)::pg_catalog.FLOAT8 AS retained_wal
|
||||||
FROM pg_replication_slots
|
FROM pg_catalog.pg_replication_slots
|
||||||
WHERE active = false;
|
WHERE active = false;
|
||||||
|
|||||||
@@ -4,4 +4,4 @@ SELECT
|
|||||||
WHEN wal_status = 'lost' THEN 1
|
WHEN wal_status = 'lost' THEN 1
|
||||||
ELSE 0
|
ELSE 0
|
||||||
END AS wal_is_lost
|
END AS wal_is_lost
|
||||||
FROM pg_replication_slots;
|
FROM pg_catalog.pg_replication_slots;
|
||||||
|
|||||||
@@ -279,7 +279,7 @@ fn main() -> Result<()> {
|
|||||||
config,
|
config,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
let exit_code = compute_node.run()?;
|
let exit_code = compute_node.run().context("running compute node")?;
|
||||||
|
|
||||||
scenario.teardown();
|
scenario.teardown();
|
||||||
|
|
||||||
|
|||||||
@@ -24,9 +24,9 @@ pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
|
|||||||
});
|
});
|
||||||
|
|
||||||
let query = "
|
let query = "
|
||||||
INSERT INTO health_check VALUES (1, now())
|
INSERT INTO public.health_check VALUES (1, pg_catalog.now())
|
||||||
ON CONFLICT (id) DO UPDATE
|
ON CONFLICT (id) DO UPDATE
|
||||||
SET updated_at = now();";
|
SET updated_at = pg_catalog.now();";
|
||||||
|
|
||||||
match client.simple_query(query).await {
|
match client.simple_query(query).await {
|
||||||
Result::Ok(result) => {
|
Result::Ok(result) => {
|
||||||
|
|||||||
@@ -32,8 +32,12 @@ use std::sync::{Arc, Condvar, Mutex, RwLock};
|
|||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
use std::{env, fs};
|
use std::{env, fs};
|
||||||
use tokio::{spawn, sync::watch, task::JoinHandle, time};
|
use tokio::{spawn, sync::watch, task::JoinHandle, time};
|
||||||
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::{Instrument, debug, error, info, instrument, warn};
|
use tracing::{Instrument, debug, error, info, instrument, warn};
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
use utils::backoff::{
|
||||||
|
DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff_duration,
|
||||||
|
};
|
||||||
use utils::id::{TenantId, TimelineId};
|
use utils::id::{TenantId, TimelineId};
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
use utils::measured_stream::MeasuredReader;
|
use utils::measured_stream::MeasuredReader;
|
||||||
@@ -192,6 +196,7 @@ pub struct ComputeState {
|
|||||||
pub startup_span: Option<tracing::span::Span>,
|
pub startup_span: Option<tracing::span::Span>,
|
||||||
|
|
||||||
pub lfc_prewarm_state: LfcPrewarmState,
|
pub lfc_prewarm_state: LfcPrewarmState,
|
||||||
|
pub lfc_prewarm_token: CancellationToken,
|
||||||
pub lfc_offload_state: LfcOffloadState,
|
pub lfc_offload_state: LfcOffloadState,
|
||||||
|
|
||||||
/// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if
|
/// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if
|
||||||
@@ -217,6 +222,7 @@ impl ComputeState {
|
|||||||
lfc_offload_state: LfcOffloadState::default(),
|
lfc_offload_state: LfcOffloadState::default(),
|
||||||
terminate_flush_lsn: None,
|
terminate_flush_lsn: None,
|
||||||
promote_state: None,
|
promote_state: None,
|
||||||
|
lfc_prewarm_token: CancellationToken::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -583,7 +589,7 @@ impl ComputeNode {
|
|||||||
// that can affect `compute_ctl` and prevent it from properly configuring the database schema.
|
// that can affect `compute_ctl` and prevent it from properly configuring the database schema.
|
||||||
// Unset them via connection string options before connecting to the database.
|
// Unset them via connection string options before connecting to the database.
|
||||||
// N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`.
|
// N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`.
|
||||||
const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0 -c pgaudit.log=none";
|
const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0 -c pgaudit.log=none";
|
||||||
let options = match conn_conf.get_options() {
|
let options = match conn_conf.get_options() {
|
||||||
// Allow the control plane to override any options set by the
|
// Allow the control plane to override any options set by the
|
||||||
// compute
|
// compute
|
||||||
@@ -1554,6 +1560,41 @@ impl ComputeNode {
|
|||||||
Ok(lsn)
|
Ok(lsn)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn sync_safekeepers_with_retries(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
|
||||||
|
let max_retries = 5;
|
||||||
|
let mut attempts = 0;
|
||||||
|
loop {
|
||||||
|
let result = self.sync_safekeepers(storage_auth_token.clone());
|
||||||
|
match &result {
|
||||||
|
Ok(_) => {
|
||||||
|
if attempts > 0 {
|
||||||
|
tracing::info!("sync_safekeepers succeeded after {attempts} retries");
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
Err(e) if attempts < max_retries => {
|
||||||
|
tracing::info!(
|
||||||
|
"sync_safekeepers failed, will retry (attempt {attempts}): {e:#}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
tracing::warn!(
|
||||||
|
"sync_safekeepers still failed after {attempts} retries, giving up: {err:?}"
|
||||||
|
);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// sleep and retry
|
||||||
|
let backoff = exponential_backoff_duration(
|
||||||
|
attempts,
|
||||||
|
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||||
|
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||||
|
);
|
||||||
|
std::thread::sleep(backoff);
|
||||||
|
attempts += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Do all the preparations like PGDATA directory creation, configuration,
|
/// Do all the preparations like PGDATA directory creation, configuration,
|
||||||
/// safekeepers sync, basebackup, etc.
|
/// safekeepers sync, basebackup, etc.
|
||||||
#[instrument(skip_all)]
|
#[instrument(skip_all)]
|
||||||
@@ -1589,7 +1630,7 @@ impl ComputeNode {
|
|||||||
lsn
|
lsn
|
||||||
} else {
|
} else {
|
||||||
info!("starting safekeepers syncing");
|
info!("starting safekeepers syncing");
|
||||||
self.sync_safekeepers(pspec.storage_auth_token.clone())
|
self.sync_safekeepers_with_retries(pspec.storage_auth_token.clone())
|
||||||
.with_context(|| "failed to sync safekeepers")?
|
.with_context(|| "failed to sync safekeepers")?
|
||||||
};
|
};
|
||||||
info!("safekeepers synced at LSN {}", lsn);
|
info!("safekeepers synced at LSN {}", lsn);
|
||||||
@@ -1884,7 +1925,7 @@ impl ComputeNode {
|
|||||||
|
|
||||||
// It doesn't matter what were the options before, here we just want
|
// It doesn't matter what were the options before, here we just want
|
||||||
// to connect and create a new superuser role.
|
// to connect and create a new superuser role.
|
||||||
const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0";
|
const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0";
|
||||||
zenith_admin_conf.options(ZENITH_OPTIONS);
|
zenith_admin_conf.options(ZENITH_OPTIONS);
|
||||||
|
|
||||||
let mut client =
|
let mut client =
|
||||||
@@ -2339,13 +2380,13 @@ impl ComputeNode {
|
|||||||
let result = client
|
let result = client
|
||||||
.simple_query(
|
.simple_query(
|
||||||
"SELECT
|
"SELECT
|
||||||
row_to_json(pg_stat_statements)
|
pg_catalog.row_to_json(pss)
|
||||||
FROM
|
FROM
|
||||||
pg_stat_statements
|
public.pg_stat_statements pss
|
||||||
WHERE
|
WHERE
|
||||||
userid != 'cloud_admin'::regrole::oid
|
pss.userid != 'cloud_admin'::pg_catalog.regrole::pg_catalog.oid
|
||||||
ORDER BY
|
ORDER BY
|
||||||
(mean_exec_time + mean_plan_time) DESC
|
(pss.mean_exec_time + pss.mean_plan_time) DESC
|
||||||
LIMIT 100",
|
LIMIT 100",
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
@@ -2473,11 +2514,11 @@ LIMIT 100",
|
|||||||
|
|
||||||
// check the role grants first - to gracefully handle read-replicas.
|
// check the role grants first - to gracefully handle read-replicas.
|
||||||
let select = "SELECT privilege_type
|
let select = "SELECT privilege_type
|
||||||
FROM pg_namespace
|
FROM pg_catalog.pg_namespace
|
||||||
JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true
|
JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) AS acl ON true
|
||||||
JOIN pg_user users ON acl.grantee = users.usesysid
|
JOIN pg_catalog.pg_user users ON acl.grantee = users.usesysid
|
||||||
WHERE users.usename = $1
|
WHERE users.usename OPERATOR(pg_catalog.=) $1::pg_catalog.name
|
||||||
AND nspname = $2";
|
AND nspname OPERATOR(pg_catalog.=) $2::pg_catalog.name";
|
||||||
let rows = db_client
|
let rows = db_client
|
||||||
.query(select, &[role_name, schema_name])
|
.query(select, &[role_name, schema_name])
|
||||||
.await
|
.await
|
||||||
@@ -2546,8 +2587,9 @@ LIMIT 100",
|
|||||||
.await
|
.await
|
||||||
.with_context(|| format!("Failed to execute query: {query}"))?;
|
.with_context(|| format!("Failed to execute query: {query}"))?;
|
||||||
} else {
|
} else {
|
||||||
let query =
|
let query = format!(
|
||||||
format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}");
|
"CREATE EXTENSION IF NOT EXISTS {ext_name} WITH SCHEMA public VERSION {quoted_version}"
|
||||||
|
);
|
||||||
db_client
|
db_client
|
||||||
.simple_query(&query)
|
.simple_query(&query)
|
||||||
.await
|
.await
|
||||||
|
|||||||
@@ -7,7 +7,8 @@ use http::StatusCode;
|
|||||||
use reqwest::Client;
|
use reqwest::Client;
|
||||||
use std::mem::replace;
|
use std::mem::replace;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use tokio::{io::AsyncReadExt, spawn};
|
use tokio::{io::AsyncReadExt, select, spawn};
|
||||||
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::{error, info};
|
use tracing::{error, info};
|
||||||
|
|
||||||
#[derive(serde::Serialize, Default)]
|
#[derive(serde::Serialize, Default)]
|
||||||
@@ -92,34 +93,35 @@ impl ComputeNode {
|
|||||||
/// If there is a prewarm request ongoing, return `false`, `true` otherwise.
|
/// If there is a prewarm request ongoing, return `false`, `true` otherwise.
|
||||||
/// Has a failpoint "compute-prewarm"
|
/// Has a failpoint "compute-prewarm"
|
||||||
pub fn prewarm_lfc(self: &Arc<Self>, from_endpoint: Option<String>) -> bool {
|
pub fn prewarm_lfc(self: &Arc<Self>, from_endpoint: Option<String>) -> bool {
|
||||||
|
let token: CancellationToken;
|
||||||
{
|
{
|
||||||
let state = &mut self.state.lock().unwrap().lfc_prewarm_state;
|
let state = &mut self.state.lock().unwrap();
|
||||||
if let LfcPrewarmState::Prewarming = replace(state, LfcPrewarmState::Prewarming) {
|
token = state.lfc_prewarm_token.clone();
|
||||||
|
if let LfcPrewarmState::Prewarming =
|
||||||
|
replace(&mut state.lfc_prewarm_state, LfcPrewarmState::Prewarming)
|
||||||
|
{
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
crate::metrics::LFC_PREWARMS.inc();
|
crate::metrics::LFC_PREWARMS.inc();
|
||||||
|
|
||||||
let cloned = self.clone();
|
let this = self.clone();
|
||||||
spawn(async move {
|
spawn(async move {
|
||||||
let state = match cloned.prewarm_impl(from_endpoint).await {
|
let prewarm_state = match this.prewarm_impl(from_endpoint, token).await {
|
||||||
Ok(true) => LfcPrewarmState::Completed,
|
Ok(state) => state,
|
||||||
Ok(false) => {
|
|
||||||
info!(
|
|
||||||
"skipping LFC prewarm because LFC state is not found in endpoint storage"
|
|
||||||
);
|
|
||||||
LfcPrewarmState::Skipped
|
|
||||||
}
|
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
crate::metrics::LFC_PREWARM_ERRORS.inc();
|
crate::metrics::LFC_PREWARM_ERRORS.inc();
|
||||||
error!(%err, "could not prewarm LFC");
|
error!(%err, "could not prewarm LFC");
|
||||||
LfcPrewarmState::Failed {
|
let error = format!("{err:#}");
|
||||||
error: format!("{err:#}"),
|
LfcPrewarmState::Failed { error }
|
||||||
}
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
cloned.state.lock().unwrap().lfc_prewarm_state = state;
|
let state = &mut this.state.lock().unwrap();
|
||||||
|
if let LfcPrewarmState::Cancelled = prewarm_state {
|
||||||
|
state.lfc_prewarm_token = CancellationToken::new();
|
||||||
|
}
|
||||||
|
state.lfc_prewarm_state = prewarm_state;
|
||||||
});
|
});
|
||||||
true
|
true
|
||||||
}
|
}
|
||||||
@@ -132,47 +134,70 @@ impl ComputeNode {
|
|||||||
|
|
||||||
/// Request LFC state from endpoint storage and load corresponding pages into Postgres.
|
/// Request LFC state from endpoint storage and load corresponding pages into Postgres.
|
||||||
/// Returns a result with `false` if the LFC state is not found in endpoint storage.
|
/// Returns a result with `false` if the LFC state is not found in endpoint storage.
|
||||||
async fn prewarm_impl(&self, from_endpoint: Option<String>) -> Result<bool> {
|
async fn prewarm_impl(
|
||||||
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?;
|
&self,
|
||||||
|
from_endpoint: Option<String>,
|
||||||
|
token: CancellationToken,
|
||||||
|
) -> Result<LfcPrewarmState> {
|
||||||
|
let EndpointStoragePair {
|
||||||
|
url,
|
||||||
|
token: storage_token,
|
||||||
|
} = self.endpoint_storage_pair(from_endpoint)?;
|
||||||
|
|
||||||
#[cfg(feature = "testing")]
|
#[cfg(feature = "testing")]
|
||||||
fail::fail_point!("compute-prewarm", |_| {
|
fail::fail_point!("compute-prewarm", |_| bail!("compute-prewarm failpoint"));
|
||||||
bail!("prewarm configured to fail because of a failpoint")
|
|
||||||
});
|
|
||||||
|
|
||||||
info!(%url, "requesting LFC state from endpoint storage");
|
info!(%url, "requesting LFC state from endpoint storage");
|
||||||
let request = Client::new().get(&url).bearer_auth(token);
|
let request = Client::new().get(&url).bearer_auth(storage_token);
|
||||||
let res = request.send().await.context("querying endpoint storage")?;
|
let response = select! {
|
||||||
match res.status() {
|
_ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled),
|
||||||
|
response = request.send() => response
|
||||||
|
}
|
||||||
|
.context("querying endpoint storage")?;
|
||||||
|
|
||||||
|
match response.status() {
|
||||||
StatusCode::OK => (),
|
StatusCode::OK => (),
|
||||||
StatusCode::NOT_FOUND => {
|
StatusCode::NOT_FOUND => return Ok(LfcPrewarmState::Skipped),
|
||||||
return Ok(false);
|
|
||||||
}
|
|
||||||
status => bail!("{status} querying endpoint storage"),
|
status => bail!("{status} querying endpoint storage"),
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut uncompressed = Vec::new();
|
let mut uncompressed = Vec::new();
|
||||||
let lfc_state = res
|
let lfc_state = select! {
|
||||||
.bytes()
|
_ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled),
|
||||||
.await
|
lfc_state = response.bytes() => lfc_state
|
||||||
.context("getting request body from endpoint storage")?;
|
}
|
||||||
ZstdDecoder::new(lfc_state.iter().as_slice())
|
.context("getting request body from endpoint storage")?;
|
||||||
.read_to_end(&mut uncompressed)
|
|
||||||
.await
|
let mut decoder = ZstdDecoder::new(lfc_state.iter().as_slice());
|
||||||
.context("decoding LFC state")?;
|
select! {
|
||||||
|
_ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled),
|
||||||
|
read = decoder.read_to_end(&mut uncompressed) => read
|
||||||
|
}
|
||||||
|
.context("decoding LFC state")?;
|
||||||
|
|
||||||
let uncompressed_len = uncompressed.len();
|
let uncompressed_len = uncompressed.len();
|
||||||
|
info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}");
|
||||||
|
|
||||||
info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into Postgres");
|
// Client connection and prewarm info querying are fast and therefore don't need
|
||||||
|
// cancellation
|
||||||
ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
|
let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
|
||||||
.await
|
.await
|
||||||
.context("connecting to postgres")?
|
.context("connecting to postgres")?;
|
||||||
.query_one("select neon.prewarm_local_cache($1)", &[&uncompressed])
|
let pg_token = client.cancel_token();
|
||||||
.await
|
|
||||||
.context("loading LFC state into postgres")
|
|
||||||
.map(|_| ())?;
|
|
||||||
|
|
||||||
Ok(true)
|
let params: Vec<&(dyn postgres_types::ToSql + Sync)> = vec![&uncompressed];
|
||||||
|
select! {
|
||||||
|
res = client.query_one("select neon.prewarm_local_cache($1)", ¶ms) => res,
|
||||||
|
_ = token.cancelled() => {
|
||||||
|
pg_token.cancel_query(postgres::NoTls).await
|
||||||
|
.context("cancelling neon.prewarm_local_cache()")?;
|
||||||
|
return Ok(LfcPrewarmState::Cancelled)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
.context("loading LFC state into postgres")
|
||||||
|
.map(|_| ())?;
|
||||||
|
|
||||||
|
Ok(LfcPrewarmState::Completed)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// If offload request is ongoing, return false, true otherwise
|
/// If offload request is ongoing, return false, true otherwise
|
||||||
@@ -200,20 +225,20 @@ impl ComputeNode {
|
|||||||
|
|
||||||
async fn offload_lfc_with_state_update(&self) {
|
async fn offload_lfc_with_state_update(&self) {
|
||||||
crate::metrics::LFC_OFFLOADS.inc();
|
crate::metrics::LFC_OFFLOADS.inc();
|
||||||
|
let state = match self.offload_lfc_impl().await {
|
||||||
let Err(err) = self.offload_lfc_impl().await else {
|
Ok(state) => state,
|
||||||
self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed;
|
Err(err) => {
|
||||||
return;
|
crate::metrics::LFC_OFFLOAD_ERRORS.inc();
|
||||||
|
error!(%err, "could not offload LFC");
|
||||||
|
let error = format!("{err:#}");
|
||||||
|
LfcOffloadState::Failed { error }
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
crate::metrics::LFC_OFFLOAD_ERRORS.inc();
|
self.state.lock().unwrap().lfc_offload_state = state;
|
||||||
error!(%err, "could not offload LFC state to endpoint storage");
|
|
||||||
self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed {
|
|
||||||
error: format!("{err:#}"),
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn offload_lfc_impl(&self) -> Result<()> {
|
async fn offload_lfc_impl(&self) -> Result<LfcOffloadState> {
|
||||||
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?;
|
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?;
|
||||||
info!(%url, "requesting LFC state from Postgres");
|
info!(%url, "requesting LFC state from Postgres");
|
||||||
|
|
||||||
@@ -228,7 +253,7 @@ impl ComputeNode {
|
|||||||
.context("deserializing LFC state")?;
|
.context("deserializing LFC state")?;
|
||||||
let Some(state) = state else {
|
let Some(state) = state else {
|
||||||
info!(%url, "empty LFC state, not exporting");
|
info!(%url, "empty LFC state, not exporting");
|
||||||
return Ok(());
|
return Ok(LfcOffloadState::Skipped);
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut compressed = Vec::new();
|
let mut compressed = Vec::new();
|
||||||
@@ -242,7 +267,7 @@ impl ComputeNode {
|
|||||||
|
|
||||||
let request = Client::new().put(url).bearer_auth(token).body(compressed);
|
let request = Client::new().put(url).bearer_auth(token).body(compressed);
|
||||||
match request.send().await {
|
match request.send().await {
|
||||||
Ok(res) if res.status() == StatusCode::OK => Ok(()),
|
Ok(res) if res.status() == StatusCode::OK => Ok(LfcOffloadState::Completed),
|
||||||
Ok(res) => bail!(
|
Ok(res) => bail!(
|
||||||
"Request to endpoint storage failed with status: {}",
|
"Request to endpoint storage failed with status: {}",
|
||||||
res.status()
|
res.status()
|
||||||
@@ -250,4 +275,8 @@ impl ComputeNode {
|
|||||||
Err(err) => Err(err).context("writing to endpoint storage"),
|
Err(err) => Err(err).context("writing to endpoint storage"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn cancel_prewarm(self: &Arc<Self>) {
|
||||||
|
self.state.lock().unwrap().lfc_prewarm_token.cancel();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -78,7 +78,7 @@ impl ComputeNode {
|
|||||||
const RETRIES: i32 = 20;
|
const RETRIES: i32 = 20;
|
||||||
for i in 0..=RETRIES {
|
for i in 0..=RETRIES {
|
||||||
let row = client
|
let row = client
|
||||||
.query_one("SELECT pg_last_wal_replay_lsn()", &[])
|
.query_one("SELECT pg_catalog.pg_last_wal_replay_lsn()", &[])
|
||||||
.await
|
.await
|
||||||
.context("getting last replay lsn")?;
|
.context("getting last replay lsn")?;
|
||||||
let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
|
let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
|
||||||
@@ -103,7 +103,7 @@ impl ComputeNode {
|
|||||||
.await
|
.await
|
||||||
.context("setting safekeepers")?;
|
.context("setting safekeepers")?;
|
||||||
client
|
client
|
||||||
.query("SELECT pg_reload_conf()", &[])
|
.query("SELECT pg_catalog.pg_reload_conf()", &[])
|
||||||
.await
|
.await
|
||||||
.context("reloading postgres config")?;
|
.context("reloading postgres config")?;
|
||||||
|
|
||||||
@@ -113,7 +113,7 @@ impl ComputeNode {
|
|||||||
});
|
});
|
||||||
|
|
||||||
let row = client
|
let row = client
|
||||||
.query_one("SELECT * FROM pg_promote()", &[])
|
.query_one("SELECT * FROM pg_catalog.pg_promote()", &[])
|
||||||
.await
|
.await
|
||||||
.context("pg_promote")?;
|
.context("pg_promote")?;
|
||||||
if !row.get::<usize, bool>(0) {
|
if !row.get::<usize, bool>(0) {
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ use std::path::Path;
|
|||||||
|
|
||||||
use compute_api::responses::TlsConfig;
|
use compute_api::responses::TlsConfig;
|
||||||
use compute_api::spec::{
|
use compute_api::spec::{
|
||||||
ComputeAudit, ComputeMode, ComputeSpec, DatabricksSettings, GenericOption,
|
ComputeAudit, ComputeMode, ComputeSpec, DatabricksSettings, GenericOption, PageserverProtocol,
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::compute::ComputeNodeParams;
|
use crate::compute::ComputeNodeParams;
|
||||||
@@ -65,15 +65,30 @@ pub fn write_postgres_conf(
|
|||||||
writeln!(file, "{conf}")?;
|
writeln!(file, "{conf}")?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Stripe size GUC should be defined prior to connection string
|
|
||||||
if let Some(stripe_size) = spec.shard_stripe_size {
|
|
||||||
writeln!(file, "neon.stripe_size={stripe_size}")?;
|
|
||||||
}
|
|
||||||
// Add options for connecting to storage
|
// Add options for connecting to storage
|
||||||
writeln!(file, "# Neon storage settings")?;
|
writeln!(file, "# Neon storage settings")?;
|
||||||
writeln!(file)?;
|
writeln!(file)?;
|
||||||
if let Some(conninfo) = &spec.pageserver_connection_info {
|
if let Some(conninfo) = &spec.pageserver_connection_info {
|
||||||
|
match conninfo.prefer_protocol {
|
||||||
|
PageserverProtocol::Libpq => {
|
||||||
|
writeln!(file, "neon.use_communicator_worker=false")?;
|
||||||
|
}
|
||||||
|
PageserverProtocol::Grpc => {
|
||||||
|
writeln!(file, "neon.use_communicator_worker=true")?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stripe size GUC should be defined prior to connection string
|
||||||
|
if let Some(stripe_size) = conninfo.stripe_size {
|
||||||
|
writeln!(
|
||||||
|
file,
|
||||||
|
"# from compute spec's pageserver_connection_info.stripe_size field"
|
||||||
|
)?;
|
||||||
|
writeln!(file, "neon.stripe_size={stripe_size}")?;
|
||||||
|
}
|
||||||
|
|
||||||
let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
|
let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
|
||||||
|
let mut grpc_urls: Option<Vec<String>> = Some(Vec::new());
|
||||||
let num_shards = if conninfo.shard_count.0 == 0 {
|
let num_shards = if conninfo.shard_count.0 == 0 {
|
||||||
1 // unsharded, treat it as a single shard
|
1 // unsharded, treat it as a single shard
|
||||||
} else {
|
} else {
|
||||||
@@ -106,11 +121,19 @@ pub fn write_postgres_conf(
|
|||||||
} else {
|
} else {
|
||||||
libpq_urls = None
|
libpq_urls = None
|
||||||
}
|
}
|
||||||
|
// Similarly for gRPC URLs
|
||||||
|
if let Some(url) = &first_pageserver.grpc_url {
|
||||||
|
if let Some(ref mut urls) = grpc_urls {
|
||||||
|
urls.push(url.clone());
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
grpc_urls = None
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if let Some(libpq_urls) = libpq_urls {
|
if let Some(libpq_urls) = libpq_urls {
|
||||||
writeln!(
|
writeln!(
|
||||||
file,
|
file,
|
||||||
"# derived from compute spec's pageserver_conninfo field"
|
"# derived from compute spec's pageserver_connection_info field"
|
||||||
)?;
|
)?;
|
||||||
writeln!(
|
writeln!(
|
||||||
file,
|
file,
|
||||||
@@ -120,24 +143,31 @@ pub fn write_postgres_conf(
|
|||||||
} else {
|
} else {
|
||||||
writeln!(file, "# no neon.pageserver_connstring")?;
|
writeln!(file, "# no neon.pageserver_connstring")?;
|
||||||
}
|
}
|
||||||
|
if let Some(grpc_urls) = grpc_urls {
|
||||||
if let Some(stripe_size) = conninfo.stripe_size {
|
|
||||||
writeln!(
|
writeln!(
|
||||||
file,
|
file,
|
||||||
"# from compute spec's pageserver_conninfo.stripe_size field"
|
"# derived from compute spec's pageserver_conninfo field"
|
||||||
)?;
|
)?;
|
||||||
writeln!(file, "neon.stripe_size={stripe_size}")?;
|
writeln!(
|
||||||
|
file,
|
||||||
|
"neon.pageserver_grpc_urls={}",
|
||||||
|
escape_conf_value(&grpc_urls.join(","))
|
||||||
|
)?;
|
||||||
|
} else {
|
||||||
|
writeln!(file, "# no neon.pageserver_grpc_urls")?;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if let Some(s) = &spec.pageserver_connstring {
|
writeln!(file, "neon.use_communicator_worker=false")?;
|
||||||
writeln!(file, "# from compute spec's pageserver_connstring field")?;
|
|
||||||
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
// Stripe size GUC should be defined prior to connection string
|
||||||
if let Some(stripe_size) = spec.shard_stripe_size {
|
if let Some(stripe_size) = spec.shard_stripe_size {
|
||||||
writeln!(file, "# from compute spec's shard_stripe_size field")?;
|
writeln!(file, "# from compute spec's shard_stripe_size field")?;
|
||||||
writeln!(file, "neon.stripe_size={stripe_size}")?;
|
writeln!(file, "neon.stripe_size={stripe_size}")?;
|
||||||
}
|
}
|
||||||
|
if let Some(s) = &spec.pageserver_connstring {
|
||||||
|
writeln!(file, "# from compute spec's pageserver_connstring field")?;
|
||||||
|
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if !spec.safekeeper_connstrings.is_empty() {
|
if !spec.safekeeper_connstrings.is_empty() {
|
||||||
|
|||||||
@@ -139,6 +139,15 @@ paths:
|
|||||||
application/json:
|
application/json:
|
||||||
schema:
|
schema:
|
||||||
$ref: "#/components/schemas/LfcPrewarmState"
|
$ref: "#/components/schemas/LfcPrewarmState"
|
||||||
|
delete:
|
||||||
|
tags:
|
||||||
|
- Prewarm
|
||||||
|
summary: Cancel ongoing LFC prewarm
|
||||||
|
description: ""
|
||||||
|
operationId: cancelLfcPrewarm
|
||||||
|
responses:
|
||||||
|
202:
|
||||||
|
description: Prewarm cancelled
|
||||||
|
|
||||||
/lfc/offload:
|
/lfc/offload:
|
||||||
post:
|
post:
|
||||||
@@ -636,7 +645,7 @@ components:
|
|||||||
properties:
|
properties:
|
||||||
status:
|
status:
|
||||||
description: LFC offload status
|
description: LFC offload status
|
||||||
enum: [not_offloaded, offloading, completed, failed]
|
enum: [not_offloaded, offloading, completed, skipped, failed]
|
||||||
type: string
|
type: string
|
||||||
error:
|
error:
|
||||||
description: LFC offload error, if any
|
description: LFC offload error, if any
|
||||||
|
|||||||
@@ -46,3 +46,8 @@ pub(in crate::http) async fn offload(compute: Compute) -> Response {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(in crate::http) async fn cancel_prewarm(compute: Compute) -> StatusCode {
|
||||||
|
compute.cancel_prewarm();
|
||||||
|
StatusCode::ACCEPTED
|
||||||
|
}
|
||||||
|
|||||||
@@ -99,7 +99,12 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
|
|||||||
);
|
);
|
||||||
|
|
||||||
let authenticated_router = Router::<Arc<ComputeNode>>::new()
|
let authenticated_router = Router::<Arc<ComputeNode>>::new()
|
||||||
.route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm))
|
.route(
|
||||||
|
"/lfc/prewarm",
|
||||||
|
get(lfc::prewarm_state)
|
||||||
|
.post(lfc::prewarm)
|
||||||
|
.delete(lfc::cancel_prewarm),
|
||||||
|
)
|
||||||
.route("/lfc/offload", get(lfc::offload_state).post(lfc::offload))
|
.route("/lfc/offload", get(lfc::offload_state).post(lfc::offload))
|
||||||
.route("/promote", post(promote::promote))
|
.route("/promote", post(promote::promote))
|
||||||
.route("/check_writability", post(check_writability::is_writable))
|
.route("/check_writability", post(check_writability::is_writable))
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>, PostgresError> {
|
|||||||
.query(
|
.query(
|
||||||
"SELECT datname FROM pg_catalog.pg_database
|
"SELECT datname FROM pg_catalog.pg_database
|
||||||
WHERE datallowconn
|
WHERE datallowconn
|
||||||
AND datconnlimit <> - 2
|
AND datconnlimit OPERATOR(pg_catalog.<>) (OPERATOR(pg_catalog.-) 2::pg_catalog.int4)
|
||||||
LIMIT 500",
|
LIMIT 500",
|
||||||
&[],
|
&[],
|
||||||
)
|
)
|
||||||
@@ -67,7 +67,7 @@ pub async fn get_installed_extensions(
|
|||||||
|
|
||||||
let extensions: Vec<(String, String, i32)> = client
|
let extensions: Vec<(String, String, i32)> = client
|
||||||
.query(
|
.query(
|
||||||
"SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension",
|
"SELECT extname, extversion, extowner::pg_catalog.int4 FROM pg_catalog.pg_extension",
|
||||||
&[],
|
&[],
|
||||||
)
|
)
|
||||||
.await?
|
.await?
|
||||||
|
|||||||
@@ -28,7 +28,10 @@ pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc<ComputeNode>) {
|
|||||||
let compute = compute.clone();
|
let compute = compute.clone();
|
||||||
|
|
||||||
let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn);
|
let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn);
|
||||||
|
|
||||||
|
let runtime = tokio::runtime::Handle::current();
|
||||||
thread::spawn(move || {
|
thread::spawn(move || {
|
||||||
|
let _rt_guard = runtime.enter();
|
||||||
let _entered = span.entered();
|
let _entered = span.entered();
|
||||||
if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) {
|
if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) {
|
||||||
// TODO: might need stronger error feedback than logging an warning.
|
// TODO: might need stronger error feedback than logging an warning.
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ impl<'m> MigrationRunner<'m> {
|
|||||||
self.client
|
self.client
|
||||||
.simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration")
|
.simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration")
|
||||||
.await?;
|
.await?;
|
||||||
self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)").await?;
|
self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key pg_catalog.int4 NOT NULL PRIMARY KEY, id pg_catalog.int8 NOT NULL DEFAULT 0)").await?;
|
||||||
self.client
|
self.client
|
||||||
.simple_query(
|
.simple_query(
|
||||||
"INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING",
|
"INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING",
|
||||||
|
|||||||
@@ -15,17 +15,17 @@ DO $$
|
|||||||
DECLARE
|
DECLARE
|
||||||
role_name text;
|
role_name text;
|
||||||
BEGIN
|
BEGIN
|
||||||
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, '{privileged_role_name}', 'member')
|
FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member')
|
||||||
LOOP
|
LOOP
|
||||||
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
|
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', pg_catalog.quote_ident(role_name);
|
||||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
|
EXECUTE pg_catalog.format('ALTER ROLE %I INHERIT;', role_name);
|
||||||
END LOOP;
|
END LOOP;
|
||||||
|
|
||||||
FOR role_name IN SELECT rolname FROM pg_roles
|
FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles
|
||||||
WHERE
|
WHERE
|
||||||
NOT pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT starts_with(rolname, 'pg_')
|
NOT pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT pg_catalog.starts_with(rolname, 'pg_')
|
||||||
LOOP
|
LOOP
|
||||||
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
|
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', pg_catalog.quote_ident(role_name);
|
||||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
|
EXECUTE pg_catalog.format('ALTER ROLE %I NOBYPASSRLS;', role_name);
|
||||||
END LOOP;
|
END LOOP;
|
||||||
END $$;
|
END $$;
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
DO $$
|
DO $$
|
||||||
BEGIN
|
BEGIN
|
||||||
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
|
IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name = 'server_version_num') THEN
|
||||||
EXECUTE 'GRANT pg_create_subscription TO {privileged_role_name}';
|
EXECUTE 'GRANT pg_create_subscription TO {privileged_role_name}';
|
||||||
END IF;
|
END IF;
|
||||||
END $$;
|
END $$;
|
||||||
|
|||||||
@@ -5,9 +5,9 @@ DO $$
|
|||||||
DECLARE
|
DECLARE
|
||||||
role_name TEXT;
|
role_name TEXT;
|
||||||
BEGIN
|
BEGIN
|
||||||
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
|
FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE rolreplication IS TRUE
|
||||||
LOOP
|
LOOP
|
||||||
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
|
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', pg_catalog.quote_ident(role_name);
|
||||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
|
EXECUTE pg_catalog.format('ALTER ROLE %I NOREPLICATION;', role_name);
|
||||||
END LOOP;
|
END LOOP;
|
||||||
END $$;
|
END $$;
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
DO $$
|
DO $$
|
||||||
BEGIN
|
BEGIN
|
||||||
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
|
IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name OPERATOR(pg_catalog.=) 'server_version_num'::pg_catalog.text) THEN
|
||||||
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO {privileged_role_name}';
|
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO {privileged_role_name}';
|
||||||
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO {privileged_role_name}';
|
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO {privileged_role_name}';
|
||||||
END IF;
|
END IF;
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ DO $$
|
|||||||
DECLARE
|
DECLARE
|
||||||
bypassrls boolean;
|
bypassrls boolean;
|
||||||
BEGIN
|
BEGIN
|
||||||
SELECT rolbypassrls INTO bypassrls FROM pg_roles WHERE rolname = 'neon_superuser';
|
SELECT rolbypassrls INTO bypassrls FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser';
|
||||||
IF NOT bypassrls THEN
|
IF NOT bypassrls THEN
|
||||||
RAISE EXCEPTION 'neon_superuser cannot bypass RLS';
|
RAISE EXCEPTION 'neon_superuser cannot bypass RLS';
|
||||||
END IF;
|
END IF;
|
||||||
|
|||||||
@@ -4,8 +4,8 @@ DECLARE
|
|||||||
BEGIN
|
BEGIN
|
||||||
FOR role IN
|
FOR role IN
|
||||||
SELECT rolname AS name, rolinherit AS inherit
|
SELECT rolname AS name, rolinherit AS inherit
|
||||||
FROM pg_roles
|
FROM pg_catalog.pg_roles
|
||||||
WHERE pg_has_role(rolname, 'neon_superuser', 'member')
|
WHERE pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member')
|
||||||
LOOP
|
LOOP
|
||||||
IF NOT role.inherit THEN
|
IF NOT role.inherit THEN
|
||||||
RAISE EXCEPTION '% cannot inherit', quote_ident(role.name);
|
RAISE EXCEPTION '% cannot inherit', quote_ident(role.name);
|
||||||
@@ -14,12 +14,12 @@ BEGIN
|
|||||||
|
|
||||||
FOR role IN
|
FOR role IN
|
||||||
SELECT rolname AS name, rolbypassrls AS bypassrls
|
SELECT rolname AS name, rolbypassrls AS bypassrls
|
||||||
FROM pg_roles
|
FROM pg_catalog.pg_roles
|
||||||
WHERE NOT pg_has_role(rolname, 'neon_superuser', 'member')
|
WHERE NOT pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member')
|
||||||
AND NOT starts_with(rolname, 'pg_')
|
AND NOT pg_catalog.starts_with(rolname, 'pg_')
|
||||||
LOOP
|
LOOP
|
||||||
IF role.bypassrls THEN
|
IF role.bypassrls THEN
|
||||||
RAISE EXCEPTION '% can bypass RLS', quote_ident(role.name);
|
RAISE EXCEPTION '% can bypass RLS', pg_catalog.quote_ident(role.name);
|
||||||
END IF;
|
END IF;
|
||||||
END LOOP;
|
END LOOP;
|
||||||
END $$;
|
END $$;
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
DO $$
|
DO $$
|
||||||
BEGIN
|
BEGIN
|
||||||
IF (SELECT current_setting('server_version_num')::numeric < 160000) THEN
|
IF (SELECT pg_catalog.current_setting('server_version_num')::pg_catalog.numeric < 160000) THEN
|
||||||
RETURN;
|
RETURN;
|
||||||
END IF;
|
END IF;
|
||||||
|
|
||||||
IF NOT (SELECT pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN
|
IF NOT (SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN
|
||||||
RAISE EXCEPTION 'neon_superuser cannot execute pg_create_subscription';
|
RAISE EXCEPTION 'neon_superuser cannot execute pg_create_subscription';
|
||||||
END IF;
|
END IF;
|
||||||
END $$;
|
END $$;
|
||||||
|
|||||||
@@ -2,12 +2,12 @@ DO $$
|
|||||||
DECLARE
|
DECLARE
|
||||||
monitor record;
|
monitor record;
|
||||||
BEGIN
|
BEGIN
|
||||||
SELECT pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member,
|
SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member,
|
||||||
admin_option AS admin
|
admin_option AS admin
|
||||||
INTO monitor
|
INTO monitor
|
||||||
FROM pg_auth_members
|
FROM pg_catalog.pg_auth_members
|
||||||
WHERE roleid = 'pg_monitor'::regrole
|
WHERE roleid = 'pg_monitor'::pg_catalog.regrole
|
||||||
AND member = 'neon_superuser'::regrole;
|
AND member = 'neon_superuser'::pg_catalog.regrole;
|
||||||
|
|
||||||
IF monitor IS NULL THEN
|
IF monitor IS NULL THEN
|
||||||
RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor';
|
RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor';
|
||||||
|
|||||||
@@ -2,11 +2,11 @@ DO $$
|
|||||||
DECLARE
|
DECLARE
|
||||||
can_execute boolean;
|
can_execute boolean;
|
||||||
BEGIN
|
BEGIN
|
||||||
SELECT bool_and(has_function_privilege('neon_superuser', oid, 'execute'))
|
SELECT pg_catalog.bool_and(pg_catalog.has_function_privilege('neon_superuser', oid, 'execute'))
|
||||||
INTO can_execute
|
INTO can_execute
|
||||||
FROM pg_proc
|
FROM pg_catalog.pg_proc
|
||||||
WHERE proname IN ('pg_export_snapshot', 'pg_log_standby_snapshot')
|
WHERE proname IN ('pg_export_snapshot', 'pg_log_standby_snapshot')
|
||||||
AND pronamespace = 'pg_catalog'::regnamespace;
|
AND pronamespace = 'pg_catalog'::pg_catalog.regnamespace;
|
||||||
IF NOT can_execute THEN
|
IF NOT can_execute THEN
|
||||||
RAISE EXCEPTION 'neon_superuser cannot execute both pg_export_snapshot and pg_log_standby_snapshot';
|
RAISE EXCEPTION 'neon_superuser cannot execute both pg_export_snapshot and pg_log_standby_snapshot';
|
||||||
END IF;
|
END IF;
|
||||||
|
|||||||
@@ -2,9 +2,9 @@ DO $$
|
|||||||
DECLARE
|
DECLARE
|
||||||
can_execute boolean;
|
can_execute boolean;
|
||||||
BEGIN
|
BEGIN
|
||||||
SELECT has_function_privilege('neon_superuser', oid, 'execute')
|
SELECT pg_catalog.has_function_privilege('neon_superuser', oid, 'execute')
|
||||||
INTO can_execute
|
INTO can_execute
|
||||||
FROM pg_proc
|
FROM pg_catalog.pg_proc
|
||||||
WHERE proname = 'pg_show_replication_origin_status'
|
WHERE proname = 'pg_show_replication_origin_status'
|
||||||
AND pronamespace = 'pg_catalog'::regnamespace;
|
AND pronamespace = 'pg_catalog'::regnamespace;
|
||||||
IF NOT can_execute THEN
|
IF NOT can_execute THEN
|
||||||
|
|||||||
@@ -2,10 +2,10 @@ DO $$
|
|||||||
DECLARE
|
DECLARE
|
||||||
signal_backend record;
|
signal_backend record;
|
||||||
BEGIN
|
BEGIN
|
||||||
SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
|
SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
|
||||||
admin_option AS admin
|
admin_option AS admin
|
||||||
INTO signal_backend
|
INTO signal_backend
|
||||||
FROM pg_auth_members
|
FROM pg_catalog.pg_auth_members
|
||||||
WHERE roleid = 'pg_signal_backend'::regrole
|
WHERE roleid = 'pg_signal_backend'::regrole
|
||||||
AND member = 'neon_superuser'::regrole;
|
AND member = 'neon_superuser'::regrole;
|
||||||
|
|
||||||
|
|||||||
@@ -407,9 +407,9 @@ fn get_database_stats(cli: &mut Client) -> anyhow::Result<(f64, i64)> {
|
|||||||
// like `postgres_exporter` use it to query Postgres statistics.
|
// like `postgres_exporter` use it to query Postgres statistics.
|
||||||
// Use explicit 8 bytes type casts to match Rust types.
|
// Use explicit 8 bytes type casts to match Rust types.
|
||||||
let stats = cli.query_one(
|
let stats = cli.query_one(
|
||||||
"SELECT coalesce(sum(active_time), 0.0)::float8 AS total_active_time,
|
"SELECT pg_catalog.coalesce(pg_catalog.sum(active_time), 0.0)::pg_catalog.float8 AS total_active_time,
|
||||||
coalesce(sum(sessions), 0)::bigint AS total_sessions
|
pg_catalog.coalesce(pg_catalog.sum(sessions), 0)::pg_catalog.bigint AS total_sessions
|
||||||
FROM pg_stat_database
|
FROM pg_catalog.pg_stat_database
|
||||||
WHERE datname NOT IN (
|
WHERE datname NOT IN (
|
||||||
'postgres',
|
'postgres',
|
||||||
'template0',
|
'template0',
|
||||||
@@ -445,11 +445,11 @@ fn get_backends_state_change(cli: &mut Client) -> anyhow::Result<Option<DateTime
|
|||||||
let mut last_active: Option<DateTime<Utc>> = None;
|
let mut last_active: Option<DateTime<Utc>> = None;
|
||||||
// Get all running client backends except ourself, use RFC3339 DateTime format.
|
// Get all running client backends except ourself, use RFC3339 DateTime format.
|
||||||
let backends = cli.query(
|
let backends = cli.query(
|
||||||
"SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change
|
"SELECT state, pg_catalog.to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"'::pg_catalog.text) AS state_change
|
||||||
FROM pg_stat_activity
|
FROM pg_stat_activity
|
||||||
WHERE backend_type = 'client backend'
|
WHERE backend_type OPERATOR(pg_catalog.=) 'client backend'::pg_catalog.text
|
||||||
AND pid != pg_backend_pid()
|
AND pid OPERATOR(pg_catalog.!=) pg_catalog.pg_backend_pid()
|
||||||
AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
|
AND usename OPERATOR(pg_catalog.!=) 'cloud_admin'::pg_catalog.name;", // XXX: find a better way to filter other monitors?
|
||||||
&[],
|
&[],
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|||||||
@@ -299,9 +299,9 @@ pub async fn get_existing_dbs_async(
|
|||||||
.query_raw::<str, &String, &[String; 0]>(
|
.query_raw::<str, &String, &[String; 0]>(
|
||||||
"SELECT
|
"SELECT
|
||||||
datname AS name,
|
datname AS name,
|
||||||
(SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner,
|
(SELECT rolname FROM pg_catalog.pg_roles WHERE oid OPERATOR(pg_catalog.=) datdba) AS owner,
|
||||||
NOT datallowconn AS restrict_conn,
|
NOT datallowconn AS restrict_conn,
|
||||||
datconnlimit = - 2 AS invalid
|
datconnlimit OPERATOR(pg_catalog.=) (OPERATOR(pg_catalog.-) 2) AS invalid
|
||||||
FROM
|
FROM
|
||||||
pg_catalog.pg_database;",
|
pg_catalog.pg_database;",
|
||||||
&[],
|
&[],
|
||||||
|
|||||||
@@ -82,7 +82,7 @@ impl ComputeNode {
|
|||||||
info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id);
|
info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id);
|
||||||
|
|
||||||
drop_subscriptions_done = match
|
drop_subscriptions_done = match
|
||||||
client.query("select 1 from neon.drop_subscriptions_done where timeline_id = $1", &[&timeline_id.to_string()]).await {
|
client.query("select 1 from neon.drop_subscriptions_done where timeline_id OPERATOR(pg_catalog.=) $1", &[&timeline_id.to_string()]).await {
|
||||||
Ok(result) => !result.is_empty(),
|
Ok(result) => !result.is_empty(),
|
||||||
Err(e) =>
|
Err(e) =>
|
||||||
{
|
{
|
||||||
@@ -1142,7 +1142,9 @@ async fn get_operations<'a>(
|
|||||||
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
|
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
|
||||||
if libs.contains("pg_stat_statements") {
|
if libs.contains("pg_stat_statements") {
|
||||||
return Ok(Box::new(once(Operation {
|
return Ok(Box::new(once(Operation {
|
||||||
query: String::from("CREATE EXTENSION IF NOT EXISTS pg_stat_statements"),
|
query: String::from(
|
||||||
|
"CREATE EXTENSION IF NOT EXISTS pg_stat_statements WITH SCHEMA public",
|
||||||
|
),
|
||||||
comment: Some(String::from("create system extensions")),
|
comment: Some(String::from("create system extensions")),
|
||||||
})));
|
})));
|
||||||
}
|
}
|
||||||
@@ -1150,11 +1152,13 @@ async fn get_operations<'a>(
|
|||||||
Ok(Box::new(empty()))
|
Ok(Box::new(empty()))
|
||||||
}
|
}
|
||||||
ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation {
|
ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation {
|
||||||
query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"),
|
query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit WITH SCHEMA public"),
|
||||||
comment: Some(String::from("create pgaudit extensions")),
|
comment: Some(String::from("create pgaudit extensions")),
|
||||||
}))),
|
}))),
|
||||||
ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation {
|
ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation {
|
||||||
query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"),
|
query: String::from(
|
||||||
|
"CREATE EXTENSION IF NOT EXISTS pgauditlogtofile WITH SCHEMA public",
|
||||||
|
),
|
||||||
comment: Some(String::from("create pgauditlogtofile extensions")),
|
comment: Some(String::from("create pgauditlogtofile extensions")),
|
||||||
}))),
|
}))),
|
||||||
// Disable pgaudit logging for postgres database.
|
// Disable pgaudit logging for postgres database.
|
||||||
@@ -1178,7 +1182,7 @@ async fn get_operations<'a>(
|
|||||||
},
|
},
|
||||||
Operation {
|
Operation {
|
||||||
query: String::from(
|
query: String::from(
|
||||||
"UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'",
|
"UPDATE pg_catalog.pg_extension SET extrelocatable = true WHERE extname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name AND extrelocatable OPERATOR(pg_catalog.=) false",
|
||||||
),
|
),
|
||||||
comment: Some(String::from("compat/fix: make neon relocatable")),
|
comment: Some(String::from("compat/fix: make neon relocatable")),
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -3,16 +3,17 @@ BEGIN
|
|||||||
IF NOT EXISTS(
|
IF NOT EXISTS(
|
||||||
SELECT 1
|
SELECT 1
|
||||||
FROM pg_catalog.pg_tables
|
FROM pg_catalog.pg_tables
|
||||||
WHERE tablename = 'health_check'
|
WHERE tablename::pg_catalog.name OPERATOR(pg_catalog.=) 'health_check'::pg_catalog.name
|
||||||
|
AND schemaname::pg_catalog.name OPERATOR(pg_catalog.=) 'public'::pg_catalog.name
|
||||||
)
|
)
|
||||||
THEN
|
THEN
|
||||||
CREATE TABLE health_check (
|
CREATE TABLE public.health_check (
|
||||||
id serial primary key,
|
id pg_catalog.int4 primary key generated by default as identity,
|
||||||
updated_at timestamptz default now()
|
updated_at pg_catalog.timestamptz default pg_catalog.now()
|
||||||
);
|
);
|
||||||
INSERT INTO health_check VALUES (1, now())
|
INSERT INTO public.health_check VALUES (1, pg_catalog.now())
|
||||||
ON CONFLICT (id) DO UPDATE
|
ON CONFLICT (id) DO UPDATE
|
||||||
SET updated_at = now();
|
SET updated_at = pg_catalog.now();
|
||||||
END IF;
|
END IF;
|
||||||
END
|
END
|
||||||
$$
|
$$
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
DO $$
|
|
||||||
DECLARE
|
|
||||||
query varchar;
|
|
||||||
BEGIN
|
|
||||||
FOR query IN SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {db_owner};'
|
|
||||||
FROM pg_proc p
|
|
||||||
JOIN pg_namespace nsp ON p.pronamespace = nsp.oid
|
|
||||||
WHERE nsp.nspname = 'anon' LOOP
|
|
||||||
EXECUTE query;
|
|
||||||
END LOOP;
|
|
||||||
END
|
|
||||||
$$;
|
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
DO $$
|
DO $$
|
||||||
BEGIN
|
BEGIN
|
||||||
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{privileged_role_name}')
|
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname OPERATOR(pg_catalog.=) '{privileged_role_name}'::pg_catalog.name)
|
||||||
THEN
|
THEN
|
||||||
CREATE ROLE {privileged_role_name} {privileges} IN ROLE pg_read_all_data, pg_write_all_data;
|
CREATE ROLE {privileged_role_name} {privileges} IN ROLE pg_read_all_data, pg_write_all_data;
|
||||||
END IF;
|
END IF;
|
||||||
|
|||||||
@@ -4,14 +4,14 @@ $$
|
|||||||
IF EXISTS(
|
IF EXISTS(
|
||||||
SELECT nspname
|
SELECT nspname
|
||||||
FROM pg_catalog.pg_namespace
|
FROM pg_catalog.pg_namespace
|
||||||
WHERE nspname = 'public'
|
WHERE nspname OPERATOR(pg_catalog.=) 'public'
|
||||||
) AND
|
) AND
|
||||||
current_setting('server_version_num')::int / 10000 >= 15
|
pg_catalog.current_setting('server_version_num')::int OPERATOR(pg_catalog./) 10000 OPERATOR(pg_catalog.>=) 15
|
||||||
THEN
|
THEN
|
||||||
IF EXISTS(
|
IF EXISTS(
|
||||||
SELECT rolname
|
SELECT rolname
|
||||||
FROM pg_catalog.pg_roles
|
FROM pg_catalog.pg_roles
|
||||||
WHERE rolname = 'web_access'
|
WHERE rolname OPERATOR(pg_catalog.=) 'web_access'
|
||||||
)
|
)
|
||||||
THEN
|
THEN
|
||||||
GRANT CREATE ON SCHEMA public TO web_access;
|
GRANT CREATE ON SCHEMA public TO web_access;
|
||||||
@@ -20,7 +20,7 @@ $$
|
|||||||
IF EXISTS(
|
IF EXISTS(
|
||||||
SELECT nspname
|
SELECT nspname
|
||||||
FROM pg_catalog.pg_namespace
|
FROM pg_catalog.pg_namespace
|
||||||
WHERE nspname = 'public'
|
WHERE nspname OPERATOR(pg_catalog.=) 'public'
|
||||||
)
|
)
|
||||||
THEN
|
THEN
|
||||||
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;
|
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;
|
||||||
|
|||||||
@@ -2,11 +2,17 @@ DO ${outer_tag}$
|
|||||||
DECLARE
|
DECLARE
|
||||||
subname TEXT;
|
subname TEXT;
|
||||||
BEGIN
|
BEGIN
|
||||||
LOCK TABLE pg_subscription IN ACCESS EXCLUSIVE MODE;
|
LOCK TABLE pg_catalog.pg_subscription IN ACCESS EXCLUSIVE MODE;
|
||||||
FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP
|
FOR subname IN
|
||||||
EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname);
|
SELECT pg_subscription.subname
|
||||||
EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname);
|
FROM pg_catalog.pg_subscription
|
||||||
EXECUTE format('DROP SUBSCRIPTION %I;', subname);
|
WHERE subdbid OPERATOR(pg_catalog.=) (
|
||||||
|
SELECT oid FROM pg_database WHERE datname OPERATOR(pg_catalog.=) {datname_str}::pg_catalog.name
|
||||||
|
)
|
||||||
|
LOOP
|
||||||
|
EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I DISABLE;', subname);
|
||||||
|
EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname);
|
||||||
|
EXECUTE pg_catalog.format('DROP SUBSCRIPTION %I;', subname);
|
||||||
END LOOP;
|
END LOOP;
|
||||||
END;
|
END;
|
||||||
${outer_tag}$;
|
${outer_tag}$;
|
||||||
|
|||||||
@@ -3,19 +3,19 @@ BEGIN
|
|||||||
IF NOT EXISTS(
|
IF NOT EXISTS(
|
||||||
SELECT 1
|
SELECT 1
|
||||||
FROM pg_catalog.pg_tables
|
FROM pg_catalog.pg_tables
|
||||||
WHERE tablename = 'drop_subscriptions_done'
|
WHERE tablename OPERATOR(pg_catalog.=) 'drop_subscriptions_done'::pg_catalog.name
|
||||||
AND schemaname = 'neon'
|
AND schemaname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name
|
||||||
)
|
)
|
||||||
THEN
|
THEN
|
||||||
CREATE TABLE neon.drop_subscriptions_done
|
CREATE TABLE neon.drop_subscriptions_done
|
||||||
(id serial primary key, timeline_id text);
|
(id pg_catalog.int4 primary key generated by default as identity, timeline_id pg_catalog.text);
|
||||||
END IF;
|
END IF;
|
||||||
|
|
||||||
-- preserve the timeline_id of the last drop_subscriptions run
|
-- preserve the timeline_id of the last drop_subscriptions run
|
||||||
-- to ensure that the cleanup of a timeline is executed only once.
|
-- to ensure that the cleanup of a timeline is executed only once.
|
||||||
-- use upsert to avoid the table bloat in case of cascade branching (branch of a branch)
|
-- use upsert to avoid the table bloat in case of cascade branching (branch of a branch)
|
||||||
INSERT INTO neon.drop_subscriptions_done VALUES (1, current_setting('neon.timeline_id'))
|
INSERT INTO neon.drop_subscriptions_done VALUES (1, pg_catalog.current_setting('neon.timeline_id'))
|
||||||
ON CONFLICT (id) DO UPDATE
|
ON CONFLICT (id) DO UPDATE
|
||||||
SET timeline_id = current_setting('neon.timeline_id');
|
SET timeline_id = pg_catalog.current_setting('neon.timeline_id')::pg_catalog.text;
|
||||||
END
|
END
|
||||||
$$
|
$$
|
||||||
|
|||||||
@@ -15,15 +15,15 @@ BEGIN
|
|||||||
WHERE schema_name IN ('public')
|
WHERE schema_name IN ('public')
|
||||||
LOOP
|
LOOP
|
||||||
FOR grantor IN EXECUTE
|
FOR grantor IN EXECUTE
|
||||||
format(
|
pg_catalog.format(
|
||||||
'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee = %s',
|
'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee OPERATOR(pg_catalog.=) %s',
|
||||||
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
|
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
|
||||||
quote_literal({role_name})
|
quote_literal({role_name})
|
||||||
)
|
)
|
||||||
LOOP
|
LOOP
|
||||||
EXECUTE format('SET LOCAL ROLE %I', grantor);
|
EXECUTE pg_catalog.format('SET LOCAL ROLE %I', grantor);
|
||||||
|
|
||||||
revoke_query := format(
|
revoke_query := pg_catalog.format(
|
||||||
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I',
|
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I',
|
||||||
schema,
|
schema,
|
||||||
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
|
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
|
||||||
|
|||||||
@@ -5,17 +5,17 @@ DO ${outer_tag}$
|
|||||||
IF EXISTS(
|
IF EXISTS(
|
||||||
SELECT nspname
|
SELECT nspname
|
||||||
FROM pg_catalog.pg_namespace
|
FROM pg_catalog.pg_namespace
|
||||||
WHERE nspname = 'public'
|
WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.name
|
||||||
)
|
)
|
||||||
THEN
|
THEN
|
||||||
SELECT nspowner::regrole::text
|
SELECT nspowner::regrole::text
|
||||||
FROM pg_catalog.pg_namespace
|
FROM pg_catalog.pg_namespace
|
||||||
WHERE nspname = 'public'
|
WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.text
|
||||||
INTO schema_owner;
|
INTO schema_owner;
|
||||||
|
|
||||||
IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'
|
IF schema_owner OPERATOR(pg_catalog.=) 'cloud_admin'::pg_catalog.text OR schema_owner OPERATOR(pg_catalog.=) 'zenith_admin'::pg_catalog.text
|
||||||
THEN
|
THEN
|
||||||
EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner});
|
EXECUTE pg_catalog.format('ALTER SCHEMA public OWNER TO %I', {db_owner});
|
||||||
END IF;
|
END IF;
|
||||||
END IF;
|
END IF;
|
||||||
END
|
END
|
||||||
|
|||||||
@@ -3,10 +3,10 @@ DO ${outer_tag}$
|
|||||||
IF EXISTS(
|
IF EXISTS(
|
||||||
SELECT 1
|
SELECT 1
|
||||||
FROM pg_catalog.pg_database
|
FROM pg_catalog.pg_database
|
||||||
WHERE datname = {datname}
|
WHERE datname OPERATOR(pg_catalog.=) {datname}::pg_catalog.name
|
||||||
)
|
)
|
||||||
THEN
|
THEN
|
||||||
EXECUTE format('ALTER DATABASE %I is_template false', {datname});
|
EXECUTE pg_catalog.format('ALTER DATABASE %I is_template false', {datname});
|
||||||
END IF;
|
END IF;
|
||||||
END
|
END
|
||||||
${outer_tag}$;
|
${outer_tag}$;
|
||||||
|
|||||||
@@ -71,8 +71,9 @@ const DEFAULT_PG_VERSION_NUM: &str = "17";
|
|||||||
|
|
||||||
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
|
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
|
||||||
|
|
||||||
|
/// Neon CLI.
|
||||||
#[derive(clap::Parser)]
|
#[derive(clap::Parser)]
|
||||||
#[command(version = GIT_VERSION, about, name = "Neon CLI")]
|
#[command(version = GIT_VERSION, name = "Neon CLI")]
|
||||||
struct Cli {
|
struct Cli {
|
||||||
#[command(subcommand)]
|
#[command(subcommand)]
|
||||||
command: NeonLocalCmd,
|
command: NeonLocalCmd,
|
||||||
@@ -107,30 +108,31 @@ enum NeonLocalCmd {
|
|||||||
Stop(StopCmdArgs),
|
Stop(StopCmdArgs),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Initialize a new Neon repository, preparing configs for services to start with.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Initialize a new Neon repository, preparing configs for services to start with")]
|
|
||||||
struct InitCmdArgs {
|
struct InitCmdArgs {
|
||||||
#[clap(long, help("How many pageservers to create (default 1)"))]
|
/// How many pageservers to create (default 1).
|
||||||
|
#[clap(long)]
|
||||||
num_pageservers: Option<u16>,
|
num_pageservers: Option<u16>,
|
||||||
|
|
||||||
#[clap(long)]
|
#[clap(long)]
|
||||||
config: Option<PathBuf>,
|
config: Option<PathBuf>,
|
||||||
|
|
||||||
#[clap(long, help("Force initialization even if the repository is not empty"))]
|
/// Force initialization even if the repository is not empty.
|
||||||
|
#[clap(long, default_value = "must-not-exist")]
|
||||||
#[arg(value_parser)]
|
#[arg(value_parser)]
|
||||||
#[clap(default_value = "must-not-exist")]
|
|
||||||
force: InitForceMode,
|
force: InitForceMode,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Start pageserver and safekeepers.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Start pageserver and safekeepers")]
|
|
||||||
struct StartCmdArgs {
|
struct StartCmdArgs {
|
||||||
#[clap(long = "start-timeout", default_value = "10s")]
|
#[clap(long = "start-timeout", default_value = "10s")]
|
||||||
timeout: humantime::Duration,
|
timeout: humantime::Duration,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Stop pageserver and safekeepers.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Stop pageserver and safekeepers")]
|
|
||||||
struct StopCmdArgs {
|
struct StopCmdArgs {
|
||||||
#[arg(value_enum)]
|
#[arg(value_enum)]
|
||||||
#[clap(long, default_value_t = StopMode::Fast)]
|
#[clap(long, default_value_t = StopMode::Fast)]
|
||||||
@@ -143,8 +145,8 @@ enum StopMode {
|
|||||||
Immediate,
|
Immediate,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Manage tenants.
|
||||||
#[derive(clap::Subcommand)]
|
#[derive(clap::Subcommand)]
|
||||||
#[clap(about = "Manage tenants")]
|
|
||||||
enum TenantCmd {
|
enum TenantCmd {
|
||||||
List,
|
List,
|
||||||
Create(TenantCreateCmdArgs),
|
Create(TenantCreateCmdArgs),
|
||||||
@@ -155,38 +157,36 @@ enum TenantCmd {
|
|||||||
|
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
struct TenantCreateCmdArgs {
|
struct TenantCreateCmdArgs {
|
||||||
#[clap(
|
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||||
long = "tenant-id",
|
#[clap(long = "tenant-id")]
|
||||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
|
||||||
)]
|
|
||||||
tenant_id: Option<TenantId>,
|
tenant_id: Option<TenantId>,
|
||||||
|
|
||||||
#[clap(
|
/// Use a specific timeline id when creating a tenant and its initial timeline.
|
||||||
long,
|
#[clap(long)]
|
||||||
help = "Use a specific timeline id when creating a tenant and its initial timeline"
|
|
||||||
)]
|
|
||||||
timeline_id: Option<TimelineId>,
|
timeline_id: Option<TimelineId>,
|
||||||
|
|
||||||
#[clap(short = 'c')]
|
#[clap(short = 'c')]
|
||||||
config: Vec<String>,
|
config: Vec<String>,
|
||||||
|
|
||||||
|
/// Postgres version to use for the initial timeline.
|
||||||
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
|
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
|
||||||
#[clap(long, help = "Postgres version to use for the initial timeline")]
|
#[clap(long)]
|
||||||
pg_version: PgMajorVersion,
|
pg_version: PgMajorVersion,
|
||||||
|
|
||||||
#[clap(
|
/// Use this tenant in future CLI commands where tenant_id is needed, but not specified.
|
||||||
long,
|
#[clap(long)]
|
||||||
help = "Use this tenant in future CLI commands where tenant_id is needed, but not specified"
|
|
||||||
)]
|
|
||||||
set_default: bool,
|
set_default: bool,
|
||||||
|
|
||||||
#[clap(long, help = "Number of shards in the new tenant")]
|
/// Number of shards in the new tenant.
|
||||||
|
#[clap(long)]
|
||||||
#[arg(default_value_t = 0)]
|
#[arg(default_value_t = 0)]
|
||||||
shard_count: u8,
|
shard_count: u8,
|
||||||
#[clap(long, help = "Sharding stripe size in pages")]
|
/// Sharding stripe size in pages.
|
||||||
|
#[clap(long)]
|
||||||
shard_stripe_size: Option<u32>,
|
shard_stripe_size: Option<u32>,
|
||||||
|
|
||||||
#[clap(long, help = "Placement policy shards in this tenant")]
|
/// Placement policy shards in this tenant.
|
||||||
|
#[clap(long)]
|
||||||
#[arg(value_parser = parse_placement_policy)]
|
#[arg(value_parser = parse_placement_policy)]
|
||||||
placement_policy: Option<PlacementPolicy>,
|
placement_policy: Option<PlacementPolicy>,
|
||||||
}
|
}
|
||||||
@@ -195,44 +195,35 @@ fn parse_placement_policy(s: &str) -> anyhow::Result<PlacementPolicy> {
|
|||||||
Ok(serde_json::from_str::<PlacementPolicy>(s)?)
|
Ok(serde_json::from_str::<PlacementPolicy>(s)?)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Set a particular tenant as default in future CLI commands where tenant_id is needed, but not
|
||||||
|
/// specified.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(
|
|
||||||
about = "Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"
|
|
||||||
)]
|
|
||||||
struct TenantSetDefaultCmdArgs {
|
struct TenantSetDefaultCmdArgs {
|
||||||
#[clap(
|
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||||
long = "tenant-id",
|
#[clap(long = "tenant-id")]
|
||||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
|
||||||
)]
|
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
struct TenantConfigCmdArgs {
|
struct TenantConfigCmdArgs {
|
||||||
#[clap(
|
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||||
long = "tenant-id",
|
#[clap(long = "tenant-id")]
|
||||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
|
||||||
)]
|
|
||||||
tenant_id: Option<TenantId>,
|
tenant_id: Option<TenantId>,
|
||||||
|
|
||||||
#[clap(short = 'c')]
|
#[clap(short = 'c')]
|
||||||
config: Vec<String>,
|
config: Vec<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Import a tenant that is present in remote storage, and create branches for its timelines.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(
|
|
||||||
about = "Import a tenant that is present in remote storage, and create branches for its timelines"
|
|
||||||
)]
|
|
||||||
struct TenantImportCmdArgs {
|
struct TenantImportCmdArgs {
|
||||||
#[clap(
|
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||||
long = "tenant-id",
|
#[clap(long = "tenant-id")]
|
||||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
|
||||||
)]
|
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Manage timelines.
|
||||||
#[derive(clap::Subcommand)]
|
#[derive(clap::Subcommand)]
|
||||||
#[clap(about = "Manage timelines")]
|
|
||||||
enum TimelineCmd {
|
enum TimelineCmd {
|
||||||
List(TimelineListCmdArgs),
|
List(TimelineListCmdArgs),
|
||||||
Branch(TimelineBranchCmdArgs),
|
Branch(TimelineBranchCmdArgs),
|
||||||
@@ -240,98 +231,87 @@ enum TimelineCmd {
|
|||||||
Import(TimelineImportCmdArgs),
|
Import(TimelineImportCmdArgs),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// List all timelines available to this pageserver.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "List all timelines available to this pageserver")]
|
|
||||||
struct TimelineListCmdArgs {
|
struct TimelineListCmdArgs {
|
||||||
#[clap(
|
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||||
long = "tenant-id",
|
#[clap(long = "tenant-id")]
|
||||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
|
||||||
)]
|
|
||||||
tenant_shard_id: Option<TenantShardId>,
|
tenant_shard_id: Option<TenantShardId>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Create a new timeline, branching off from another timeline.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Create a new timeline, branching off from another timeline")]
|
|
||||||
struct TimelineBranchCmdArgs {
|
struct TimelineBranchCmdArgs {
|
||||||
#[clap(
|
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||||
long = "tenant-id",
|
#[clap(long = "tenant-id")]
|
||||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
|
||||||
)]
|
|
||||||
tenant_id: Option<TenantId>,
|
tenant_id: Option<TenantId>,
|
||||||
|
/// New timeline's ID, as a 32-byte hexadecimal string.
|
||||||
#[clap(long, help = "New timeline's ID")]
|
#[clap(long)]
|
||||||
timeline_id: Option<TimelineId>,
|
timeline_id: Option<TimelineId>,
|
||||||
|
/// Human-readable alias for the new timeline.
|
||||||
#[clap(long, help = "Human-readable alias for the new timeline")]
|
#[clap(long)]
|
||||||
branch_name: String,
|
branch_name: String,
|
||||||
|
/// Use last Lsn of another timeline (and its data) as base when creating the new timeline. The
|
||||||
#[clap(
|
/// timeline gets resolved by its branch name.
|
||||||
long,
|
#[clap(long)]
|
||||||
help = "Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name."
|
|
||||||
)]
|
|
||||||
ancestor_branch_name: Option<String>,
|
ancestor_branch_name: Option<String>,
|
||||||
|
/// When using another timeline as base, use a specific Lsn in it instead of the latest one.
|
||||||
#[clap(
|
#[clap(long)]
|
||||||
long,
|
|
||||||
help = "When using another timeline as base, use a specific Lsn in it instead of the latest one"
|
|
||||||
)]
|
|
||||||
ancestor_start_lsn: Option<Lsn>,
|
ancestor_start_lsn: Option<Lsn>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Create a new blank timeline.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Create a new blank timeline")]
|
|
||||||
struct TimelineCreateCmdArgs {
|
struct TimelineCreateCmdArgs {
|
||||||
#[clap(
|
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||||
long = "tenant-id",
|
#[clap(long = "tenant-id")]
|
||||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
|
||||||
)]
|
|
||||||
tenant_id: Option<TenantId>,
|
tenant_id: Option<TenantId>,
|
||||||
|
/// New timeline's ID, as a 32-byte hexadecimal string.
|
||||||
#[clap(long, help = "New timeline's ID")]
|
#[clap(long)]
|
||||||
timeline_id: Option<TimelineId>,
|
timeline_id: Option<TimelineId>,
|
||||||
|
/// Human-readable alias for the new timeline.
|
||||||
#[clap(long, help = "Human-readable alias for the new timeline")]
|
#[clap(long)]
|
||||||
branch_name: String,
|
branch_name: String,
|
||||||
|
|
||||||
|
/// Postgres version.
|
||||||
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
|
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
|
||||||
#[clap(long, help = "Postgres version")]
|
#[clap(long)]
|
||||||
pg_version: PgMajorVersion,
|
pg_version: PgMajorVersion,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Import a timeline from a basebackup directory.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Import timeline from a basebackup directory")]
|
|
||||||
struct TimelineImportCmdArgs {
|
struct TimelineImportCmdArgs {
|
||||||
#[clap(
|
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||||
long = "tenant-id",
|
#[clap(long = "tenant-id")]
|
||||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
|
||||||
)]
|
|
||||||
tenant_id: Option<TenantId>,
|
tenant_id: Option<TenantId>,
|
||||||
|
/// New timeline's ID, as a 32-byte hexadecimal string.
|
||||||
#[clap(long, help = "New timeline's ID")]
|
#[clap(long)]
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
|
/// Human-readable alias for the new timeline.
|
||||||
#[clap(long, help = "Human-readable alias for the new timeline")]
|
#[clap(long)]
|
||||||
branch_name: String,
|
branch_name: String,
|
||||||
|
/// Basebackup tarfile to import.
|
||||||
#[clap(long, help = "Basebackup tarfile to import")]
|
#[clap(long)]
|
||||||
base_tarfile: PathBuf,
|
base_tarfile: PathBuf,
|
||||||
|
/// LSN the basebackup starts at.
|
||||||
#[clap(long, help = "Lsn the basebackup starts at")]
|
#[clap(long)]
|
||||||
base_lsn: Lsn,
|
base_lsn: Lsn,
|
||||||
|
/// WAL to add after base.
|
||||||
#[clap(long, help = "Wal to add after base")]
|
#[clap(long)]
|
||||||
wal_tarfile: Option<PathBuf>,
|
wal_tarfile: Option<PathBuf>,
|
||||||
|
/// LSN the basebackup ends at.
|
||||||
#[clap(long, help = "Lsn the basebackup ends at")]
|
#[clap(long)]
|
||||||
end_lsn: Option<Lsn>,
|
end_lsn: Option<Lsn>,
|
||||||
|
|
||||||
|
/// Postgres version of the basebackup being imported.
|
||||||
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
|
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
|
||||||
#[clap(long, help = "Postgres version of the backup being imported")]
|
#[clap(long)]
|
||||||
pg_version: PgMajorVersion,
|
pg_version: PgMajorVersion,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Manage pageservers.
|
||||||
#[derive(clap::Subcommand)]
|
#[derive(clap::Subcommand)]
|
||||||
#[clap(about = "Manage pageservers")]
|
|
||||||
enum PageserverCmd {
|
enum PageserverCmd {
|
||||||
Status(PageserverStatusCmdArgs),
|
Status(PageserverStatusCmdArgs),
|
||||||
Start(PageserverStartCmdArgs),
|
Start(PageserverStartCmdArgs),
|
||||||
@@ -339,223 +319,202 @@ enum PageserverCmd {
|
|||||||
Restart(PageserverRestartCmdArgs),
|
Restart(PageserverRestartCmdArgs),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Show status of a local pageserver.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Show status of a local pageserver")]
|
|
||||||
struct PageserverStatusCmdArgs {
|
struct PageserverStatusCmdArgs {
|
||||||
#[clap(long = "id", help = "pageserver id")]
|
/// Pageserver ID.
|
||||||
|
#[clap(long = "id")]
|
||||||
pageserver_id: Option<NodeId>,
|
pageserver_id: Option<NodeId>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Start local pageserver.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Start local pageserver")]
|
|
||||||
struct PageserverStartCmdArgs {
|
struct PageserverStartCmdArgs {
|
||||||
#[clap(long = "id", help = "pageserver id")]
|
/// Pageserver ID.
|
||||||
|
#[clap(long = "id")]
|
||||||
pageserver_id: Option<NodeId>,
|
pageserver_id: Option<NodeId>,
|
||||||
|
/// Timeout until we fail the command.
|
||||||
#[clap(short = 't', long, help = "timeout until we fail the command")]
|
#[clap(short = 't', long)]
|
||||||
#[arg(default_value = "10s")]
|
#[arg(default_value = "10s")]
|
||||||
start_timeout: humantime::Duration,
|
start_timeout: humantime::Duration,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Stop local pageserver.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Stop local pageserver")]
|
|
||||||
struct PageserverStopCmdArgs {
|
struct PageserverStopCmdArgs {
|
||||||
#[clap(long = "id", help = "pageserver id")]
|
/// Pageserver ID.
|
||||||
|
#[clap(long = "id")]
|
||||||
pageserver_id: Option<NodeId>,
|
pageserver_id: Option<NodeId>,
|
||||||
|
/// If 'immediate', don't flush repository data at shutdown
|
||||||
#[clap(
|
#[clap(short = 'm')]
|
||||||
short = 'm',
|
|
||||||
help = "If 'immediate', don't flush repository data at shutdown"
|
|
||||||
)]
|
|
||||||
#[arg(value_enum, default_value = "fast")]
|
#[arg(value_enum, default_value = "fast")]
|
||||||
stop_mode: StopMode,
|
stop_mode: StopMode,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Restart local pageserver.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Restart local pageserver")]
|
|
||||||
struct PageserverRestartCmdArgs {
|
struct PageserverRestartCmdArgs {
|
||||||
#[clap(long = "id", help = "pageserver id")]
|
/// Pageserver ID.
|
||||||
|
#[clap(long = "id")]
|
||||||
pageserver_id: Option<NodeId>,
|
pageserver_id: Option<NodeId>,
|
||||||
|
/// Timeout until we fail the command.
|
||||||
#[clap(short = 't', long, help = "timeout until we fail the command")]
|
#[clap(short = 't', long)]
|
||||||
#[arg(default_value = "10s")]
|
#[arg(default_value = "10s")]
|
||||||
start_timeout: humantime::Duration,
|
start_timeout: humantime::Duration,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Manage storage controller.
|
||||||
#[derive(clap::Subcommand)]
|
#[derive(clap::Subcommand)]
|
||||||
#[clap(about = "Manage storage controller")]
|
|
||||||
enum StorageControllerCmd {
|
enum StorageControllerCmd {
|
||||||
Start(StorageControllerStartCmdArgs),
|
Start(StorageControllerStartCmdArgs),
|
||||||
Stop(StorageControllerStopCmdArgs),
|
Stop(StorageControllerStopCmdArgs),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Start storage controller.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Start storage controller")]
|
|
||||||
struct StorageControllerStartCmdArgs {
|
struct StorageControllerStartCmdArgs {
|
||||||
#[clap(short = 't', long, help = "timeout until we fail the command")]
|
/// Timeout until we fail the command.
|
||||||
|
#[clap(short = 't', long)]
|
||||||
#[arg(default_value = "10s")]
|
#[arg(default_value = "10s")]
|
||||||
start_timeout: humantime::Duration,
|
start_timeout: humantime::Duration,
|
||||||
|
/// Identifier used to distinguish storage controller instances.
|
||||||
#[clap(
|
#[clap(long)]
|
||||||
long,
|
|
||||||
help = "Identifier used to distinguish storage controller instances"
|
|
||||||
)]
|
|
||||||
#[arg(default_value_t = 1)]
|
#[arg(default_value_t = 1)]
|
||||||
instance_id: u8,
|
instance_id: u8,
|
||||||
|
/// Base port for the storage controller instance identified by instance-id (defaults to
|
||||||
#[clap(
|
/// pageserver cplane api).
|
||||||
long,
|
#[clap(long)]
|
||||||
help = "Base port for the storage controller instance idenfified by instance-id (defaults to pageserver cplane api)"
|
|
||||||
)]
|
|
||||||
base_port: Option<u16>,
|
base_port: Option<u16>,
|
||||||
|
|
||||||
#[clap(
|
/// Whether the storage controller should handle pageserver-reported local disk loss events.
|
||||||
long,
|
#[clap(long)]
|
||||||
help = "Whether the storage controller should handle pageserver-reported local disk loss events."
|
|
||||||
)]
|
|
||||||
handle_ps_local_disk_loss: Option<bool>,
|
handle_ps_local_disk_loss: Option<bool>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Stop storage controller.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Stop storage controller")]
|
|
||||||
struct StorageControllerStopCmdArgs {
|
struct StorageControllerStopCmdArgs {
|
||||||
#[clap(
|
/// If 'immediate', don't flush repository data at shutdown
|
||||||
short = 'm',
|
#[clap(short = 'm')]
|
||||||
help = "If 'immediate', don't flush repository data at shutdown"
|
|
||||||
)]
|
|
||||||
#[arg(value_enum, default_value = "fast")]
|
#[arg(value_enum, default_value = "fast")]
|
||||||
stop_mode: StopMode,
|
stop_mode: StopMode,
|
||||||
|
/// Identifier used to distinguish storage controller instances.
|
||||||
#[clap(
|
#[clap(long)]
|
||||||
long,
|
|
||||||
help = "Identifier used to distinguish storage controller instances"
|
|
||||||
)]
|
|
||||||
#[arg(default_value_t = 1)]
|
#[arg(default_value_t = 1)]
|
||||||
instance_id: u8,
|
instance_id: u8,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Manage storage broker.
|
||||||
#[derive(clap::Subcommand)]
|
#[derive(clap::Subcommand)]
|
||||||
#[clap(about = "Manage storage broker")]
|
|
||||||
enum StorageBrokerCmd {
|
enum StorageBrokerCmd {
|
||||||
Start(StorageBrokerStartCmdArgs),
|
Start(StorageBrokerStartCmdArgs),
|
||||||
Stop(StorageBrokerStopCmdArgs),
|
Stop(StorageBrokerStopCmdArgs),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Start broker.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Start broker")]
|
|
||||||
struct StorageBrokerStartCmdArgs {
|
struct StorageBrokerStartCmdArgs {
|
||||||
#[clap(short = 't', long, help = "timeout until we fail the command")]
|
/// Timeout until we fail the command.
|
||||||
#[arg(default_value = "10s")]
|
#[clap(short = 't', long, default_value = "10s")]
|
||||||
start_timeout: humantime::Duration,
|
start_timeout: humantime::Duration,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Stop broker.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "stop broker")]
|
|
||||||
struct StorageBrokerStopCmdArgs {
|
struct StorageBrokerStopCmdArgs {
|
||||||
#[clap(
|
/// If 'immediate', don't flush repository data on shutdown.
|
||||||
short = 'm',
|
#[clap(short = 'm')]
|
||||||
help = "If 'immediate', don't flush repository data at shutdown"
|
|
||||||
)]
|
|
||||||
#[arg(value_enum, default_value = "fast")]
|
#[arg(value_enum, default_value = "fast")]
|
||||||
stop_mode: StopMode,
|
stop_mode: StopMode,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Manage safekeepers.
|
||||||
#[derive(clap::Subcommand)]
|
#[derive(clap::Subcommand)]
|
||||||
#[clap(about = "Manage safekeepers")]
|
|
||||||
enum SafekeeperCmd {
|
enum SafekeeperCmd {
|
||||||
Start(SafekeeperStartCmdArgs),
|
Start(SafekeeperStartCmdArgs),
|
||||||
Stop(SafekeeperStopCmdArgs),
|
Stop(SafekeeperStopCmdArgs),
|
||||||
Restart(SafekeeperRestartCmdArgs),
|
Restart(SafekeeperRestartCmdArgs),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Manage object storage.
|
||||||
#[derive(clap::Subcommand)]
|
#[derive(clap::Subcommand)]
|
||||||
#[clap(about = "Manage object storage")]
|
|
||||||
enum EndpointStorageCmd {
|
enum EndpointStorageCmd {
|
||||||
Start(EndpointStorageStartCmd),
|
Start(EndpointStorageStartCmd),
|
||||||
Stop(EndpointStorageStopCmd),
|
Stop(EndpointStorageStopCmd),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Start object storage.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Start object storage")]
|
|
||||||
struct EndpointStorageStartCmd {
|
struct EndpointStorageStartCmd {
|
||||||
#[clap(short = 't', long, help = "timeout until we fail the command")]
|
/// Timeout until we fail the command.
|
||||||
|
#[clap(short = 't', long)]
|
||||||
#[arg(default_value = "10s")]
|
#[arg(default_value = "10s")]
|
||||||
start_timeout: humantime::Duration,
|
start_timeout: humantime::Duration,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Stop object storage.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Stop object storage")]
|
|
||||||
struct EndpointStorageStopCmd {
|
struct EndpointStorageStopCmd {
|
||||||
|
/// If 'immediate', don't flush repository data on shutdown.
|
||||||
|
#[clap(short = 'm')]
|
||||||
#[arg(value_enum, default_value = "fast")]
|
#[arg(value_enum, default_value = "fast")]
|
||||||
#[clap(
|
|
||||||
short = 'm',
|
|
||||||
help = "If 'immediate', don't flush repository data at shutdown"
|
|
||||||
)]
|
|
||||||
stop_mode: StopMode,
|
stop_mode: StopMode,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Start local safekeeper.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Start local safekeeper")]
|
|
||||||
struct SafekeeperStartCmdArgs {
|
struct SafekeeperStartCmdArgs {
|
||||||
#[clap(help = "safekeeper id")]
|
/// Safekeeper ID.
|
||||||
#[arg(default_value_t = NodeId(1))]
|
#[arg(default_value_t = NodeId(1))]
|
||||||
id: NodeId,
|
id: NodeId,
|
||||||
|
|
||||||
#[clap(
|
/// Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo.
|
||||||
short = 'e',
|
#[clap(short = 'e', long = "safekeeper-extra-opt")]
|
||||||
long = "safekeeper-extra-opt",
|
|
||||||
help = "Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo"
|
|
||||||
)]
|
|
||||||
extra_opt: Vec<String>,
|
extra_opt: Vec<String>,
|
||||||
|
|
||||||
#[clap(short = 't', long, help = "timeout until we fail the command")]
|
/// Timeout until we fail the command.
|
||||||
|
#[clap(short = 't', long)]
|
||||||
#[arg(default_value = "10s")]
|
#[arg(default_value = "10s")]
|
||||||
start_timeout: humantime::Duration,
|
start_timeout: humantime::Duration,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Stop local safekeeper.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Stop local safekeeper")]
|
|
||||||
struct SafekeeperStopCmdArgs {
|
struct SafekeeperStopCmdArgs {
|
||||||
#[clap(help = "safekeeper id")]
|
/// Safekeeper ID.
|
||||||
#[arg(default_value_t = NodeId(1))]
|
#[arg(default_value_t = NodeId(1))]
|
||||||
id: NodeId,
|
id: NodeId,
|
||||||
|
|
||||||
|
/// If 'immediate', don't flush repository data on shutdown.
|
||||||
#[arg(value_enum, default_value = "fast")]
|
#[arg(value_enum, default_value = "fast")]
|
||||||
#[clap(
|
#[clap(short = 'm')]
|
||||||
short = 'm',
|
|
||||||
help = "If 'immediate', don't flush repository data at shutdown"
|
|
||||||
)]
|
|
||||||
stop_mode: StopMode,
|
stop_mode: StopMode,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Restart local safekeeper.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Restart local safekeeper")]
|
|
||||||
struct SafekeeperRestartCmdArgs {
|
struct SafekeeperRestartCmdArgs {
|
||||||
#[clap(help = "safekeeper id")]
|
/// Safekeeper ID.
|
||||||
#[arg(default_value_t = NodeId(1))]
|
#[arg(default_value_t = NodeId(1))]
|
||||||
id: NodeId,
|
id: NodeId,
|
||||||
|
|
||||||
|
/// If 'immediate', don't flush repository data on shutdown.
|
||||||
#[arg(value_enum, default_value = "fast")]
|
#[arg(value_enum, default_value = "fast")]
|
||||||
#[clap(
|
#[clap(short = 'm')]
|
||||||
short = 'm',
|
|
||||||
help = "If 'immediate', don't flush repository data at shutdown"
|
|
||||||
)]
|
|
||||||
stop_mode: StopMode,
|
stop_mode: StopMode,
|
||||||
|
|
||||||
#[clap(
|
/// Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo.
|
||||||
short = 'e',
|
#[clap(short = 'e', long = "safekeeper-extra-opt")]
|
||||||
long = "safekeeper-extra-opt",
|
|
||||||
help = "Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo"
|
|
||||||
)]
|
|
||||||
extra_opt: Vec<String>,
|
extra_opt: Vec<String>,
|
||||||
|
|
||||||
#[clap(short = 't', long, help = "timeout until we fail the command")]
|
/// Timeout until we fail the command.
|
||||||
|
#[clap(short = 't', long)]
|
||||||
#[arg(default_value = "10s")]
|
#[arg(default_value = "10s")]
|
||||||
start_timeout: humantime::Duration,
|
start_timeout: humantime::Duration,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Manage Postgres instances.
|
||||||
#[derive(clap::Subcommand)]
|
#[derive(clap::Subcommand)]
|
||||||
#[clap(about = "Manage Postgres instances")]
|
|
||||||
enum EndpointCmd {
|
enum EndpointCmd {
|
||||||
List(EndpointListCmdArgs),
|
List(EndpointListCmdArgs),
|
||||||
Create(EndpointCreateCmdArgs),
|
Create(EndpointCreateCmdArgs),
|
||||||
@@ -567,33 +526,27 @@ enum EndpointCmd {
|
|||||||
GenerateJwt(EndpointGenerateJwtCmdArgs),
|
GenerateJwt(EndpointGenerateJwtCmdArgs),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// List endpoints.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "List endpoints")]
|
|
||||||
struct EndpointListCmdArgs {
|
struct EndpointListCmdArgs {
|
||||||
#[clap(
|
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||||
long = "tenant-id",
|
#[clap(long = "tenant-id")]
|
||||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
|
||||||
)]
|
|
||||||
tenant_shard_id: Option<TenantShardId>,
|
tenant_shard_id: Option<TenantShardId>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Create a compute endpoint.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Create a compute endpoint")]
|
|
||||||
struct EndpointCreateCmdArgs {
|
struct EndpointCreateCmdArgs {
|
||||||
#[clap(
|
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||||
long = "tenant-id",
|
#[clap(long = "tenant-id")]
|
||||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
|
||||||
)]
|
|
||||||
tenant_id: Option<TenantId>,
|
tenant_id: Option<TenantId>,
|
||||||
|
/// Postgres endpoint ID.
|
||||||
#[clap(help = "Postgres endpoint id")]
|
|
||||||
endpoint_id: Option<String>,
|
endpoint_id: Option<String>,
|
||||||
#[clap(long, help = "Name of the branch the endpoint will run on")]
|
/// Name of the branch the endpoint will run on.
|
||||||
|
#[clap(long)]
|
||||||
branch_name: Option<String>,
|
branch_name: Option<String>,
|
||||||
#[clap(
|
/// Specify LSN on the timeline to start from. By default, end of the timeline would be used.
|
||||||
long,
|
#[clap(long)]
|
||||||
help = "Specify Lsn on the timeline to start from. By default, end of the timeline would be used"
|
|
||||||
)]
|
|
||||||
lsn: Option<Lsn>,
|
lsn: Option<Lsn>,
|
||||||
#[clap(long)]
|
#[clap(long)]
|
||||||
pg_port: Option<u16>,
|
pg_port: Option<u16>,
|
||||||
@@ -604,16 +557,13 @@ struct EndpointCreateCmdArgs {
|
|||||||
#[clap(long = "pageserver-id")]
|
#[clap(long = "pageserver-id")]
|
||||||
endpoint_pageserver_id: Option<NodeId>,
|
endpoint_pageserver_id: Option<NodeId>,
|
||||||
|
|
||||||
#[clap(
|
/// Don't do basebackup, create endpoint directory with only config files.
|
||||||
long,
|
#[clap(long, action = clap::ArgAction::Set, default_value_t = false)]
|
||||||
help = "Don't do basebackup, create endpoint directory with only config files",
|
|
||||||
action = clap::ArgAction::Set,
|
|
||||||
default_value_t = false
|
|
||||||
)]
|
|
||||||
config_only: bool,
|
config_only: bool,
|
||||||
|
|
||||||
|
/// Postgres version.
|
||||||
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
|
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
|
||||||
#[clap(long, help = "Postgres version")]
|
#[clap(long)]
|
||||||
pg_version: PgMajorVersion,
|
pg_version: PgMajorVersion,
|
||||||
|
|
||||||
/// Use gRPC to communicate with Pageservers, by generating grpc:// connstrings.
|
/// Use gRPC to communicate with Pageservers, by generating grpc:// connstrings.
|
||||||
@@ -624,170 +574,140 @@ struct EndpointCreateCmdArgs {
|
|||||||
#[clap(long)]
|
#[clap(long)]
|
||||||
grpc: bool,
|
grpc: bool,
|
||||||
|
|
||||||
#[clap(
|
/// If set, the node will be a hot replica on the specified timeline.
|
||||||
long,
|
#[clap(long, action = clap::ArgAction::Set, default_value_t = false)]
|
||||||
help = "If set, the node will be a hot replica on the specified timeline",
|
|
||||||
action = clap::ArgAction::Set,
|
|
||||||
default_value_t = false
|
|
||||||
)]
|
|
||||||
hot_standby: bool,
|
hot_standby: bool,
|
||||||
|
/// If set, will set up the catalog for neon_superuser.
|
||||||
#[clap(long, help = "If set, will set up the catalog for neon_superuser")]
|
#[clap(long)]
|
||||||
update_catalog: bool,
|
update_catalog: bool,
|
||||||
|
/// Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but
|
||||||
#[clap(
|
/// useful for tests.
|
||||||
long,
|
#[clap(long)]
|
||||||
help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests."
|
|
||||||
)]
|
|
||||||
allow_multiple: bool,
|
allow_multiple: bool,
|
||||||
|
|
||||||
/// Only allow changing it on creation
|
/// Name of the privileged role for the endpoint.
|
||||||
#[clap(long, help = "Name of the privileged role for the endpoint")]
|
// Only allow changing it on creation.
|
||||||
|
#[clap(long)]
|
||||||
privileged_role_name: Option<String>,
|
privileged_role_name: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Start Postgres. If the endpoint doesn't exist yet, it is created.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Start postgres. If the endpoint doesn't exist yet, it is created.")]
|
|
||||||
struct EndpointStartCmdArgs {
|
struct EndpointStartCmdArgs {
|
||||||
#[clap(help = "Postgres endpoint id")]
|
/// Postgres endpoint ID.
|
||||||
endpoint_id: String,
|
endpoint_id: String,
|
||||||
|
/// Pageserver ID.
|
||||||
#[clap(long = "pageserver-id")]
|
#[clap(long = "pageserver-id")]
|
||||||
endpoint_pageserver_id: Option<NodeId>,
|
endpoint_pageserver_id: Option<NodeId>,
|
||||||
|
/// Safekeepers membership generation to prefix neon.safekeepers with.
|
||||||
#[clap(
|
#[clap(long)]
|
||||||
long,
|
|
||||||
help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations."
|
|
||||||
)]
|
|
||||||
safekeepers_generation: Option<u32>,
|
safekeepers_generation: Option<u32>,
|
||||||
#[clap(
|
/// List of safekeepers endpoint will talk to.
|
||||||
long,
|
#[clap(long)]
|
||||||
help = "List of safekeepers endpoint will talk to. Normally neon_local chooses them on its own, but this option allows to override."
|
|
||||||
)]
|
|
||||||
safekeepers: Option<String>,
|
safekeepers: Option<String>,
|
||||||
|
/// Configure the remote extensions storage proxy gateway URL to request for extensions.
|
||||||
#[clap(
|
#[clap(long, alias = "remote-ext-config")]
|
||||||
long,
|
|
||||||
help = "Configure the remote extensions storage proxy gateway URL to request for extensions.",
|
|
||||||
alias = "remote-ext-config"
|
|
||||||
)]
|
|
||||||
remote_ext_base_url: Option<String>,
|
remote_ext_base_url: Option<String>,
|
||||||
|
/// If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`
|
||||||
#[clap(
|
#[clap(long)]
|
||||||
long,
|
|
||||||
help = "If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`"
|
|
||||||
)]
|
|
||||||
create_test_user: bool,
|
create_test_user: bool,
|
||||||
|
/// Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but
|
||||||
#[clap(
|
/// useful for tests.
|
||||||
long,
|
#[clap(long)]
|
||||||
help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests."
|
|
||||||
)]
|
|
||||||
allow_multiple: bool,
|
allow_multiple: bool,
|
||||||
|
/// Timeout until we fail the command.
|
||||||
#[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")]
|
#[clap(short = 't', long, value_parser= humantime::parse_duration)]
|
||||||
#[arg(default_value = "90s")]
|
#[arg(default_value = "90s")]
|
||||||
start_timeout: Duration,
|
start_timeout: Duration,
|
||||||
|
|
||||||
#[clap(
|
/// Download LFC cache from endpoint storage on endpoint startup
|
||||||
long,
|
#[clap(long, default_value = "false")]
|
||||||
help = "Download LFC cache from endpoint storage on endpoint startup",
|
|
||||||
default_value = "false"
|
|
||||||
)]
|
|
||||||
autoprewarm: bool,
|
autoprewarm: bool,
|
||||||
|
|
||||||
#[clap(long, help = "Upload LFC cache to endpoint storage periodically")]
|
/// Upload LFC cache to endpoint storage periodically
|
||||||
|
#[clap(long)]
|
||||||
offload_lfc_interval_seconds: Option<std::num::NonZeroU64>,
|
offload_lfc_interval_seconds: Option<std::num::NonZeroU64>,
|
||||||
|
|
||||||
#[clap(
|
/// Run in development mode, skipping VM-specific operations like process termination
|
||||||
long,
|
#[clap(long, action = clap::ArgAction::SetTrue)]
|
||||||
help = "Run in development mode, skipping VM-specific operations like process termination",
|
|
||||||
action = clap::ArgAction::SetTrue
|
|
||||||
)]
|
|
||||||
dev: bool,
|
dev: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reconfigure an endpoint.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Reconfigure an endpoint")]
|
|
||||||
struct EndpointReconfigureCmdArgs {
|
struct EndpointReconfigureCmdArgs {
|
||||||
#[clap(
|
/// Tenant id. Represented as a hexadecimal string 32 symbols length
|
||||||
long = "tenant-id",
|
#[clap(long = "tenant-id")]
|
||||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
|
||||||
)]
|
|
||||||
tenant_id: Option<TenantId>,
|
tenant_id: Option<TenantId>,
|
||||||
|
/// Postgres endpoint ID.
|
||||||
#[clap(help = "Postgres endpoint id")]
|
|
||||||
endpoint_id: String,
|
endpoint_id: String,
|
||||||
|
/// Pageserver ID.
|
||||||
#[clap(long = "pageserver-id")]
|
#[clap(long = "pageserver-id")]
|
||||||
endpoint_pageserver_id: Option<NodeId>,
|
endpoint_pageserver_id: Option<NodeId>,
|
||||||
|
|
||||||
#[clap(long)]
|
#[clap(long)]
|
||||||
safekeepers: Option<String>,
|
safekeepers: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Refresh the endpoint's configuration by forcing it reload it's spec
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Refresh the endpoint's configuration by forcing it reload it's spec")]
|
|
||||||
struct EndpointRefreshConfigurationArgs {
|
struct EndpointRefreshConfigurationArgs {
|
||||||
#[clap(help = "Postgres endpoint id")]
|
/// Postgres endpoint id
|
||||||
endpoint_id: String,
|
endpoint_id: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Stop an endpoint.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Stop an endpoint")]
|
|
||||||
struct EndpointStopCmdArgs {
|
struct EndpointStopCmdArgs {
|
||||||
#[clap(help = "Postgres endpoint id")]
|
/// Postgres endpoint ID.
|
||||||
endpoint_id: String,
|
endpoint_id: String,
|
||||||
|
/// Also delete data directory (now optional, should be default in future).
|
||||||
#[clap(
|
#[clap(long)]
|
||||||
long,
|
|
||||||
help = "Also delete data directory (now optional, should be default in future)"
|
|
||||||
)]
|
|
||||||
destroy: bool,
|
destroy: bool,
|
||||||
|
|
||||||
#[clap(long, help = "Postgres shutdown mode")]
|
/// Postgres shutdown mode, passed to `pg_ctl -m <mode>`.
|
||||||
|
#[clap(long)]
|
||||||
#[clap(default_value = "fast")]
|
#[clap(default_value = "fast")]
|
||||||
mode: EndpointTerminateMode,
|
mode: EndpointTerminateMode,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Update the pageservers in the spec file of the compute endpoint
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Update the pageservers in the spec file of the compute endpoint")]
|
|
||||||
struct EndpointUpdatePageserversCmdArgs {
|
struct EndpointUpdatePageserversCmdArgs {
|
||||||
#[clap(help = "Postgres endpoint id")]
|
/// Postgres endpoint id
|
||||||
endpoint_id: String,
|
endpoint_id: String,
|
||||||
|
|
||||||
#[clap(short = 'p', long, help = "Specified pageserver id")]
|
/// Specified pageserver id
|
||||||
|
#[clap(short = 'p', long)]
|
||||||
pageserver_id: Option<NodeId>,
|
pageserver_id: Option<NodeId>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Generate a JWT for an endpoint.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Generate a JWT for an endpoint")]
|
|
||||||
struct EndpointGenerateJwtCmdArgs {
|
struct EndpointGenerateJwtCmdArgs {
|
||||||
#[clap(help = "Postgres endpoint id")]
|
/// Postgres endpoint ID.
|
||||||
endpoint_id: String,
|
endpoint_id: String,
|
||||||
|
/// Scope to generate the JWT with.
|
||||||
#[clap(short = 's', long, help = "Scope to generate the JWT with", value_parser = ComputeClaimsScope::from_str)]
|
#[clap(short = 's', long, value_parser = ComputeClaimsScope::from_str)]
|
||||||
scope: Option<ComputeClaimsScope>,
|
scope: Option<ComputeClaimsScope>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Manage neon_local branch name mappings.
|
||||||
#[derive(clap::Subcommand)]
|
#[derive(clap::Subcommand)]
|
||||||
#[clap(about = "Manage neon_local branch name mappings")]
|
|
||||||
enum MappingsCmd {
|
enum MappingsCmd {
|
||||||
Map(MappingsMapCmdArgs),
|
Map(MappingsMapCmdArgs),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Create new mapping which cannot exist already.
|
||||||
#[derive(clap::Args)]
|
#[derive(clap::Args)]
|
||||||
#[clap(about = "Create new mapping which cannot exist already")]
|
|
||||||
struct MappingsMapCmdArgs {
|
struct MappingsMapCmdArgs {
|
||||||
#[clap(
|
/// Tenant ID, as a 32-byte hexadecimal string.
|
||||||
long,
|
#[clap(long)]
|
||||||
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
|
|
||||||
)]
|
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
#[clap(
|
/// Timeline ID, as a 32-byte hexadecimal string.
|
||||||
long,
|
#[clap(long)]
|
||||||
help = "Timeline id. Represented as a hexadecimal string 32 symbols length"
|
|
||||||
)]
|
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
#[clap(long, help = "Branch name to give to the timeline")]
|
/// Branch name to give to the timeline.
|
||||||
|
#[clap(long)]
|
||||||
branch_name: String,
|
branch_name: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -120,6 +120,11 @@
|
|||||||
"value": "host=pageserver port=6400",
|
"value": "host=pageserver port=6400",
|
||||||
"vartype": "string"
|
"vartype": "string"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"name": "neon.pageserver_grpc_urls",
|
||||||
|
"value": "grpc://pageserver:6401/",
|
||||||
|
"vartype": "string"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "max_replication_write_lag",
|
"name": "max_replication_write_lag",
|
||||||
"value": "500MB",
|
"value": "500MB",
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
broker_endpoint='http://storage_broker:50051'
|
broker_endpoint='http://storage_broker:50051'
|
||||||
pg_distrib_dir='/usr/local/'
|
pg_distrib_dir='/usr/local/'
|
||||||
listen_pg_addr='0.0.0.0:6400'
|
listen_pg_addr='0.0.0.0:6400'
|
||||||
|
listen_grpc_addr='0.0.0.0:6401'
|
||||||
listen_http_addr='0.0.0.0:9898'
|
listen_http_addr='0.0.0.0:9898'
|
||||||
remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' }
|
remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' }
|
||||||
control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address
|
control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address
|
||||||
|
|||||||
246
docs/rfcs/2025-07-07-node-deletion-api-improvement.md
Normal file
246
docs/rfcs/2025-07-07-node-deletion-api-improvement.md
Normal file
@@ -0,0 +1,246 @@
|
|||||||
|
# Node deletion API improvement
|
||||||
|
|
||||||
|
Created on 2025-07-07
|
||||||
|
Implemented on _TBD_
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
This RFC describes improvements to the storage controller API for gracefully deleting pageserver
|
||||||
|
nodes.
|
||||||
|
|
||||||
|
## Motivation
|
||||||
|
|
||||||
|
The basic node deletion API introduced in [#8226](https://github.com/neondatabase/neon/issues/8333)
|
||||||
|
has several limitations:
|
||||||
|
|
||||||
|
- Deleted nodes can re-add themselves if they restart (e.g., a flaky node that keeps restarting and
|
||||||
|
we cannot reach via SSH to stop the pageserver). This issue has been resolved by tombstone
|
||||||
|
mechanism in [#12036](https://github.com/neondatabase/neon/issues/12036)
|
||||||
|
- Process of node deletion is not graceful, i.e. it just imitates a node failure
|
||||||
|
|
||||||
|
In this context, "graceful" node deletion means that users do not experience any disruption or
|
||||||
|
negative effects, provided the system remains in a healthy state (i.e., the remaining pageservers
|
||||||
|
can handle the workload and all requirements are met). To achieve this, the system must perform
|
||||||
|
live migration of all tenant shards from the node being deleted while the node is still running
|
||||||
|
and continue processing all incoming requests. The node is removed only after all tenant shards
|
||||||
|
have been safely migrated.
|
||||||
|
|
||||||
|
Although live migrations can be achieved with the drain functionality, it leads to incorrect shard
|
||||||
|
placement, such as not matching availability zones. This results in unnecessary work to optimize
|
||||||
|
the placement that was just recently performed.
|
||||||
|
|
||||||
|
If we delete a node before its tenant shards are fully moved, the new node won't have all the
|
||||||
|
needed data (e.g. heatmaps) ready. This means user requests to the new node will be much slower at
|
||||||
|
first. If there are many tenant shards, this slowdown affects a huge amount of users.
|
||||||
|
|
||||||
|
Graceful node deletion is more complicated and can introduce new issues. It takes longer because
|
||||||
|
live migration of each tenant shard can last several minutes. Using non-blocking accessors may
|
||||||
|
also cause deletion to wait if other processes are holding inner state lock. It also gets trickier
|
||||||
|
because we need to handle other requests, like drain and fill, at the same time.
|
||||||
|
|
||||||
|
## Impacted components (e.g. pageserver, safekeeper, console, etc)
|
||||||
|
|
||||||
|
- storage controller
|
||||||
|
- pageserver (indirectly)
|
||||||
|
|
||||||
|
## Proposed implementation
|
||||||
|
|
||||||
|
### Tombstones
|
||||||
|
|
||||||
|
To resolve the problem of deleted nodes re-adding themselves, a tombstone mechanism was introduced
|
||||||
|
as part of the node stored information. Each node has a separate `NodeLifecycle` field with two
|
||||||
|
possible states: `Active` and `Deleted`. When node deletion completes, the database row is not
|
||||||
|
deleted but instead has its `NodeLifecycle` column switched to `Deleted`. Nodes with `Deleted`
|
||||||
|
lifecycle are treated as if the row is absent for most handlers, with several exceptions: reattach
|
||||||
|
and register functionality must be aware of tombstones. Additionally, new debug handlers are
|
||||||
|
available for listing and deleting tombstones via the `/debug/v1/tombstone` path.
|
||||||
|
|
||||||
|
### Gracefulness
|
||||||
|
|
||||||
|
The problem of making node deletion graceful is complex and involves several challenges:
|
||||||
|
|
||||||
|
- **Cancellable**: The operation must be cancellable to allow administrators to abort the process
|
||||||
|
if needed, e.g. if run by mistake.
|
||||||
|
- **Non-blocking**: We don't want to block deployment operations like draining/filling on the node
|
||||||
|
deletion process. We need clear policies for handling concurrent operations: what happens when a
|
||||||
|
drain/fill request arrives while deletion is in progress, and what happens when a delete request
|
||||||
|
arrives while drain/fill is in progress.
|
||||||
|
- **Persistent**: If the storage controller restarts during this long-running operation, we must
|
||||||
|
preserve progress and automatically resume the deletion process after the storage controller
|
||||||
|
restarts.
|
||||||
|
- **Migrated correctly**: We cannot simply use the existing drain mechanism for nodes scheduled
|
||||||
|
for deletion, as this would move shards to irrelevant locations. The drain process expects the
|
||||||
|
node to return, so it only moves shards to backup locations, not to their preferred AZs. It also
|
||||||
|
leaves secondary locations unmoved. This could result in unnecessary load on the storage
|
||||||
|
controller and inefficient resource utilization.
|
||||||
|
- **Force option**: Administrators need the ability to force immediate, non-graceful deletion when
|
||||||
|
time constraints or emergency situations require it, bypassing the normal graceful migration
|
||||||
|
process.
|
||||||
|
|
||||||
|
See below for a detailed breakdown of the proposed changes and mechanisms.
|
||||||
|
|
||||||
|
#### Node lifecycle
|
||||||
|
|
||||||
|
New `NodeLifecycle` enum and a matching database field with these values:
|
||||||
|
- `Active`: The normal state. All operations are allowed.
|
||||||
|
- `ScheduledForDeletion`: The node is marked to be deleted soon. Deletion may be in progress or
|
||||||
|
will happen later, but the node will eventually be removed. All operations are allowed.
|
||||||
|
- `Deleted`: The node is fully deleted. No operations are allowed, and the node cannot be brought
|
||||||
|
back. The only action left is to remove its record from the database. Any attempt to register a
|
||||||
|
node in this state will fail.
|
||||||
|
|
||||||
|
This state persists across storage controller restarts.
|
||||||
|
|
||||||
|
**State transition**
|
||||||
|
```
|
||||||
|
+--------------------+
|
||||||
|
+---| Active |<---------------------+
|
||||||
|
| +--------------------+ |
|
||||||
|
| ^ |
|
||||||
|
| start_node_delete | cancel_node_delete |
|
||||||
|
v | |
|
||||||
|
+----------------------------------+ |
|
||||||
|
| ScheduledForDeletion | |
|
||||||
|
+----------------------------------+ |
|
||||||
|
| |
|
||||||
|
| node_register |
|
||||||
|
| |
|
||||||
|
| delete_node (at the finish) |
|
||||||
|
| |
|
||||||
|
v |
|
||||||
|
+---------+ tombstone_delete +----------+
|
||||||
|
| Deleted |-------------------------------->| no row |
|
||||||
|
+---------+ +----------+
|
||||||
|
```
|
||||||
|
|
||||||
|
#### NodeSchedulingPolicy::Deleting
|
||||||
|
|
||||||
|
A `Deleting` variant to the `NodeSchedulingPolicy` enum. This means the deletion function is
|
||||||
|
running for the node right now. Only one node can have the `Deleting` policy at a time.
|
||||||
|
|
||||||
|
The `NodeSchedulingPolicy::Deleting` state is persisted in the database. However, after a storage
|
||||||
|
controller restart, any node previously marked as `Deleting` will have its scheduling policy reset
|
||||||
|
to `Pause`. The policy will only transition back to `Deleting` when the deletion operation is
|
||||||
|
actively started again, as triggered by the node's `NodeLifecycle::ScheduledForDeletion` state.
|
||||||
|
|
||||||
|
`NodeSchedulingPolicy` transition details:
|
||||||
|
1. When `node_delete` begins, set the policy to `NodeSchedulingPolicy::Deleting`.
|
||||||
|
2. If `node_delete` is cancelled (for example, due to a concurrent drain operation), revert the
|
||||||
|
policy to its previous value. The policy is persisted in storcon DB.
|
||||||
|
3. After `node_delete` completes, the final value of the scheduling policy is irrelevant, since
|
||||||
|
`NodeLifecycle::Deleted` prevents any further access to this field.
|
||||||
|
|
||||||
|
The deletion process cannot be initiated for nodes currently undergoing deployment-related
|
||||||
|
operations (`Draining`, `Filling`, or `PauseForRestart` policies). Deletion will only be triggered
|
||||||
|
once the node transitions to either the `Active` or `Pause` state.
|
||||||
|
|
||||||
|
#### OperationTracker
|
||||||
|
|
||||||
|
A replacement for `Option<OperationHandler> ongoing_operation`, the `OperationTracker` is a
|
||||||
|
dedicated service state object responsible for managing all long-running node operations (drain,
|
||||||
|
fill, delete) with robust concurrency control.
|
||||||
|
|
||||||
|
Key responsibilities:
|
||||||
|
- Orchestrates the execution of operations
|
||||||
|
- Supports cancellation of currently running operations
|
||||||
|
- Enforces operation constraints, e.g. allowing only single drain/fill operation at a time
|
||||||
|
- Persists deletion state, enabling recovery of pending deletions across restarts
|
||||||
|
- Ensures thread safety across concurrent requests
|
||||||
|
|
||||||
|
#### Attached tenant shard processing
|
||||||
|
|
||||||
|
When deleting a node, handle each attached tenant shard as follows:
|
||||||
|
|
||||||
|
1. Pick the best node to become the new attached (the candidate).
|
||||||
|
2. If the candidate already has this shard as a secondary:
|
||||||
|
- Create a new secondary for the shard on another suitable node.
|
||||||
|
Otherwise:
|
||||||
|
- Create a secondary for the shard on the candidate node.
|
||||||
|
3. Wait until all secondaries are ready and pre-warmed.
|
||||||
|
4. Promote the candidate's secondary to attached.
|
||||||
|
5. Remove the secondary from the node being deleted.
|
||||||
|
|
||||||
|
This process safely moves all attached shards before deleting the node.
|
||||||
|
|
||||||
|
#### Secondary tenant shard processing
|
||||||
|
|
||||||
|
When deleting a node, handle each secondary tenant shard as follows:
|
||||||
|
|
||||||
|
1. Choose the best node to become the new secondary.
|
||||||
|
2. Create a secondary for the shard on that node.
|
||||||
|
3. Wait until the new secondary is ready.
|
||||||
|
4. Remove the secondary from the node being deleted.
|
||||||
|
|
||||||
|
This ensures all secondary shards are safely moved before deleting the node.
|
||||||
|
|
||||||
|
### Reliability, failure modes and corner cases
|
||||||
|
|
||||||
|
In case of a storage controller failure and following restart, the system behavior depends on the
|
||||||
|
`NodeLifecycle` state:
|
||||||
|
|
||||||
|
- If `NodeLifecycle` is `Active`: No action is taken for this node.
|
||||||
|
- If `NodeLifecycle` is `Deleted`: The node will not be re-added.
|
||||||
|
- If `NodeLifecycle` is `ScheduledForDeletion`: A deletion background task will be launched for
|
||||||
|
this node.
|
||||||
|
|
||||||
|
In case of a pageserver node failure during deletion, the behavior depends on the `force` flag:
|
||||||
|
- If `force` is set: The node deletion will proceed regardless of the node's availability.
|
||||||
|
- If `force` is not set: The deletion will be retried a limited number of times. If the node
|
||||||
|
remains unavailable, the deletion process will pause and automatically resume when the node
|
||||||
|
becomes healthy again.
|
||||||
|
|
||||||
|
### Operations concurrency
|
||||||
|
|
||||||
|
The following sections describe the behavior when different types of requests arrive at the storage
|
||||||
|
controller and how they interact with ongoing operations.
|
||||||
|
|
||||||
|
#### Delete request
|
||||||
|
|
||||||
|
Handler: `PUT /control/v1/node/:node_id/delete`
|
||||||
|
|
||||||
|
1. If node lifecycle is `NodeLifecycle::ScheduledForDeletion`:
|
||||||
|
- Return `200 OK`: there is already an ongoing deletion request for this node
|
||||||
|
2. Update & persist lifecycle to `NodeLifecycle::ScheduledForDeletion`
|
||||||
|
3. Persist current scheduling policy
|
||||||
|
4. If there is no active operation (drain/fill/delete):
|
||||||
|
- Run deletion process for this node
|
||||||
|
|
||||||
|
#### Cancel delete request
|
||||||
|
|
||||||
|
Handler: `DELETE /control/v1/node/:node_id/delete`
|
||||||
|
|
||||||
|
1. If node lifecycle is not `NodeLifecycle::ScheduledForDeletion`:
|
||||||
|
- Return `404 Not Found`: there is no current deletion request for this node
|
||||||
|
2. If the active operation is deleting this node, cancel it
|
||||||
|
3. Update & persist lifecycle to `NodeLifecycle::Active`
|
||||||
|
4. Restore the last scheduling policy from persistence
|
||||||
|
|
||||||
|
#### Drain/fill request
|
||||||
|
|
||||||
|
1. If there are already ongoing drain/fill processes:
|
||||||
|
- Return `409 Conflict`: queueing of drain/fill processes is not supported
|
||||||
|
2. If there is an ongoing delete process:
|
||||||
|
- Cancel it and wait until it is cancelled
|
||||||
|
3. Run the drain/fill process
|
||||||
|
4. After the drain/fill process is cancelled or finished:
|
||||||
|
- Try to find another candidate to delete and run the deletion process for that node
|
||||||
|
|
||||||
|
#### Drain/fill cancel request
|
||||||
|
|
||||||
|
1. If the active operation is not the related process:
|
||||||
|
- Return `400 Bad Request`: cancellation request is incorrect, operations are not the same
|
||||||
|
2. Cancel the active operation
|
||||||
|
3. Try to find another candidate to delete and run the deletion process for that node
|
||||||
|
|
||||||
|
## Definition of Done
|
||||||
|
|
||||||
|
- [x] Fix flaky node scenario and introduce related debug handlers
|
||||||
|
- [ ] Node deletion intent is persistent - a node will be eventually deleted after a deletion
|
||||||
|
request regardless of draining/filling requests and restarts
|
||||||
|
- [ ] Node deletion can be graceful - deletion completes only after moving all tenant shards to
|
||||||
|
recommended locations
|
||||||
|
- [ ] Deploying does not break due to long deletions - drain/fill operations override deletion
|
||||||
|
process and deletion resumes after drain/fill completes
|
||||||
|
- [ ] `force` flag is implemented and provides fast, failure-tolerant node removal (e.g., when a
|
||||||
|
pageserver node does not respond)
|
||||||
|
- [ ] Legacy delete handler code is removed from storage_controller, test_runner, and storcon_cli
|
||||||
@@ -68,11 +68,15 @@ pub enum LfcPrewarmState {
|
|||||||
/// We tried to fetch the corresponding LFC state from the endpoint storage,
|
/// We tried to fetch the corresponding LFC state from the endpoint storage,
|
||||||
/// but received `Not Found 404`. This should normally happen only during the
|
/// but received `Not Found 404`. This should normally happen only during the
|
||||||
/// first endpoint start after creation with `autoprewarm: true`.
|
/// first endpoint start after creation with `autoprewarm: true`.
|
||||||
|
/// This may also happen if LFC is turned off or not initialized
|
||||||
///
|
///
|
||||||
/// During the orchestrated prewarm via API, when a caller explicitly
|
/// During the orchestrated prewarm via API, when a caller explicitly
|
||||||
/// provides the LFC state key to prewarm from, it's the caller responsibility
|
/// provides the LFC state key to prewarm from, it's the caller responsibility
|
||||||
/// to handle this status as an error state in this case.
|
/// to handle this status as an error state in this case.
|
||||||
Skipped,
|
Skipped,
|
||||||
|
/// LFC prewarm was cancelled. Some pages in LFC cache may be prewarmed if query
|
||||||
|
/// has started working before cancellation
|
||||||
|
Cancelled,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Display for LfcPrewarmState {
|
impl Display for LfcPrewarmState {
|
||||||
@@ -83,6 +87,7 @@ impl Display for LfcPrewarmState {
|
|||||||
LfcPrewarmState::Completed => f.write_str("Completed"),
|
LfcPrewarmState::Completed => f.write_str("Completed"),
|
||||||
LfcPrewarmState::Skipped => f.write_str("Skipped"),
|
LfcPrewarmState::Skipped => f.write_str("Skipped"),
|
||||||
LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
|
LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
|
||||||
|
LfcPrewarmState::Cancelled => f.write_str("Cancelled"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -97,6 +102,7 @@ pub enum LfcOffloadState {
|
|||||||
Failed {
|
Failed {
|
||||||
error: String,
|
error: String,
|
||||||
},
|
},
|
||||||
|
Skipped,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Debug, Clone, PartialEq)]
|
#[derive(Serialize, Debug, Clone, PartialEq)]
|
||||||
|
|||||||
@@ -6,15 +6,26 @@ license.workspace = true
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
thiserror.workspace = true
|
thiserror.workspace = true
|
||||||
nix.workspace=true
|
nix.workspace = true
|
||||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||||
libc.workspace = true
|
libc.workspace = true
|
||||||
lock_api.workspace = true
|
lock_api.workspace = true
|
||||||
rustc-hash.workspace = true
|
rustc-hash.workspace = true
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
criterion = { workspace = true, features = ["html_reports"] }
|
||||||
|
rand = "0.9"
|
||||||
|
rand_distr = "0.5.1"
|
||||||
|
xxhash-rust = { version = "0.8.15", features = ["xxh3"] }
|
||||||
|
ahash.workspace = true
|
||||||
|
twox-hash = { version = "2.1.1" }
|
||||||
|
seahash = "4.1.0"
|
||||||
|
hashbrown = { git = "https://github.com/quantumish/hashbrown.git", rev = "6610e6d" }
|
||||||
|
|
||||||
|
|
||||||
[target.'cfg(target_os = "macos")'.dependencies]
|
[target.'cfg(target_os = "macos")'.dependencies]
|
||||||
tempfile = "3.14.0"
|
tempfile = "3.14.0"
|
||||||
|
|
||||||
[dev-dependencies]
|
[[bench]]
|
||||||
rand.workspace = true
|
name = "hmap_resize"
|
||||||
rand_distr = "0.5.1"
|
harness = false
|
||||||
|
|||||||
330
libs/neon-shmem/benches/hmap_resize.rs
Normal file
330
libs/neon-shmem/benches/hmap_resize.rs
Normal file
@@ -0,0 +1,330 @@
|
|||||||
|
use criterion::{BatchSize, BenchmarkId, Criterion, criterion_group, criterion_main};
|
||||||
|
use neon_shmem::hash::HashMapAccess;
|
||||||
|
use neon_shmem::hash::HashMapInit;
|
||||||
|
use neon_shmem::hash::entry::Entry;
|
||||||
|
use rand::distr::{Distribution, StandardUniform};
|
||||||
|
use rand::prelude::*;
|
||||||
|
use std::default::Default;
|
||||||
|
use std::hash::BuildHasher;
|
||||||
|
|
||||||
|
// Taken from bindings to C code
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
|
||||||
|
#[repr(C)]
|
||||||
|
pub struct FileCacheKey {
|
||||||
|
pub _spc_id: u32,
|
||||||
|
pub _db_id: u32,
|
||||||
|
pub _rel_number: u32,
|
||||||
|
pub _fork_num: u32,
|
||||||
|
pub _block_num: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Distribution<FileCacheKey> for StandardUniform {
|
||||||
|
// questionable, but doesn't need to be good randomness
|
||||||
|
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> FileCacheKey {
|
||||||
|
FileCacheKey {
|
||||||
|
_spc_id: rng.random(),
|
||||||
|
_db_id: rng.random(),
|
||||||
|
_rel_number: rng.random(),
|
||||||
|
_fork_num: rng.random(),
|
||||||
|
_block_num: rng.random(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
#[repr(C)]
|
||||||
|
pub struct FileCacheEntry {
|
||||||
|
pub _offset: u32,
|
||||||
|
pub _access_count: u32,
|
||||||
|
pub _prev: *mut FileCacheEntry,
|
||||||
|
pub _next: *mut FileCacheEntry,
|
||||||
|
pub _state: [u32; 8],
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FileCacheEntry {
|
||||||
|
fn dummy() -> Self {
|
||||||
|
Self {
|
||||||
|
_offset: 0,
|
||||||
|
_access_count: 0,
|
||||||
|
_prev: std::ptr::null_mut(),
|
||||||
|
_next: std::ptr::null_mut(),
|
||||||
|
_state: [0; 8],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Utilities for applying operations.
|
||||||
|
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
struct TestOp<K, V>(K, Option<V>);
|
||||||
|
|
||||||
|
fn apply_op<K: Clone + std::hash::Hash + Eq, V, S: std::hash::BuildHasher>(
|
||||||
|
op: TestOp<K, V>,
|
||||||
|
map: &mut HashMapAccess<K, V, S>,
|
||||||
|
) {
|
||||||
|
let entry = map.entry(op.0);
|
||||||
|
|
||||||
|
match op.1 {
|
||||||
|
Some(new) => match entry {
|
||||||
|
Entry::Occupied(mut e) => Some(e.insert(new)),
|
||||||
|
Entry::Vacant(e) => {
|
||||||
|
_ = e.insert(new).unwrap();
|
||||||
|
None
|
||||||
|
}
|
||||||
|
},
|
||||||
|
None => match entry {
|
||||||
|
Entry::Occupied(e) => Some(e.remove()),
|
||||||
|
Entry::Vacant(_) => None,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Hash utilities
|
||||||
|
|
||||||
|
struct SeaRandomState {
|
||||||
|
k1: u64,
|
||||||
|
k2: u64,
|
||||||
|
k3: u64,
|
||||||
|
k4: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::hash::BuildHasher for SeaRandomState {
|
||||||
|
type Hasher = seahash::SeaHasher;
|
||||||
|
|
||||||
|
fn build_hasher(&self) -> Self::Hasher {
|
||||||
|
seahash::SeaHasher::with_seeds(self.k1, self.k2, self.k3, self.k4)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SeaRandomState {
|
||||||
|
fn new() -> Self {
|
||||||
|
let mut rng = rand::rng();
|
||||||
|
Self {
|
||||||
|
k1: rng.random(),
|
||||||
|
k2: rng.random(),
|
||||||
|
k3: rng.random(),
|
||||||
|
k4: rng.random(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn small_benchs(c: &mut Criterion) {
|
||||||
|
let mut group = c.benchmark_group("Small maps");
|
||||||
|
group.sample_size(10);
|
||||||
|
|
||||||
|
group.bench_function("small_rehash", |b| {
|
||||||
|
let ideal_filled = 4_000_000;
|
||||||
|
let size = 5_000_000;
|
||||||
|
let mut writer = HashMapInit::new_resizeable(size, size * 2).attach_writer();
|
||||||
|
let mut rng = rand::rng();
|
||||||
|
while writer.get_num_buckets_in_use() < ideal_filled as usize {
|
||||||
|
let key: FileCacheKey = rng.random();
|
||||||
|
let val = FileCacheEntry::dummy();
|
||||||
|
apply_op(TestOp(key, Some(val)), &mut writer);
|
||||||
|
}
|
||||||
|
b.iter(|| writer.shuffle());
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("small_rehash_xxhash", |b| {
|
||||||
|
let ideal_filled = 4_000_000;
|
||||||
|
let size = 5_000_000;
|
||||||
|
let mut writer = HashMapInit::new_resizeable(size, size * 2)
|
||||||
|
.with_hasher(twox_hash::xxhash64::RandomState::default())
|
||||||
|
.attach_writer();
|
||||||
|
let mut rng = rand::rng();
|
||||||
|
while writer.get_num_buckets_in_use() < ideal_filled as usize {
|
||||||
|
let key: FileCacheKey = rng.random();
|
||||||
|
let val = FileCacheEntry::dummy();
|
||||||
|
apply_op(TestOp(key, Some(val)), &mut writer);
|
||||||
|
}
|
||||||
|
b.iter(|| writer.shuffle());
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("small_rehash_ahash", |b| {
|
||||||
|
let ideal_filled = 4_000_000;
|
||||||
|
let size = 5_000_000;
|
||||||
|
let mut writer = HashMapInit::new_resizeable(size, size * 2)
|
||||||
|
.with_hasher(ahash::RandomState::default())
|
||||||
|
.attach_writer();
|
||||||
|
let mut rng = rand::rng();
|
||||||
|
while writer.get_num_buckets_in_use() < ideal_filled as usize {
|
||||||
|
let key: FileCacheKey = rng.random();
|
||||||
|
let val = FileCacheEntry::dummy();
|
||||||
|
apply_op(TestOp(key, Some(val)), &mut writer);
|
||||||
|
}
|
||||||
|
b.iter(|| writer.shuffle());
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("small_rehash_seahash", |b| {
|
||||||
|
let ideal_filled = 4_000_000;
|
||||||
|
let size = 5_000_000;
|
||||||
|
let mut writer = HashMapInit::new_resizeable(size, size * 2)
|
||||||
|
.with_hasher(SeaRandomState::new())
|
||||||
|
.attach_writer();
|
||||||
|
let mut rng = rand::rng();
|
||||||
|
while writer.get_num_buckets_in_use() < ideal_filled as usize {
|
||||||
|
let key: FileCacheKey = rng.random();
|
||||||
|
let val = FileCacheEntry::dummy();
|
||||||
|
apply_op(TestOp(key, Some(val)), &mut writer);
|
||||||
|
}
|
||||||
|
b.iter(|| writer.shuffle());
|
||||||
|
});
|
||||||
|
|
||||||
|
group.finish();
|
||||||
|
}
|
||||||
|
|
||||||
|
fn real_benchs(c: &mut Criterion) {
|
||||||
|
let mut group = c.benchmark_group("Realistic workloads");
|
||||||
|
group.sample_size(10);
|
||||||
|
group.bench_function("real_bulk_insert", |b| {
|
||||||
|
let size = 125_000_000;
|
||||||
|
let ideal_filled = 100_000_000;
|
||||||
|
let mut rng = rand::rng();
|
||||||
|
b.iter_batched(
|
||||||
|
|| HashMapInit::new_resizeable(size, size * 2).attach_writer(),
|
||||||
|
|writer| {
|
||||||
|
for _ in 0..ideal_filled {
|
||||||
|
let key: FileCacheKey = rng.random();
|
||||||
|
let val = FileCacheEntry::dummy();
|
||||||
|
let entry = writer.entry(key);
|
||||||
|
match entry {
|
||||||
|
Entry::Occupied(mut e) => {
|
||||||
|
std::hint::black_box(e.insert(val));
|
||||||
|
}
|
||||||
|
Entry::Vacant(e) => {
|
||||||
|
let _ = std::hint::black_box(e.insert(val).unwrap());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
BatchSize::SmallInput,
|
||||||
|
)
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("real_rehash", |b| {
|
||||||
|
let size = 125_000_000;
|
||||||
|
let ideal_filled = 100_000_000;
|
||||||
|
let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
|
||||||
|
let mut rng = rand::rng();
|
||||||
|
while writer.get_num_buckets_in_use() < ideal_filled {
|
||||||
|
let key: FileCacheKey = rng.random();
|
||||||
|
let val = FileCacheEntry::dummy();
|
||||||
|
apply_op(TestOp(key, Some(val)), &mut writer);
|
||||||
|
}
|
||||||
|
b.iter(|| writer.shuffle());
|
||||||
|
});
|
||||||
|
|
||||||
|
group.bench_function("real_rehash_hashbrown", |b| {
|
||||||
|
let size = 125_000_000;
|
||||||
|
let ideal_filled = 100_000_000;
|
||||||
|
let mut writer = hashbrown::raw::RawTable::new();
|
||||||
|
let mut rng = rand::rng();
|
||||||
|
let hasher = rustc_hash::FxBuildHasher;
|
||||||
|
unsafe {
|
||||||
|
writer
|
||||||
|
.resize(
|
||||||
|
size,
|
||||||
|
|(k, _)| hasher.hash_one(k),
|
||||||
|
hashbrown::raw::Fallibility::Infallible,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
while writer.len() < ideal_filled as usize {
|
||||||
|
let key: FileCacheKey = rng.random();
|
||||||
|
let val = FileCacheEntry::dummy();
|
||||||
|
writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
|
||||||
|
hasher.hash_one(k)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
b.iter(|| unsafe {
|
||||||
|
writer.table.rehash_in_place(
|
||||||
|
&|table, index| {
|
||||||
|
hasher.hash_one(
|
||||||
|
&table
|
||||||
|
.bucket::<(FileCacheKey, FileCacheEntry)>(index)
|
||||||
|
.as_ref()
|
||||||
|
.0,
|
||||||
|
)
|
||||||
|
},
|
||||||
|
std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
|
||||||
|
if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
|
||||||
|
Some(|ptr| std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry)))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
},
|
||||||
|
)
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
for elems in [2, 4, 8, 16, 32, 64, 96, 112] {
|
||||||
|
group.bench_with_input(
|
||||||
|
BenchmarkId::new("real_rehash_varied", elems),
|
||||||
|
&elems,
|
||||||
|
|b, &size| {
|
||||||
|
let ideal_filled = size * 1_000_000;
|
||||||
|
let size = 125_000_000;
|
||||||
|
let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
|
||||||
|
let mut rng = rand::rng();
|
||||||
|
while writer.get_num_buckets_in_use() < ideal_filled as usize {
|
||||||
|
let key: FileCacheKey = rng.random();
|
||||||
|
let val = FileCacheEntry::dummy();
|
||||||
|
apply_op(TestOp(key, Some(val)), &mut writer);
|
||||||
|
}
|
||||||
|
b.iter(|| writer.shuffle());
|
||||||
|
},
|
||||||
|
);
|
||||||
|
group.bench_with_input(
|
||||||
|
BenchmarkId::new("real_rehash_varied_hashbrown", elems),
|
||||||
|
&elems,
|
||||||
|
|b, &size| {
|
||||||
|
let ideal_filled = size * 1_000_000;
|
||||||
|
let size = 125_000_000;
|
||||||
|
let mut writer = hashbrown::raw::RawTable::new();
|
||||||
|
let mut rng = rand::rng();
|
||||||
|
let hasher = rustc_hash::FxBuildHasher;
|
||||||
|
unsafe {
|
||||||
|
writer
|
||||||
|
.resize(
|
||||||
|
size,
|
||||||
|
|(k, _)| hasher.hash_one(k),
|
||||||
|
hashbrown::raw::Fallibility::Infallible,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
while writer.len() < ideal_filled as usize {
|
||||||
|
let key: FileCacheKey = rng.random();
|
||||||
|
let val = FileCacheEntry::dummy();
|
||||||
|
writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
|
||||||
|
hasher.hash_one(k)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
b.iter(|| unsafe {
|
||||||
|
writer.table.rehash_in_place(
|
||||||
|
&|table, index| {
|
||||||
|
hasher.hash_one(
|
||||||
|
&table
|
||||||
|
.bucket::<(FileCacheKey, FileCacheEntry)>(index)
|
||||||
|
.as_ref()
|
||||||
|
.0,
|
||||||
|
)
|
||||||
|
},
|
||||||
|
std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
|
||||||
|
if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
|
||||||
|
Some(|ptr| {
|
||||||
|
std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry))
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
},
|
||||||
|
)
|
||||||
|
});
|
||||||
|
},
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
group.finish();
|
||||||
|
}
|
||||||
|
|
||||||
|
criterion_group!(benches, small_benchs, real_benchs);
|
||||||
|
criterion_main!(benches);
|
||||||
@@ -16,6 +16,7 @@
|
|||||||
//!
|
//!
|
||||||
//! Concurrency is managed very simply: the entire map is guarded by one shared-memory RwLock.
|
//! Concurrency is managed very simply: the entire map is guarded by one shared-memory RwLock.
|
||||||
|
|
||||||
|
use std::fmt::Debug;
|
||||||
use std::hash::{BuildHasher, Hash};
|
use std::hash::{BuildHasher, Hash};
|
||||||
use std::mem::MaybeUninit;
|
use std::mem::MaybeUninit;
|
||||||
|
|
||||||
@@ -56,6 +57,22 @@ pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> {
|
|||||||
num_buckets: u32,
|
num_buckets: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<'a, K, V, S> Debug for HashMapInit<'a, K, V, S>
|
||||||
|
where
|
||||||
|
K: Debug,
|
||||||
|
V: Debug,
|
||||||
|
{
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
f.debug_struct("HashMapInit")
|
||||||
|
.field("shmem_handle", &self.shmem_handle)
|
||||||
|
.field("shared_ptr", &self.shared_ptr)
|
||||||
|
.field("shared_size", &self.shared_size)
|
||||||
|
// .field("hasher", &self.hasher)
|
||||||
|
.field("num_buckets", &self.num_buckets)
|
||||||
|
.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// This is a per-process handle to a hash table that (possibly) lives in shared memory.
|
/// This is a per-process handle to a hash table that (possibly) lives in shared memory.
|
||||||
/// If a child process is launched with fork(), the child process should
|
/// If a child process is launched with fork(), the child process should
|
||||||
/// get its own HashMapAccess by calling HashMapInit::attach_writer/reader().
|
/// get its own HashMapAccess by calling HashMapInit::attach_writer/reader().
|
||||||
@@ -71,6 +88,20 @@ pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> {
|
|||||||
unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<'_, K, V, S> {}
|
unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<'_, K, V, S> {}
|
||||||
unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<'_, K, V, S> {}
|
unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<'_, K, V, S> {}
|
||||||
|
|
||||||
|
impl<'a, K, V, S> Debug for HashMapAccess<'a, K, V, S>
|
||||||
|
where
|
||||||
|
K: Debug,
|
||||||
|
V: Debug,
|
||||||
|
{
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
f.debug_struct("HashMapAccess")
|
||||||
|
.field("shmem_handle", &self.shmem_handle)
|
||||||
|
.field("shared_ptr", &self.shared_ptr)
|
||||||
|
// .field("hasher", &self.hasher)
|
||||||
|
.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
|
impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
|
||||||
/// Change the 'hasher' used by the hash table.
|
/// Change the 'hasher' used by the hash table.
|
||||||
///
|
///
|
||||||
@@ -298,7 +329,7 @@ where
|
|||||||
|
|
||||||
/// Get a reference to the entry containing a key.
|
/// Get a reference to the entry containing a key.
|
||||||
///
|
///
|
||||||
/// NB: THis takes a write lock as there's no way to distinguish whether the intention
|
/// NB: This takes a write lock as there's no way to distinguish whether the intention
|
||||||
/// is to use the entry for reading or for writing in advance.
|
/// is to use the entry for reading or for writing in advance.
|
||||||
pub fn entry(&self, key: K) -> Entry<'a, '_, K, V> {
|
pub fn entry(&self, key: K) -> Entry<'a, '_, K, V> {
|
||||||
let hash = self.get_hash_value(&key);
|
let hash = self.get_hash_value(&key);
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
//! Simple hash table with chaining.
|
//! Simple hash table with chaining.
|
||||||
|
|
||||||
|
use std::fmt::Debug;
|
||||||
use std::hash::Hash;
|
use std::hash::Hash;
|
||||||
use std::mem::MaybeUninit;
|
use std::mem::MaybeUninit;
|
||||||
|
|
||||||
@@ -17,6 +18,19 @@ pub(crate) struct Bucket<K, V> {
|
|||||||
pub(crate) inner: Option<(K, V)>,
|
pub(crate) inner: Option<(K, V)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<K, V> Debug for Bucket<K, V>
|
||||||
|
where
|
||||||
|
K: Debug,
|
||||||
|
V: Debug,
|
||||||
|
{
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
f.debug_struct("Bucket")
|
||||||
|
.field("next", &self.next)
|
||||||
|
.field("inner", &self.inner)
|
||||||
|
.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Core hash table implementation.
|
/// Core hash table implementation.
|
||||||
pub(crate) struct CoreHashMap<'a, K, V> {
|
pub(crate) struct CoreHashMap<'a, K, V> {
|
||||||
/// Dictionary used to map hashes to bucket indices.
|
/// Dictionary used to map hashes to bucket indices.
|
||||||
@@ -31,6 +45,22 @@ pub(crate) struct CoreHashMap<'a, K, V> {
|
|||||||
pub(crate) buckets_in_use: u32,
|
pub(crate) buckets_in_use: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<'a, K, V> Debug for CoreHashMap<'a, K, V>
|
||||||
|
where
|
||||||
|
K: Debug,
|
||||||
|
V: Debug,
|
||||||
|
{
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
f.debug_struct("CoreHashMap")
|
||||||
|
.field("dictionary", &self.dictionary)
|
||||||
|
.field("buckets", &self.buckets)
|
||||||
|
.field("free_head", &self.free_head)
|
||||||
|
.field("alloc_limit", &self.alloc_limit)
|
||||||
|
.field("buckets_in_use", &self.buckets_in_use)
|
||||||
|
.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Error for when there are no empty buckets left but one is needed.
|
/// Error for when there are no empty buckets left but one is needed.
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub struct FullError;
|
pub struct FullError;
|
||||||
|
|||||||
@@ -61,6 +61,10 @@ impl<K, V> OccupiedEntry<'_, '_, K, V> {
|
|||||||
///
|
///
|
||||||
/// This may result in multiple bucket accesses if the entry was obtained by index as the
|
/// This may result in multiple bucket accesses if the entry was obtained by index as the
|
||||||
/// previous chain entry needs to be discovered in this case.
|
/// previous chain entry needs to be discovered in this case.
|
||||||
|
///
|
||||||
|
/// # Panics
|
||||||
|
/// Panics if the `prev_pos` field is equal to [`PrevPos::Unknown`]. In practice, this means
|
||||||
|
/// the entry was obtained via calling something like [`super::HashMapAccess::entry_at_bucket`].
|
||||||
pub fn remove(mut self) -> V {
|
pub fn remove(mut self) -> V {
|
||||||
// If this bucket was queried by index, go ahead and follow its chain from the start.
|
// If this bucket was queried by index, go ahead and follow its chain from the start.
|
||||||
let prev = if let PrevPos::Unknown(hash) = self.prev_pos {
|
let prev = if let PrevPos::Unknown(hash) = self.prev_pos {
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ use nix::unistd::ftruncate as nix_ftruncate;
|
|||||||
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
|
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
|
||||||
/// will cause the file to be expanded, but we might use `mprotect()` etc. to enforce that in the
|
/// will cause the file to be expanded, but we might use `mprotect()` etc. to enforce that in the
|
||||||
/// future.
|
/// future.
|
||||||
|
#[derive(Debug)]
|
||||||
pub struct ShmemHandle {
|
pub struct ShmemHandle {
|
||||||
/// memfd file descriptor
|
/// memfd file descriptor
|
||||||
fd: OwnedFd,
|
fd: OwnedFd,
|
||||||
@@ -35,6 +36,7 @@ pub struct ShmemHandle {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// This is stored at the beginning in the shared memory area.
|
/// This is stored at the beginning in the shared memory area.
|
||||||
|
#[derive(Debug)]
|
||||||
struct SharedStruct {
|
struct SharedStruct {
|
||||||
max_size: usize,
|
max_size: usize,
|
||||||
|
|
||||||
|
|||||||
@@ -310,6 +310,11 @@ impl AtomicLsn {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Consumes the atomic and returns the contained value.
|
||||||
|
pub const fn into_inner(self) -> Lsn {
|
||||||
|
Lsn(self.inner.into_inner())
|
||||||
|
}
|
||||||
|
|
||||||
/// Atomically retrieve the `Lsn` value from memory.
|
/// Atomically retrieve the `Lsn` value from memory.
|
||||||
pub fn load(&self) -> Lsn {
|
pub fn load(&self) -> Lsn {
|
||||||
Lsn(self.inner.load(Ordering::Acquire))
|
Lsn(self.inner.load(Ordering::Acquire))
|
||||||
|
|||||||
@@ -341,6 +341,34 @@ extern "C-unwind" fn log_internal(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* BEGIN_HADRON */
|
||||||
|
extern "C" fn reset_safekeeper_statuses_for_metrics(wp: *mut WalProposer, num_safekeepers: u32) {
|
||||||
|
unsafe {
|
||||||
|
let callback_data = (*(*wp).config).callback_data;
|
||||||
|
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||||
|
if api.is_null() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
(*api).reset_safekeeper_statuses_for_metrics(&mut (*wp), num_safekeepers);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C" fn update_safekeeper_status_for_metrics(
|
||||||
|
wp: *mut WalProposer,
|
||||||
|
sk_index: u32,
|
||||||
|
status: u8,
|
||||||
|
) {
|
||||||
|
unsafe {
|
||||||
|
let callback_data = (*(*wp).config).callback_data;
|
||||||
|
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||||
|
if api.is_null() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
(*api).update_safekeeper_status_for_metrics(&mut (*wp), sk_index, status);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* END_HADRON */
|
||||||
|
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum Level {
|
pub enum Level {
|
||||||
Debug5,
|
Debug5,
|
||||||
@@ -414,6 +442,10 @@ pub(crate) fn create_api() -> walproposer_api {
|
|||||||
finish_sync_safekeepers: Some(finish_sync_safekeepers),
|
finish_sync_safekeepers: Some(finish_sync_safekeepers),
|
||||||
process_safekeeper_feedback: Some(process_safekeeper_feedback),
|
process_safekeeper_feedback: Some(process_safekeeper_feedback),
|
||||||
log_internal: Some(log_internal),
|
log_internal: Some(log_internal),
|
||||||
|
/* BEGIN_HADRON */
|
||||||
|
reset_safekeeper_statuses_for_metrics: Some(reset_safekeeper_statuses_for_metrics),
|
||||||
|
update_safekeeper_status_for_metrics: Some(update_safekeeper_status_for_metrics),
|
||||||
|
/* END_HADRON */
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -451,6 +483,8 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
|
|||||||
replica_promote: false,
|
replica_promote: false,
|
||||||
min_ps_feedback: empty_feedback,
|
min_ps_feedback: empty_feedback,
|
||||||
wal_rate_limiter: empty_wal_rate_limiter,
|
wal_rate_limiter: empty_wal_rate_limiter,
|
||||||
|
num_safekeepers: 0,
|
||||||
|
safekeeper_status: [0; 32],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -159,6 +159,21 @@ pub trait ApiImpl {
|
|||||||
fn after_election(&self, _wp: &mut WalProposer) {
|
fn after_election(&self, _wp: &mut WalProposer) {
|
||||||
todo!()
|
todo!()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* BEGIN_HADRON */
|
||||||
|
fn reset_safekeeper_statuses_for_metrics(&self, _wp: &mut WalProposer, _num_safekeepers: u32) {
|
||||||
|
// Do nothing for testing purposes.
|
||||||
|
}
|
||||||
|
|
||||||
|
fn update_safekeeper_status_for_metrics(
|
||||||
|
&self,
|
||||||
|
_wp: &mut WalProposer,
|
||||||
|
_sk_index: u32,
|
||||||
|
_status: u8,
|
||||||
|
) {
|
||||||
|
// Do nothing for testing purposes.
|
||||||
|
}
|
||||||
|
/* END_HADRON */
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
|
|||||||
@@ -54,6 +54,7 @@ pageserver_api.workspace = true
|
|||||||
pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
|
pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
|
||||||
pageserver_compaction.workspace = true
|
pageserver_compaction.workspace = true
|
||||||
pageserver_page_api.workspace = true
|
pageserver_page_api.workspace = true
|
||||||
|
peekable.workspace = true
|
||||||
pem.workspace = true
|
pem.workspace = true
|
||||||
pin-project-lite.workspace = true
|
pin-project-lite.workspace = true
|
||||||
postgres_backend.workspace = true
|
postgres_backend.workspace = true
|
||||||
@@ -66,6 +67,7 @@ postgres-types.workspace = true
|
|||||||
posthog_client_lite.workspace = true
|
posthog_client_lite.workspace = true
|
||||||
pprof.workspace = true
|
pprof.workspace = true
|
||||||
pq_proto.workspace = true
|
pq_proto.workspace = true
|
||||||
|
prost.workspace = true
|
||||||
rand.workspace = true
|
rand.workspace = true
|
||||||
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
|
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
|
||||||
regex.workspace = true
|
regex.workspace = true
|
||||||
|
|||||||
@@ -3,3 +3,4 @@ mod pool;
|
|||||||
mod retry;
|
mod retry;
|
||||||
|
|
||||||
pub use client::{PageserverClient, ShardSpec};
|
pub use client::{PageserverClient, ShardSpec};
|
||||||
|
pub use pageserver_api::shard::ShardStripeSize; // used in ShardSpec
|
||||||
|
|||||||
@@ -33,6 +33,8 @@ pub enum ProtocolError {
|
|||||||
Invalid(&'static str, String),
|
Invalid(&'static str, String),
|
||||||
#[error("required field '{0}' is missing")]
|
#[error("required field '{0}' is missing")]
|
||||||
Missing(&'static str),
|
Missing(&'static str),
|
||||||
|
#[error("invalid combination of not_modified_lsn '{0}' and request_lsn '{1}'")]
|
||||||
|
InvalidLsns(Lsn, Lsn),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ProtocolError {
|
impl ProtocolError {
|
||||||
@@ -85,9 +87,9 @@ impl TryFrom<proto::ReadLsn> for ReadLsn {
|
|||||||
return Err(ProtocolError::invalid("request_lsn", pb.request_lsn));
|
return Err(ProtocolError::invalid("request_lsn", pb.request_lsn));
|
||||||
}
|
}
|
||||||
if pb.not_modified_since_lsn > pb.request_lsn {
|
if pb.not_modified_since_lsn > pb.request_lsn {
|
||||||
return Err(ProtocolError::invalid(
|
return Err(ProtocolError::InvalidLsns(
|
||||||
"not_modified_since_lsn",
|
Lsn(pb.not_modified_since_lsn),
|
||||||
pb.not_modified_since_lsn,
|
Lsn(pb.request_lsn),
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
|
|||||||
@@ -25,6 +25,9 @@ tracing.workspace = true
|
|||||||
tokio.workspace = true
|
tokio.workspace = true
|
||||||
tokio-stream.workspace = true
|
tokio-stream.workspace = true
|
||||||
tokio-util.workspace = true
|
tokio-util.workspace = true
|
||||||
|
axum.workspace = true
|
||||||
|
http.workspace = true
|
||||||
|
metrics.workspace = true
|
||||||
tonic.workspace = true
|
tonic.workspace = true
|
||||||
url.workspace = true
|
url.workspace = true
|
||||||
|
|
||||||
|
|||||||
@@ -34,6 +34,10 @@ use crate::util::{request_stats, tokio_thread_local_stats};
|
|||||||
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
|
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
|
||||||
#[derive(clap::Parser)]
|
#[derive(clap::Parser)]
|
||||||
pub(crate) struct Args {
|
pub(crate) struct Args {
|
||||||
|
#[clap(long, default_value = "false")]
|
||||||
|
grpc: bool,
|
||||||
|
#[clap(long, default_value = "false")]
|
||||||
|
grpc_stream: bool,
|
||||||
#[clap(long, default_value = "http://localhost:9898")]
|
#[clap(long, default_value = "http://localhost:9898")]
|
||||||
mgmt_api_endpoint: String,
|
mgmt_api_endpoint: String,
|
||||||
/// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
|
/// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
|
||||||
@@ -78,6 +82,9 @@ pub(crate) struct Args {
|
|||||||
#[clap(long)]
|
#[clap(long)]
|
||||||
set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,
|
set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,
|
||||||
|
|
||||||
|
#[clap(long)]
|
||||||
|
only_relnode: Option<u32>,
|
||||||
|
|
||||||
/// Queue depth generated in each client.
|
/// Queue depth generated in each client.
|
||||||
#[clap(long, default_value = "1")]
|
#[clap(long, default_value = "1")]
|
||||||
queue_depth: NonZeroUsize,
|
queue_depth: NonZeroUsize,
|
||||||
@@ -92,10 +99,31 @@ pub(crate) struct Args {
|
|||||||
#[clap(long, default_value = "1")]
|
#[clap(long, default_value = "1")]
|
||||||
batch_size: NonZeroUsize,
|
batch_size: NonZeroUsize,
|
||||||
|
|
||||||
#[clap(long)]
|
|
||||||
only_relnode: Option<u32>,
|
|
||||||
|
|
||||||
targets: Option<Vec<TenantTimelineId>>,
|
targets: Option<Vec<TenantTimelineId>>,
|
||||||
|
|
||||||
|
#[clap(long, default_value = "100")]
|
||||||
|
pool_max_consumers: NonZeroUsize,
|
||||||
|
|
||||||
|
#[clap(long, default_value = "5")]
|
||||||
|
pool_error_threshold: NonZeroUsize,
|
||||||
|
|
||||||
|
#[clap(long, default_value = "5000")]
|
||||||
|
pool_connect_timeout: NonZeroUsize,
|
||||||
|
|
||||||
|
#[clap(long, default_value = "1000")]
|
||||||
|
pool_connect_backoff: NonZeroUsize,
|
||||||
|
|
||||||
|
#[clap(long, default_value = "60000")]
|
||||||
|
pool_max_idle_duration: NonZeroUsize,
|
||||||
|
|
||||||
|
#[clap(long, default_value = "0")]
|
||||||
|
max_delay_ms: usize,
|
||||||
|
|
||||||
|
#[clap(long, default_value = "0")]
|
||||||
|
percent_drops: usize,
|
||||||
|
|
||||||
|
#[clap(long, default_value = "0")]
|
||||||
|
percent_hangs: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// State shared by all clients
|
/// State shared by all clients
|
||||||
@@ -152,7 +180,6 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
|||||||
main_impl(args, thread_local_stats)
|
main_impl(args, thread_local_stats)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn main_impl(
|
async fn main_impl(
|
||||||
args: Args,
|
args: Args,
|
||||||
all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
|
all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
|
||||||
@@ -317,6 +344,7 @@ async fn main_impl(
|
|||||||
let rps_period = args
|
let rps_period = args
|
||||||
.per_client_rate
|
.per_client_rate
|
||||||
.map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
|
.map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
|
||||||
|
|
||||||
let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
|
let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
|
||||||
let ss = shared_state.clone();
|
let ss = shared_state.clone();
|
||||||
let cancel = cancel.clone();
|
let cancel = cancel.clone();
|
||||||
|
|||||||
@@ -453,6 +453,7 @@ impl TimelineHandles {
|
|||||||
handles: Default::default(),
|
handles: Default::default(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn get(
|
async fn get(
|
||||||
&mut self,
|
&mut self,
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
|
|||||||
@@ -5,10 +5,12 @@ MODULE_big = neon
|
|||||||
OBJS = \
|
OBJS = \
|
||||||
$(WIN32RES) \
|
$(WIN32RES) \
|
||||||
communicator.o \
|
communicator.o \
|
||||||
|
communicator_new.o \
|
||||||
communicator_process.o \
|
communicator_process.o \
|
||||||
extension_server.o \
|
extension_server.o \
|
||||||
file_cache.o \
|
file_cache.o \
|
||||||
hll.o \
|
hll.o \
|
||||||
|
lfc_prewarm.o \
|
||||||
libpagestore.o \
|
libpagestore.o \
|
||||||
logical_replication_monitor.o \
|
logical_replication_monitor.o \
|
||||||
neon.o \
|
neon.o \
|
||||||
@@ -67,6 +69,7 @@ WALPROP_OBJS = \
|
|||||||
|
|
||||||
# libcommunicator.a is built by cargo from the Rust sources under communicator/
|
# libcommunicator.a is built by cargo from the Rust sources under communicator/
|
||||||
# subdirectory. `cargo build` also generates communicator_bindings.h.
|
# subdirectory. `cargo build` also generates communicator_bindings.h.
|
||||||
|
communicator_new.o: communicator/communicator_bindings.h
|
||||||
communicator_process.o: communicator/communicator_bindings.h
|
communicator_process.o: communicator/communicator_bindings.h
|
||||||
file_cache.o: communicator/communicator_bindings.h
|
file_cache.o: communicator/communicator_bindings.h
|
||||||
|
|
||||||
|
|||||||
@@ -17,12 +17,30 @@ rest_broker = []
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
axum.workspace = true
|
axum.workspace = true
|
||||||
|
bytes.workspace = true
|
||||||
|
clashmap.workspace = true
|
||||||
http.workspace = true
|
http.workspace = true
|
||||||
|
libc.workspace = true
|
||||||
|
nix.workspace = true
|
||||||
|
atomic_enum = "0.3.0"
|
||||||
|
measured.workspace = true
|
||||||
|
prometheus.workspace = true
|
||||||
|
prost.workspace = true
|
||||||
|
strum_macros.workspace = true
|
||||||
|
thiserror.workspace = true
|
||||||
|
tonic = { workspace = true, default-features = false, features=["codegen", "prost", "transport"] }
|
||||||
tokio = { workspace = true, features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] }
|
tokio = { workspace = true, features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] }
|
||||||
|
tokio-pipe = { version = "0.2.12" }
|
||||||
tracing.workspace = true
|
tracing.workspace = true
|
||||||
tracing-subscriber.workspace = true
|
tracing-subscriber.workspace = true
|
||||||
|
|
||||||
measured.workspace = true
|
uring-common = { workspace = true, features = ["bytes"] }
|
||||||
|
|
||||||
|
pageserver_client_grpc.workspace = true
|
||||||
|
pageserver_api.workspace = true
|
||||||
|
pageserver_page_api.workspace = true
|
||||||
|
|
||||||
|
neon-shmem.workspace = true
|
||||||
utils.workspace = true
|
utils.workspace = true
|
||||||
workspace_hack = { version = "0.1", path = "../../../workspace_hack" }
|
workspace_hack = { version = "0.1", path = "../../../workspace_hack" }
|
||||||
|
|
||||||
|
|||||||
@@ -3,9 +3,18 @@
|
|||||||
This package provides the so-called "compute-pageserver communicator",
|
This package provides the so-called "compute-pageserver communicator",
|
||||||
or just "communicator" in short. The communicator is a separate
|
or just "communicator" in short. The communicator is a separate
|
||||||
background worker process that runs in the PostgreSQL server. It's
|
background worker process that runs in the PostgreSQL server. It's
|
||||||
part of the neon extension. Currently, it only provides an HTTP
|
part of the neon extension.
|
||||||
endpoint for metrics, but in the future it will evolve to handle all
|
|
||||||
communications with the pageservers.
|
The commuicator handles the communication with the pageservers, and
|
||||||
|
also provides an HTTP endpoint for metrics over a local Unix Domain
|
||||||
|
socket (aka. the "communicator control socket"). On the PostgreSQL
|
||||||
|
side, the glue code in pgxn/neon/ uses the communicator to implement
|
||||||
|
the PostgreSQL Storage Manager (SMGR) interface.
|
||||||
|
|
||||||
|
## Design criteria
|
||||||
|
|
||||||
|
- Low latency
|
||||||
|
- Saturate a 10 Gbit / s network interface without becoming a bottleneck
|
||||||
|
|
||||||
## Source code view
|
## Source code view
|
||||||
|
|
||||||
@@ -14,10 +23,122 @@ pgxn/neon/communicator_process.c
|
|||||||
the glue that interacts with PostgreSQL code and the Rust
|
the glue that interacts with PostgreSQL code and the Rust
|
||||||
code in the communicator process.
|
code in the communicator process.
|
||||||
|
|
||||||
|
pgxn/neon/communicator_new.c
|
||||||
|
Contains the backend code that interacts with the communicator
|
||||||
|
process.
|
||||||
|
|
||||||
pgxn/neon/communicator/src/worker_process/
|
pgxn/neon/communicator/src/backend_interface.rs
|
||||||
Worker process main loop and glue code
|
The entry point for calls from each backend.
|
||||||
|
|
||||||
|
pgxn/neon/communicator/src/init.rs
|
||||||
|
Initialization at server startup
|
||||||
|
|
||||||
At compilation time, pgxn/neon/communicator/ produces a static
|
At compilation time, pgxn/neon/communicator/ produces a static
|
||||||
library, libcommunicator.a. It is linked to the neon.so extension
|
library, libcommunicator.a. It is linked to the neon.so extension
|
||||||
library.
|
library.
|
||||||
|
|
||||||
|
The real networking code, which is independent of PostgreSQL, is in
|
||||||
|
the pageserver/client_grpc crate.
|
||||||
|
|
||||||
|
## Process view
|
||||||
|
|
||||||
|
The communicator runs in a dedicated background worker process, the
|
||||||
|
"communicator process". The communicator uses a multi-threaded Tokio
|
||||||
|
runtime to execute the IO requests. So the communicator process has
|
||||||
|
multiple threads running. That's unusual for Postgres processes and
|
||||||
|
care must be taken to make that work.
|
||||||
|
|
||||||
|
### Backend <-> worker communication
|
||||||
|
|
||||||
|
Each backend has a number of I/O request slots in shared memory. The
|
||||||
|
slots are statically allocated for each backend, and must not be
|
||||||
|
accessed by other backends. The worker process reads requests from the
|
||||||
|
shared memory slots, and writes responses back to the slots.
|
||||||
|
|
||||||
|
Here's an example snapshot of the system, when two requests from two
|
||||||
|
different backends are in progress:
|
||||||
|
|
||||||
|
```
|
||||||
|
Backends Request slots Communicator process
|
||||||
|
--------- ------------- --------------------
|
||||||
|
|
||||||
|
Backend 1 1: Idle
|
||||||
|
2: Idle
|
||||||
|
3: Processing tokio task handling request 3
|
||||||
|
|
||||||
|
Backend 2 4: Completed
|
||||||
|
5: Processing tokio task handling request 5
|
||||||
|
6: Idle
|
||||||
|
|
||||||
|
... ...
|
||||||
|
```
|
||||||
|
|
||||||
|
To submit an IO request, the backend first picks one of its Idle
|
||||||
|
slots, writes the IO request in the slot, and updates it to
|
||||||
|
'Submitted' state. That transfers the ownership of the slot to the
|
||||||
|
worker process, until the worker process marks the request as
|
||||||
|
Completed. The worker process spawns a separate Tokio task for each
|
||||||
|
request.
|
||||||
|
|
||||||
|
To inform the worker process that a request slot has a pending IO
|
||||||
|
request, there's a pipe shared by the worker process and all backend
|
||||||
|
processes. The backend writes the index of the request slot to the
|
||||||
|
pipe after changing the slot's state to Submitted. This wakes up the
|
||||||
|
worker process.
|
||||||
|
|
||||||
|
(Note that the pipe is just used for wakeups, but the worker process
|
||||||
|
is free to pick up Submitted IO requests even without receiving the
|
||||||
|
wakeup. As of this writing, it doesn't do that, but it might be useful
|
||||||
|
in the future to reduce latency even further, for example.)
|
||||||
|
|
||||||
|
When the worker process has completed processing the request, it
|
||||||
|
writes the result back in the request slot. A GetPage request can also
|
||||||
|
contain a pointer to buffer in the shared buffer cache. In that case,
|
||||||
|
the worker process writes the resulting page contents directly to the
|
||||||
|
buffer, and just a result code in the request slot. It then updates
|
||||||
|
the 'state' field to Completed, which passes the owner ship back to
|
||||||
|
the originating backend. Finally, it signals the process Latch of the
|
||||||
|
originating backend, waking it up.
|
||||||
|
|
||||||
|
### Differences between PostgreSQL v16, v17 and v18
|
||||||
|
|
||||||
|
PostgreSQL v18 introduced the new AIO mechanism. The PostgreSQL AIO
|
||||||
|
mechanism uses a very similar mechanism as described in the previous
|
||||||
|
section, for the communication between AIO worker processes and
|
||||||
|
backends. With our communicator, the AIO worker processes are not
|
||||||
|
used, but we use the same PgAioHandle request slots as in upstream.
|
||||||
|
For Neon-specific IO requests like GetDbSize, a neon request slot is
|
||||||
|
used. But for the actual IO requests, the request slot merely contains
|
||||||
|
a pointer to the PgAioHandle slot. The worker process updates the
|
||||||
|
status of that, calls the IO callbacks upon completionetc, just like
|
||||||
|
the upstream AIO worker processes do.
|
||||||
|
|
||||||
|
## Sequence diagram
|
||||||
|
|
||||||
|
neon
|
||||||
|
PostgreSQL extension backend_interface.rs worker_process.rs processor tonic
|
||||||
|
| . . . .
|
||||||
|
| smgr_read() . . . .
|
||||||
|
+-------------> + . . .
|
||||||
|
. | . . .
|
||||||
|
. | rcommunicator_ . . .
|
||||||
|
. | get_page_at_lsn . . .
|
||||||
|
. +------------------> + . .
|
||||||
|
| . .
|
||||||
|
| write request to . . .
|
||||||
|
| slot . .
|
||||||
|
| . .
|
||||||
|
| . .
|
||||||
|
| submit_request() . .
|
||||||
|
+-----------------> + .
|
||||||
|
| | .
|
||||||
|
| | db_size_request . .
|
||||||
|
+---------------->.
|
||||||
|
. TODO
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### Compute <-> pageserver protocol
|
||||||
|
|
||||||
|
The protocol between Compute and the pageserver is based on gRPC. See `protos/`.
|
||||||
|
|
||||||
|
|||||||
224
pgxn/neon/communicator/src/backend_comms.rs
Normal file
224
pgxn/neon/communicator/src/backend_comms.rs
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
//! This module implements a request/response "slot" for submitting
|
||||||
|
//! requests from backends to the communicator process.
|
||||||
|
//!
|
||||||
|
//! NB: The "backend" side of this code runs in Postgres backend processes,
|
||||||
|
//! which means that it is not safe to use the 'tracing' crate for logging, nor
|
||||||
|
//! to launch threads or use tokio tasks!
|
||||||
|
|
||||||
|
use std::cell::UnsafeCell;
|
||||||
|
use std::sync::atomic::{AtomicI32, Ordering};
|
||||||
|
|
||||||
|
use crate::neon_request::{NeonIORequest, NeonIOResult};
|
||||||
|
|
||||||
|
use atomic_enum::atomic_enum;
|
||||||
|
|
||||||
|
/// One request/response slot. Each backend has its own set of slots that it
|
||||||
|
/// uses.
|
||||||
|
///
|
||||||
|
/// This is the moral equivalent of PgAioHandle for Postgres AIO requests
|
||||||
|
/// Like PgAioHandle, try to keep this small.
|
||||||
|
///
|
||||||
|
/// There is an array of these in shared memory. Therefore, this must be Sized.
|
||||||
|
///
|
||||||
|
/// ## Lifecycle of a request
|
||||||
|
///
|
||||||
|
/// A slot is always owned by either the backend process or the communicator
|
||||||
|
/// process, depending on the 'state'. Only the owning process is allowed to
|
||||||
|
/// read or modify the slot, except for reading the 'state' itself to check who
|
||||||
|
/// owns it.
|
||||||
|
///
|
||||||
|
/// A slot begins in the Idle state, where it is owned by the backend process.
|
||||||
|
/// To submit a request, the backend process fills the slot with the request
|
||||||
|
/// data, and changes it to the Submitted state. After changing the state, the
|
||||||
|
/// slot is owned by the communicator process, and the backend is not allowed
|
||||||
|
/// to access it until the communicator process marks it as Completed.
|
||||||
|
///
|
||||||
|
/// When the communicator process sees that the slot is in Submitted state, it
|
||||||
|
/// starts to process the request. After processing the request, it stores the
|
||||||
|
/// result in the slot, and changes the state to Completed. It is now owned by
|
||||||
|
/// the backend process again, which may now read the result, and reuse the
|
||||||
|
/// slot for a new request.
|
||||||
|
///
|
||||||
|
/// For correctness of the above protocol, we really only need two states:
|
||||||
|
/// "owned by backend" and "owned by communicator process". But to help with
|
||||||
|
/// debugging and better assertions, there are a few more states. When the
|
||||||
|
/// backend starts to fill in the request details in the slot, it first sets the
|
||||||
|
/// state from Idle to Filling, and when it's done with that, from Filling to
|
||||||
|
/// Submitted. In the Filling state, the slot is still owned by the
|
||||||
|
/// backend. Similarly, when the communicator process starts to process a
|
||||||
|
/// request, it sets it to Processing state first, but the slot is still owned
|
||||||
|
/// by the communicator process.
|
||||||
|
///
|
||||||
|
/// This struct doesn't handle waking up the communicator process when a request
|
||||||
|
/// has been submitted or when a response is ready. The 'owner_procno' is used
|
||||||
|
/// for waking up the backend on completion, but that happens elsewhere.
|
||||||
|
pub struct NeonIORequestSlot {
    /// Current stage in the request lifecycle. This determines which process
    /// (backend or communicator) owns the slot; see the "Lifecycle of a
    /// request" section above. Similar to PgAioHandleState.
    state: AtomicNeonIORequestSlotState,

    /// The owning process's ProcNumber. The worker process uses this to set the
    /// process's latch on completion.
    ///
    /// (This could be calculated from num_neon_request_slots_per_backend and
    /// the index of this slot in the overall 'neon_requst_slots array'. But we
    /// prefer the communicator process to not know how the request slots are
    /// divided between the backends.)
    owner_procno: AtomicI32,

    /// The submitted request. Valid from Filling until the slot returns to Idle.
    ///
    /// SAFETY: This is modified by submit_request(), after it has established
    /// ownership of the slot by setting state from Idle to Filling
    request: UnsafeCell<NeonIORequest>,

    /// Valid when state is Completed
    ///
    /// SAFETY: This is modified by RequestProcessingGuard::complete(). There
    /// can be only one RequestProcessingGuard outstanding for a slot at a time,
    /// because it is returned by start_processing_request() which checks the
    /// state, so RequestProcessingGuard has exclusive access to the slot.
    result: UnsafeCell<NeonIOResult>,
}

// SAFETY: The protocol described in the "Lifecycle of a request" section above
// ensures the safe access to the fields: only the process that owns the slot
// (per 'state') touches the UnsafeCell contents, and ownership hand-off is
// done with Release/Acquire transitions on 'state'.
unsafe impl Send for NeonIORequestSlot {}
unsafe impl Sync for NeonIORequestSlot {}
|
||||||
|
impl Default for NeonIORequestSlot {
|
||||||
|
fn default() -> NeonIORequestSlot {
|
||||||
|
NeonIORequestSlot {
|
||||||
|
owner_procno: AtomicI32::new(-1),
|
||||||
|
request: UnsafeCell::new(NeonIORequest::Empty),
|
||||||
|
result: UnsafeCell::new(NeonIOResult::Empty),
|
||||||
|
state: AtomicNeonIORequestSlotState::new(NeonIORequestSlotState::Idle),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[atomic_enum]
#[derive(Eq, PartialEq)]
pub enum NeonIORequestSlotState {
    /// Slot is unused and owned by the backend; ready for a new request.
    Idle,

    /// Backend is filling in the request
    Filling,

    /// Backend has submitted the request to the communicator, but the
    /// communicator process has not yet started processing it.
    Submitted,

    /// Communicator is processing the request
    Processing,

    /// Communicator has completed the request, and the 'result' field is now
    /// valid, but the backend has not read the result yet.
    Completed,
}
|
||||||
|
|
||||||
|
impl NeonIORequestSlot {
    /// Write a request to the slot, and mark it as Submitted.
    ///
    /// Note: This does not wake up the worker process to actually process
    /// the request. It's the caller's responsibility to do that.
    ///
    /// Panics if the slot is not currently Idle, i.e. if the caller tried to
    /// reuse a slot that is still in flight.
    pub fn submit_request(&self, request: &NeonIORequest, proc_number: i32) {
        // Verify that the slot is in Idle state previously, and put it in
        // Filling state.
        //
        // XXX: This step isn't strictly necessary. Assuming the caller didn't
        // screw up and try to use a slot that's already in use, we could fill
        // the slot and switch it directly from Idle to Submitted state.
        if let Err(s) = self.state.compare_exchange(
            NeonIORequestSlotState::Idle,
            NeonIORequestSlotState::Filling,
            Ordering::Relaxed,
            Ordering::Relaxed,
        ) {
            panic!("unexpected state in request slot: {s:?}");
        }

        // Fill in the request details. In the Filling state we have exclusive
        // ownership of the slot, so the non-atomic write is safe.
        self.owner_procno.store(proc_number, Ordering::Relaxed);
        unsafe { *self.request.get() = *request }

        // This synchronizes-with store/swap in [`start_processing_request`].
        // Note that this ensures that the previous non-atomic writes visible
        // to other threads too.
        self.state
            .store(NeonIORequestSlotState::Submitted, Ordering::Release);
    }

    /// Peek at the slot's current state (Relaxed load; suitable for status
    /// checks, not for reading the request/result fields).
    pub fn get_state(&self) -> NeonIORequestSlotState {
        self.state.load(Ordering::Relaxed)
    }

    /// If the request has been completed, consume the result and return it,
    /// returning the slot to Idle so it can be reused. Returns None while the
    /// request is still in flight.
    pub fn try_get_result(&self) -> Option<NeonIOResult> {
        // This synchronizes-with the store/swap in [`RequestProcessingGuard::completed`]
        let state = self.state.load(Ordering::Acquire);
        if state == NeonIORequestSlotState::Completed {
            let result = unsafe { *self.result.get() };
            // Hand the slot back to the backend for reuse.
            self.state
                .store(NeonIORequestSlotState::Idle, Ordering::Relaxed);
            Some(result)
        } else {
            None
        }
    }

    /// Read the IO request from the slot indicated in the wakeup
    ///
    /// NOTE(review): this currently never returns None — an unexpected state
    /// panics instead (see FIXME below). The Option return type anticipates a
    /// future where requests may be picked up speculatively.
    pub fn start_processing_request<'a>(&'a self) -> Option<RequestProcessingGuard<'a>> {
        // XXX: using atomic load rather than compare_exchange would be
        // sufficient here, as long as the communicator process has _some_ means
        // of tracking which requests it's already processing. That could be a
        // flag somewhere in communicator's private memory, for example.
        //
        // This synchronizes-with the store in [`submit_request`].
        if let Err(s) = self.state.compare_exchange(
            NeonIORequestSlotState::Submitted,
            NeonIORequestSlotState::Processing,
            Ordering::Acquire,
            Ordering::Relaxed,
        ) {
            // FIXME surprising state. This is unexpected at the moment, but if we
            // started to process requests more aggressively, without waiting for the
            // read from the pipe, then this could happen
            panic!("unexpected state in request slot: {s:?}");
        }

        Some(RequestProcessingGuard(self))
    }
}
|
||||||
|
|
||||||
|
/// [`NeonIORequestSlot::start_processing_request`] returns this guard object to
/// indicate that the the caller now "owns" the slot, until it calls
/// [`RequestProcessingGuard::completed`].
///
/// TODO: implement Drop on this, to mark the request as Aborted or Errored
/// if [`RequestProcessingGuard::completed`] is not called.
pub struct RequestProcessingGuard<'a>(&'a NeonIORequestSlot);

// SAFETY: the guard is handed out only once per request (the Submitted ->
// Processing transition in start_processing_request() is a compare-exchange),
// so it has exclusive access to the slot's request/result fields.
unsafe impl<'a> Send for RequestProcessingGuard<'a> {}
unsafe impl<'a> Sync for RequestProcessingGuard<'a> {}

impl<'a> RequestProcessingGuard<'a> {
    /// Borrow the request being processed. Valid for the guard's lifetime,
    /// since the slot stays in Processing state until completed() is called.
    pub fn get_request(&self) -> &NeonIORequest {
        unsafe { &*self.0.request.get() }
    }

    /// ProcNumber of the backend that submitted the request; used by the
    /// caller to set that process's latch on completion.
    pub fn get_owner_procno(&self) -> i32 {
        self.0.owner_procno.load(Ordering::Relaxed)
    }

    /// Store the result and mark the request Completed, handing ownership of
    /// the slot back to the backend. Consumes the guard.
    pub fn completed(self, result: NeonIOResult) {
        // Store the result to the slot. We still own the slot while it is in
        // the Processing state, so the non-atomic write is safe.
        unsafe {
            *self.0.result.get() = result;
        };

        // Mark the request as completed. After that, we no longer have
        // ownership of the slot, and must not modify it.
        let old_state = self
            .0
            .state
            .swap(NeonIORequestSlotState::Completed, Ordering::Release);
        assert!(old_state == NeonIORequestSlotState::Processing);
    }
}
|
||||||
296
pgxn/neon/communicator/src/backend_interface.rs
Normal file
296
pgxn/neon/communicator/src/backend_interface.rs
Normal file
@@ -0,0 +1,296 @@
|
|||||||
|
//! This code runs in each backend process. That means that launching Rust threads, panicking
|
||||||
|
//! etc. is forbidden!
|
||||||
|
|
||||||
|
use std::os::fd::OwnedFd;
|
||||||
|
|
||||||
|
use crate::backend_comms::NeonIORequestSlot;
|
||||||
|
use crate::init::CommunicatorInitStruct;
|
||||||
|
use crate::integrated_cache::{BackendCacheReadOp, IntegratedCacheReadAccess};
|
||||||
|
use crate::neon_request::{CCachedGetPageVResult, CLsn, COid};
|
||||||
|
use crate::neon_request::{NeonIORequest, NeonIOResult};
|
||||||
|
|
||||||
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
|
/// Per-backend state, created by rcommunicator_backend_init() and passed back
/// in from the C code on every call.
pub struct CommunicatorBackendStruct<'t> {
    // This backend's ProcNumber, recorded in slots so the communicator can
    // set our latch on completion.
    my_proc_number: i32,

    // All request slots in shared memory (not just this backend's share).
    neon_request_slots: &'t [NeonIORequestSlot],

    // Write end of the pipe used to wake up the communicator process.
    submission_pipe_write_fd: OwnedFd,

    // An in-progress cached read, held between bcomm_start_get_page_v_request()
    // returning -1 and bcomm_finish_cache_read().
    pending_cache_read_op: Option<BackendCacheReadOp<'t>>,

    // Read-only view into the integrated cache shared memory.
    integrated_cache: &'t IntegratedCacheReadAccess<'t>,
}
|
||||||
|
|
||||||
|
/// Per-backend initialization: build the CommunicatorBackendStruct for this
/// process from the init struct inherited from the postmaster.
///
/// Called once per backend; the returned reference is leaked deliberately
/// (Box::leak) because it must live for the rest of the process's lifetime
/// and is handed back to us on every subsequent C call.
///
/// Panics if 'my_proc_number' is negative (no valid ProcNumber).
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_backend_init(
    cis: Box<CommunicatorInitStruct>,
    my_proc_number: i32,
) -> &'static mut CommunicatorBackendStruct<'static> {
    if my_proc_number < 0 {
        panic!("cannot attach to communicator shared memory with procnumber {my_proc_number}");
    }

    // Leaked: lives for the rest of the backend process.
    let integrated_cache = Box::leak(Box::new(cis.integrated_cache_init_struct.backend_init()));

    let bs: &'static mut CommunicatorBackendStruct =
        Box::leak(Box::new(CommunicatorBackendStruct {
            my_proc_number,
            neon_request_slots: cis.neon_request_slots,

            submission_pipe_write_fd: cis.submission_pipe_write_fd,
            pending_cache_read_op: None,

            integrated_cache,
        }));
    bs
}
|
||||||
|
|
||||||
|
/// Start a request. You can poll for its completion and get the result by
|
||||||
|
/// calling bcomm_poll_dbsize_request_completion(). The communicator will wake
|
||||||
|
/// us up by setting our process latch, so to wait for the completion, wait on
|
||||||
|
/// the latch and call bcomm_poll_dbsize_request_completion() every time the
|
||||||
|
/// latch is set.
|
||||||
|
///
|
||||||
|
/// Safety: The C caller must ensure that the references are valid.
|
||||||
|
/// The requested slot must be free, or this panics.
|
||||||
|
#[unsafe(no_mangle)]
|
||||||
|
pub extern "C" fn bcomm_start_io_request(
|
||||||
|
bs: &'_ mut CommunicatorBackendStruct,
|
||||||
|
slot_idx: i32,
|
||||||
|
request: &NeonIORequest,
|
||||||
|
immediate_result_ptr: &mut NeonIOResult,
|
||||||
|
) -> i32 {
|
||||||
|
assert!(bs.pending_cache_read_op.is_none());
|
||||||
|
|
||||||
|
// Check if the request can be satisfied from the cache first
|
||||||
|
if let NeonIORequest::RelSize(req) = request {
|
||||||
|
if let Some(nblocks) = bs.integrated_cache.get_rel_size(&req.reltag()) {
|
||||||
|
*immediate_result_ptr = NeonIOResult::RelSize(nblocks);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create neon request and submit it
|
||||||
|
bs.start_neon_io_request(slot_idx, request);
|
||||||
|
|
||||||
|
slot_idx
|
||||||
|
}
|
||||||
|
|
||||||
|
#[unsafe(no_mangle)]
|
||||||
|
pub extern "C" fn bcomm_start_get_page_v_request(
|
||||||
|
bs: &mut CommunicatorBackendStruct,
|
||||||
|
slot_idx: i32,
|
||||||
|
request: &NeonIORequest,
|
||||||
|
immediate_result_ptr: &mut CCachedGetPageVResult,
|
||||||
|
) -> i32 {
|
||||||
|
let NeonIORequest::GetPageV(get_pagev_request) = request else {
|
||||||
|
panic!("invalid request passed to bcomm_start_get_page_v_request()");
|
||||||
|
};
|
||||||
|
assert!(matches!(request, NeonIORequest::GetPageV(_)));
|
||||||
|
assert!(bs.pending_cache_read_op.is_none());
|
||||||
|
|
||||||
|
// Check if the request can be satisfied from the cache first
|
||||||
|
let mut all_cached = true;
|
||||||
|
let mut read_op = bs.integrated_cache.start_read_op();
|
||||||
|
for i in 0..get_pagev_request.nblocks {
|
||||||
|
if let Some(cache_block) = read_op.get_page(
|
||||||
|
&get_pagev_request.reltag(),
|
||||||
|
get_pagev_request.block_number + i as u32,
|
||||||
|
) {
|
||||||
|
immediate_result_ptr.cache_block_numbers[i as usize] = cache_block;
|
||||||
|
} else {
|
||||||
|
// not found in cache
|
||||||
|
all_cached = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if all_cached {
|
||||||
|
bs.pending_cache_read_op = Some(read_op);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create neon request and submit it
|
||||||
|
bs.start_neon_io_request(slot_idx, request);
|
||||||
|
|
||||||
|
slot_idx
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if a request has completed. Returns:
|
||||||
|
///
|
||||||
|
/// -1 if the request is still being processed
|
||||||
|
/// 0 on success
|
||||||
|
#[unsafe(no_mangle)]
|
||||||
|
pub extern "C" fn bcomm_poll_request_completion(
|
||||||
|
bs: &mut CommunicatorBackendStruct,
|
||||||
|
request_slot_idx: u32,
|
||||||
|
result_p: &mut NeonIOResult,
|
||||||
|
) -> i32 {
|
||||||
|
match bs.neon_request_slots[request_slot_idx as usize].try_get_result() {
|
||||||
|
None => -1, // still processing
|
||||||
|
Some(result) => {
|
||||||
|
*result_p = result;
|
||||||
|
0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if a request has completed. Returns:
|
||||||
|
///
|
||||||
|
/// 'false' if the slot is Idle. The backend process has ownership.
|
||||||
|
/// 'true' if the slot is busy, and should be polled for result.
|
||||||
|
#[unsafe(no_mangle)]
|
||||||
|
pub extern "C" fn bcomm_get_request_slot_status(
|
||||||
|
bs: &mut CommunicatorBackendStruct,
|
||||||
|
request_slot_idx: u32,
|
||||||
|
) -> bool {
|
||||||
|
use crate::backend_comms::NeonIORequestSlotState;
|
||||||
|
match bs.neon_request_slots[request_slot_idx as usize].get_state() {
|
||||||
|
NeonIORequestSlotState::Idle => false,
|
||||||
|
NeonIORequestSlotState::Filling => {
|
||||||
|
// 'false' would be the right result here. However, this
|
||||||
|
// is a very transient state. The C code should never
|
||||||
|
// leave a slot in this state, so if it sees that,
|
||||||
|
// something's gone wrong and it's not clear what to do
|
||||||
|
// with it.
|
||||||
|
panic!("unexpected Filling state in request slot {request_slot_idx}");
|
||||||
|
}
|
||||||
|
NeonIORequestSlotState::Submitted => true,
|
||||||
|
NeonIORequestSlotState::Processing => true,
|
||||||
|
NeonIORequestSlotState::Completed => true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// LFC functions
|
||||||
|
|
||||||
|
/// Finish a local file cache read
|
||||||
|
///
|
||||||
|
//
|
||||||
|
#[unsafe(no_mangle)]
|
||||||
|
pub extern "C" fn bcomm_finish_cache_read(bs: &mut CommunicatorBackendStruct) -> bool {
|
||||||
|
if let Some(op) = bs.pending_cache_read_op.take() {
|
||||||
|
op.finish()
|
||||||
|
} else {
|
||||||
|
panic!("bcomm_finish_cache_read() called with no cached read pending");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if LFC contains the given buffer, and update its last-written LSN if not.
///
/// This is used in WAL replay in read replica, to skip updating pages that are
/// not in cache.
///
/// The (spc_oid, db_oid, rel_number, fork_number) arguments are assembled into
/// a RelTag; 'lsn' is the raw LSN value from C, wrapped into the Lsn newtype.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_update_lw_lsn_for_block_if_not_cached(
    bs: &mut CommunicatorBackendStruct,
    spc_oid: COid,
    db_oid: COid,
    rel_number: u32,
    fork_number: u8,
    block_number: u32,
    lsn: CLsn,
) -> bool {
    bs.integrated_cache.update_lw_lsn_for_block_if_not_cached(
        &pageserver_page_api::RelTag {
            spcnode: spc_oid,
            dbnode: db_oid,
            relnode: rel_number,
            forknum: fork_number,
        },
        block_number,
        Lsn(lsn),
    )
}
|
||||||
|
|
||||||
|
/// Cursor for iterating over LFC contents from C code.
///
/// repr(C) because the struct is allocated and read by the C caller. The
/// public fields hold the entry most recently produced by
/// bcomm_cache_iterate_next(); 'next_bucket' is the iterator's private
/// position.
#[repr(C)]
#[derive(Clone, Debug)]
pub struct FileCacheIterator {
    // Next cache bucket to examine; advanced by bcomm_cache_iterate_next().
    next_bucket: u64,

    pub spc_oid: COid,
    pub db_oid: COid,
    pub rel_number: u32,
    pub fork_number: u8,
    pub block_number: u32,
}
|
||||||
|
|
||||||
|
/// Iterate over LFC contents
///
/// Resets the iterator to the first bucket. The caller owns 'iter' and must
/// pass a valid, writable pointer.
#[allow(clippy::missing_safety_doc)]
#[unsafe(no_mangle)]
pub unsafe extern "C" fn bcomm_cache_iterate_begin(
    _bs: &mut CommunicatorBackendStruct,
    iter: *mut FileCacheIterator,
) {
    // SAFETY: the C caller guarantees 'iter' points to a valid FileCacheIterator.
    unsafe { (*iter).next_bucket = 0 };
}
|
||||||
|
|
||||||
|
/// Advance the iterator to the next occupied cache bucket.
///
/// Returns true and fills the public fields of 'iter' with the entry's
/// relation/block identity when an occupied bucket is found; returns false
/// when the end of the bucket array is reached. Vacant buckets are skipped.
#[allow(clippy::missing_safety_doc)]
#[unsafe(no_mangle)]
pub unsafe extern "C" fn bcomm_cache_iterate_next(
    bs: &mut CommunicatorBackendStruct,
    iter: *mut FileCacheIterator,
) -> bool {
    use crate::integrated_cache::GetBucketResult;
    loop {
        // SAFETY: the C caller guarantees 'iter' points to a valid,
        // exclusively-owned FileCacheIterator for the duration of the call.
        let next_bucket = unsafe { (*iter).next_bucket } as usize;
        match bs.integrated_cache.get_bucket(next_bucket) {
            GetBucketResult::Occupied(rel, blk) => {
                // Found an entry: publish it to the caller and advance past it.
                unsafe {
                    (*iter).spc_oid = rel.spcnode;
                    (*iter).db_oid = rel.dbnode;
                    (*iter).rel_number = rel.relnode;
                    (*iter).fork_number = rel.forknum;
                    (*iter).block_number = blk;

                    (*iter).next_bucket += 1;
                }
                break true;
            }
            GetBucketResult::Vacant => {
                // Empty bucket: skip it.
                unsafe {
                    (*iter).next_bucket += 1;
                }
                continue;
            }
            GetBucketResult::OutOfBounds => {
                // Past the last bucket: iteration is done.
                break false;
            }
        }
    }
}
|
||||||
|
|
||||||
|
/// Number of cache buckets (pages) currently in use, for C-side statistics.
#[allow(clippy::missing_safety_doc)]
#[unsafe(no_mangle)]
pub unsafe extern "C" fn bcomm_cache_get_num_pages_used(bs: &mut CommunicatorBackendStruct) -> u64 {
    bs.integrated_cache.get_num_buckets_in_use() as u64
}
|
||||||
|
|
||||||
|
impl<'t> CommunicatorBackendStruct<'t> {
    /// Submit 'request' in the given slot and wake up the communicator.
    /// The slot must be free, or this panics.
    pub(crate) fn start_neon_io_request(&mut self, request_slot_idx: i32, request: &NeonIORequest) {
        let my_proc_number = self.my_proc_number;

        self.neon_request_slots[request_slot_idx as usize].submit_request(request, my_proc_number);

        // Tell the communicator about it
        self.notify_about_request(request_slot_idx);
    }

    /// Send a wakeup to the communicator process
    fn notify_about_request(self: &CommunicatorBackendStruct<'t>, request_slot_idx: i32) {
        // wake up communicator by writing the idx to the submission pipe
        //

        // This can block, if the pipe is full. That should be very rare,
        // because the communicator tries hard to drain the pipe to prevent
        // that. Also, there's a natural upper bound on how many wakeups can be
        // queued up: there is only a limited number of request slots for each
        // backend.
        //
        // If it does block very briefly, that's not too serious.
        let idxbuf = request_slot_idx.to_ne_bytes();

        // NOTE(review): the result is ignored, so a short write or error would
        // silently lose the wakeup (the request would sit unprocessed until
        // some other event drains the slot) — confirm intended behavior.
        let _res = nix::unistd::write(&self.submission_pipe_write_fd, &idxbuf);
        // FIXME: check result, return any errors
    }
}
|
||||||
156
pgxn/neon/communicator/src/file_cache.rs
Normal file
156
pgxn/neon/communicator/src/file_cache.rs
Normal file
@@ -0,0 +1,156 @@
|
|||||||
|
//! Implement the "low-level" parts of the file cache.
|
||||||
|
//!
|
||||||
|
//! This module just deals with reading and writing the file, and keeping track
|
||||||
|
//! which blocks in the cache file are in use and which are free. The "high
|
||||||
|
//! level" parts of tracking which block in the cache file corresponds to which
|
||||||
|
//! relation block is handled in 'integrated_cache' instead.
|
||||||
|
//!
|
||||||
|
//! This module is only used to access the file from the communicator
|
||||||
|
//! process. The backend processes *also* read the file (and sometimes also
|
||||||
|
//! write it? ), but the backends use direct C library calls for that.
|
||||||
|
use std::fs::File;
|
||||||
|
use std::os::unix::fs::FileExt;
|
||||||
|
use std::path::Path;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::sync::Mutex;
|
||||||
|
|
||||||
|
use measured::metric;
|
||||||
|
use measured::metric::MetricEncoding;
|
||||||
|
use measured::metric::gauge::GaugeState;
|
||||||
|
use measured::{Gauge, MetricGroup};
|
||||||
|
|
||||||
|
use crate::BLCKSZ;
|
||||||
|
|
||||||
|
use tokio::task::spawn_blocking;
|
||||||
|
|
||||||
|
/// Index of an 8KiB block within the cache file.
pub type CacheBlock = u64;

/// Sentinel meaning "no cache block" (u64::MAX is never a valid block index).
pub const INVALID_CACHE_BLOCK: CacheBlock = u64::MAX;
|
||||||
|
|
||||||
|
/// Handle to the local file cache file, used by the communicator process.
pub struct FileCache {
    // Arc so that blocking read/write tasks can hold the file open
    // independently of this struct's lifetime.
    file: Arc<File>,

    // Tracks which blocks of the file are free; see FreeList.
    free_list: Mutex<FreeList>,

    // Gauges refreshed on scrape in the MetricGroup impl below.
    metrics: FileCacheMetricGroup,
}
|
||||||
|
|
||||||
|
/// Prometheus-style gauges for the file cache; values are refreshed from the
/// free list each time the group is collected.
#[derive(MetricGroup)]
#[metric(new())]
struct FileCacheMetricGroup {
    /// Local File Cache size in 8KiB blocks
    max_blocks: Gauge,

    /// Number of free 8KiB blocks in Local File Cache
    num_free_blocks: Gauge,
}
|
||||||
|
|
||||||
|
// TODO: We keep track of all free blocks in this vec. That doesn't really scale.
// Idea: when free_blocks fills up with more than 1024 entries, write them all to
// one block on disk.
#[derive(Debug)]
struct FreeList {
    // Blocks in [next_free_block, max_blocks) have never been handed out;
    // allocation bumps this watermark when 'free_blocks' is empty.
    next_free_block: CacheBlock,
    // Total capacity of the cache file, in blocks.
    max_blocks: u64,

    // Blocks below the watermark that have been returned via dealloc_block().
    free_blocks: Vec<CacheBlock>,
}
|
||||||
|
|
||||||
|
impl FileCache {
    /// Create a new cache file at 'file_cache_path' with room for
    /// 'initial_size' 8KiB blocks. Any existing file is truncated.
    ///
    /// Sizes below 100 blocks are bumped to 100 with a warning.
    pub fn new(file_cache_path: &Path, mut initial_size: u64) -> Result<FileCache, std::io::Error> {
        if initial_size < 100 {
            tracing::warn!(
                "min size for file cache is 100 blocks, {} requested",
                initial_size
            );
            initial_size = 100;
        }

        let file = std::fs::OpenOptions::new()
            .read(true)
            .write(true)
            .truncate(true)
            .create(true)
            .open(file_cache_path)?;

        tracing::info!("initialized file cache with {} blocks", initial_size);

        Ok(FileCache {
            file: Arc::new(file),
            free_list: Mutex::new(FreeList {
                next_free_block: 0,
                max_blocks: initial_size,
                free_blocks: Vec::new(),
            }),
            metrics: FileCacheMetricGroup::new(),
        })
    }

    // File cache management

    /// Read one 8KiB block from the cache file into 'dst'.
    ///
    /// NOTE(review): the raw slice into 'dst' is moved into a spawn_blocking
    /// task while 'dst' itself stays owned by this future. If this future is
    /// dropped before the blocking task runs, the task could write into a
    /// freed buffer — confirm callers never cancel this future, or move the
    /// buffer into the closure instead.
    pub async fn read_block(
        &self,
        cache_block: CacheBlock,
        mut dst: impl uring_common::buf::IoBufMut + Send + Sync,
    ) -> Result<(), std::io::Error> {
        assert!(dst.bytes_total() == BLCKSZ);
        let file = self.file.clone();

        let dst_ref = unsafe { std::slice::from_raw_parts_mut(dst.stable_mut_ptr(), BLCKSZ) };

        // read_exact_at is blocking I/O, so run it on the blocking pool.
        spawn_blocking(move || file.read_exact_at(dst_ref, cache_block * BLCKSZ as u64)).await??;
        Ok(())
    }

    /// Write one 8KiB block from 'src' to the cache file.
    ///
    /// NOTE(review): same cancellation-safety caveat as read_block() above.
    pub async fn write_block(
        &self,
        cache_block: CacheBlock,
        src: impl uring_common::buf::IoBuf + Send + Sync,
    ) -> Result<(), std::io::Error> {
        assert!(src.bytes_init() == BLCKSZ);
        let file = self.file.clone();

        let src_ref = unsafe { std::slice::from_raw_parts(src.stable_ptr(), BLCKSZ) };

        spawn_blocking(move || file.write_all_at(src_ref, cache_block * BLCKSZ as u64)).await??;

        Ok(())
    }

    /// Allocate a free cache block, preferring recycled blocks over extending
    /// the high-watermark. Returns None when the cache is full.
    pub fn alloc_block(&self) -> Option<CacheBlock> {
        let mut free_list = self.free_list.lock().unwrap();
        // Reuse a previously freed block if there is one.
        if let Some(x) = free_list.free_blocks.pop() {
            return Some(x);
        }
        // Otherwise hand out a never-used block below capacity.
        if free_list.next_free_block < free_list.max_blocks {
            let result = free_list.next_free_block;
            free_list.next_free_block += 1;
            return Some(result);
        }
        None
    }

    /// Return a block to the free list for reuse.
    ///
    /// NOTE(review): no double-free check; freeing the same block twice would
    /// let it be handed out to two callers.
    pub fn dealloc_block(&self, cache_block: CacheBlock) {
        let mut free_list = self.free_list.lock().unwrap();
        free_list.free_blocks.push(cache_block);
    }
}
|
||||||
|
|
||||||
|
impl<T: metric::group::Encoding> MetricGroup<T> for FileCache
|
||||||
|
where
|
||||||
|
GaugeState: MetricEncoding<T>,
|
||||||
|
{
|
||||||
|
fn collect_group_into(&self, enc: &mut T) -> Result<(), <T as metric::group::Encoding>::Err> {
|
||||||
|
// Update the gauges with fresh values first
|
||||||
|
{
|
||||||
|
let free_list = self.free_list.lock().unwrap();
|
||||||
|
self.metrics.max_blocks.set(free_list.max_blocks as i64);
|
||||||
|
|
||||||
|
let total_free_blocks: i64 = free_list.free_blocks.len() as i64
|
||||||
|
+ (free_list.max_blocks as i64 - free_list.next_free_block as i64);
|
||||||
|
self.metrics.num_free_blocks.set(total_free_blocks);
|
||||||
|
}
|
||||||
|
|
||||||
|
self.metrics.collect_group_into(enc)
|
||||||
|
}
|
||||||
|
}
|
||||||
107
pgxn/neon/communicator/src/global_allocator.rs
Normal file
107
pgxn/neon/communicator/src/global_allocator.rs
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
//! Global allocator, for tracking memory usage of the Rust parts
|
||||||
|
//!
|
||||||
|
//! Postgres is designed to handle allocation failure (ie. malloc() returning NULL) gracefully. It
|
||||||
|
//! rolls backs the transaction and gives the user an "ERROR: out of memory" error. Rust code
|
||||||
|
//! however panics if an allocation fails. We don't want that to ever happen, because an unhandled
|
||||||
|
//! panic leads to Postgres crash and restart. Our strategy is to pre-allocate a large enough chunk
|
||||||
|
//! of memory for use by the Rust code, so that the allocations never fail.
|
||||||
|
//!
|
||||||
|
//! To pick the size for the pre-allocated chunk, we have a metric to track the high watermark
|
||||||
|
//! memory usage of all the Rust allocations in total.
|
||||||
|
//!
|
||||||
|
//! TODO:
|
||||||
|
//!
|
||||||
|
//! - Currently we just export the metrics. Actual allocations are still just passed through to
|
||||||
|
//! the system allocator.
|
||||||
|
//! - Take padding etc. overhead into account
|
||||||
|
|
||||||
|
use std::alloc::{GlobalAlloc, Layout, System};
|
||||||
|
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
|
||||||
|
|
||||||
|
use measured::metric;
|
||||||
|
use measured::metric::MetricEncoding;
|
||||||
|
use measured::metric::gauge::GaugeState;
|
||||||
|
use measured::{Gauge, MetricGroup};
|
||||||
|
|
||||||
|
/// Counting wrapper around the system allocator; installed as the global
/// allocator below so we can export Rust-side memory usage as metrics.
pub(crate) struct MyAllocator {
    // Cumulative number of alloc() calls.
    allocations: AtomicU64,
    // Cumulative number of dealloc() calls.
    deallocations: AtomicU64,

    // Bytes currently outstanding (sum of live allocation sizes; padding
    // overhead not included, see module TODO).
    allocated: AtomicUsize,
    // High watermark of 'allocated'.
    high: AtomicUsize,
}
|
||||||
|
|
||||||
|
/// Gauges mirroring MyAllocator's counters; refreshed on each scrape by
/// MyAllocatorCollector.
#[derive(MetricGroup)]
#[metric(new())]
struct MyAllocatorMetricGroup {
    /// Number of allocations in Rust code
    communicator_mem_allocations: Gauge,

    /// Number of deallocations in Rust code
    communicator_mem_deallocations: Gauge,

    /// Bytes currently allocated
    communicator_mem_allocated: Gauge,

    /// High watermark of allocated bytes
    communicator_mem_high: Gauge,
}
|
||||||
|
|
||||||
|
unsafe impl GlobalAlloc for MyAllocator {
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        self.allocations.fetch_add(1, Ordering::Relaxed);
        // fetch_add returns the PRE-add value, so add layout.size() locally to
        // get the post-allocation total for the high-watermark update.
        let mut allocated = self.allocated.fetch_add(layout.size(), Ordering::Relaxed);
        allocated += layout.size();
        self.high.fetch_max(allocated, Ordering::Relaxed);
        // SAFETY: same contract as the caller's; we merely delegate to System.
        unsafe { System.alloc(layout) }
    }

    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
        self.deallocations.fetch_add(1, Ordering::Relaxed);
        self.allocated.fetch_sub(layout.size(), Ordering::Relaxed);
        // SAFETY: ptr/layout come from a matching System.alloc per GlobalAlloc's contract.
        unsafe { System.dealloc(ptr, layout) }
    }
}
|
||||||
|
|
||||||
|
// Install the counting allocator process-wide; all Rust allocations in this
// crate go through it.
#[global_allocator]
static GLOBAL: MyAllocator = MyAllocator {
    allocations: AtomicU64::new(0),
    deallocations: AtomicU64::new(0),
    allocated: AtomicUsize::new(0),
    high: AtomicUsize::new(0),
};
|
||||||
|
|
||||||
|
pub(crate) struct MyAllocatorCollector {
|
||||||
|
metrics: MyAllocatorMetricGroup,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MyAllocatorCollector {
|
||||||
|
pub(crate) fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
metrics: MyAllocatorMetricGroup::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: metric::group::Encoding> MetricGroup<T> for MyAllocatorCollector
|
||||||
|
where
|
||||||
|
GaugeState: MetricEncoding<T>,
|
||||||
|
{
|
||||||
|
fn collect_group_into(&self, enc: &mut T) -> Result<(), <T as metric::group::Encoding>::Err> {
|
||||||
|
// Update the gauges with fresh values first
|
||||||
|
self.metrics
|
||||||
|
.communicator_mem_allocations
|
||||||
|
.set(GLOBAL.allocations.load(Ordering::Relaxed) as i64);
|
||||||
|
self.metrics
|
||||||
|
.communicator_mem_deallocations
|
||||||
|
.set(GLOBAL.allocations.load(Ordering::Relaxed) as i64);
|
||||||
|
self.metrics
|
||||||
|
.communicator_mem_allocated
|
||||||
|
.set(GLOBAL.allocated.load(Ordering::Relaxed) as i64);
|
||||||
|
self.metrics
|
||||||
|
.communicator_mem_high
|
||||||
|
.set(GLOBAL.high.load(Ordering::Relaxed) as i64);
|
||||||
|
|
||||||
|
self.metrics.collect_group_into(enc)
|
||||||
|
}
|
||||||
|
}
|
||||||
166
pgxn/neon/communicator/src/init.rs
Normal file
166
pgxn/neon/communicator/src/init.rs
Normal file
@@ -0,0 +1,166 @@
|
|||||||
|
//! Initialization functions. These are executed in the postmaster process,
|
||||||
|
//! at different stages of server startup.
|
||||||
|
//!
|
||||||
|
//!
|
||||||
|
//! Communicator initialization steps:
|
||||||
|
//!
|
||||||
|
//! 1. At postmaster startup, before shared memory is allocated,
|
||||||
|
//! rcommunicator_shmem_size() is called to get the amount of
|
||||||
|
//! shared memory that this module needs.
|
||||||
|
//!
|
||||||
|
//! 2. Later, after the shared memory has been allocated,
|
||||||
|
//! rcommunicator_shmem_init() is called to initialize the shmem
|
||||||
|
//! area.
|
||||||
|
//!
|
||||||
|
//! Per process initialization:
|
||||||
|
//!
|
||||||
|
//! When a backend process starts up, it calls rcommunicator_backend_init().
|
||||||
|
//! In the communicator worker process, other functions are called, see
|
||||||
|
//! `worker_process` module.
|
||||||
|
|
||||||
|
use std::ffi::c_int;
|
||||||
|
use std::mem;
|
||||||
|
use std::mem::MaybeUninit;
|
||||||
|
use std::os::fd::OwnedFd;
|
||||||
|
|
||||||
|
use crate::backend_comms::NeonIORequestSlot;
|
||||||
|
use crate::integrated_cache::IntegratedCacheInitStruct;
|
||||||
|
|
||||||
|
/// This struct is created in the postmaster process, and inherited to
/// the communicator process and all backend processes through fork()
#[repr(C)]
pub struct CommunicatorInitStruct {
    // Pipe used to submit IO requests to the communicator process.
    // NOTE(review): presumably backends write to `write_fd` and the
    // communicator reads from `read_fd` — confirm against the C side.
    pub submission_pipe_read_fd: OwnedFd,
    pub submission_pipe_write_fd: OwnedFd,

    // Shared memory data structures
    // Length of `neon_request_slots`, duplicated for the C callers.
    pub num_neon_request_slots: u32,

    // Request slot array living in the fixed-size shared memory area
    // (carved out in rcommunicator_shmem_init()).
    pub neon_request_slots: &'static [NeonIORequestSlot],

    // Handles for attaching to the integrated cache's shared structures.
    pub integrated_cache_init_struct: IntegratedCacheInitStruct<'static>,
}
|
||||||
|
|
||||||
|
impl std::fmt::Debug for CommunicatorInitStruct {
|
||||||
|
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||||
|
fmt.debug_struct("CommunicatorInitStruct")
|
||||||
|
.field("submission_pipe_read_fd", &self.submission_pipe_read_fd)
|
||||||
|
.field("submission_pipe_write_fd", &self.submission_pipe_write_fd)
|
||||||
|
.field("num_neon_request_slots", &self.num_neon_request_slots)
|
||||||
|
.field("neon_request_slots length", &self.neon_request_slots.len())
|
||||||
|
.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[unsafe(no_mangle)]
|
||||||
|
pub extern "C" fn rcommunicator_shmem_size(num_neon_request_slots: u32) -> u64 {
|
||||||
|
let mut size = 0;
|
||||||
|
|
||||||
|
size += mem::size_of::<NeonIORequestSlot>() * num_neon_request_slots as usize;
|
||||||
|
|
||||||
|
// For integrated_cache's Allocator. TODO: make this adjustable
|
||||||
|
size += IntegratedCacheInitStruct::shmem_size();
|
||||||
|
|
||||||
|
size as u64
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Initialize the shared memory segment. Returns a backend-private
/// struct, which will be inherited by backend processes through fork
///
/// `shmem_area_ptr`/`shmem_area_len` describe the area reserved based on
/// rcommunicator_shmem_size(). The pipe fds are raw C file descriptors;
/// this function takes ownership of them (they are wrapped in `OwnedFd`
/// and closed on drop), so the caller must not close or reuse them.
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_shmem_init(
    submission_pipe_read_fd: c_int,
    submission_pipe_write_fd: c_int,
    num_neon_request_slots: u32,
    shmem_area_ptr: *mut MaybeUninit<u8>,
    shmem_area_len: u64,
    initial_file_cache_size: u64,
    max_file_cache_size: u64,
) -> &'static mut CommunicatorInitStruct {
    // SAFETY: the C caller hands us an exclusively-owned area of at least
    // `shmem_area_len` bytes that lives for the rest of the process
    // lifetime (hence 'static) — TODO confirm the caller upholds this.
    let shmem_area: &'static mut [MaybeUninit<u8>] =
        unsafe { std::slice::from_raw_parts_mut(shmem_area_ptr, shmem_area_len as usize) };

    // Carve the request slot array out of the start of the area.
    let (neon_request_slots, remaining_area) =
        alloc_array_from_slice::<NeonIORequestSlot>(shmem_area, num_neon_request_slots as usize);

    for slot in neon_request_slots.iter_mut() {
        slot.write(NeonIORequestSlot::default());
    }

    // 'neon_request_slots' is initialized now. (MaybeUninit::slice_assume_init_mut() is nightly-only
    // as of this writing.)
    // SAFETY: every element was written in the loop above, and
    // MaybeUninit<T> is layout-compatible with T.
    let neon_request_slots = unsafe {
        std::mem::transmute::<&mut [MaybeUninit<NeonIORequestSlot>], &mut [NeonIORequestSlot]>(
            neon_request_slots,
        )
    };

    // Give the rest of the area to the integrated cache
    let integrated_cache_init_struct = IntegratedCacheInitStruct::shmem_init(
        remaining_area,
        initial_file_cache_size,
        max_file_cache_size,
    );

    // SAFETY: we assume ownership of the raw fds; see function doc.
    let (submission_pipe_read_fd, submission_pipe_write_fd) = unsafe {
        use std::os::fd::FromRawFd;
        (
            OwnedFd::from_raw_fd(submission_pipe_read_fd),
            OwnedFd::from_raw_fd(submission_pipe_write_fd),
        )
    };

    // Leak the struct to give it a 'static lifetime: it lives until process
    // exit and is inherited by child processes via fork().
    let cis: &'static mut CommunicatorInitStruct = Box::leak(Box::new(CommunicatorInitStruct {
        submission_pipe_read_fd,
        submission_pipe_write_fd,

        num_neon_request_slots,
        neon_request_slots,

        integrated_cache_init_struct,
    }));

    cis
}
|
||||||
|
|
||||||
|
/// Carve space for a single `T` from the front of `area`, padding as needed
/// to satisfy `T`'s alignment. Returns the (uninitialized) slot and the
/// remainder of the area.
///
/// # Panics
/// Panics with "out of memory" if the area is too small.
pub fn alloc_from_slice<T>(
    area: &mut [MaybeUninit<u8>],
) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
    let layout = std::alloc::Layout::new::<T>();

    // Number of bytes to skip so the allocation is aligned for T.
    let padding = area.as_mut_ptr().align_offset(layout.align());
    if area.len() < padding + layout.size() {
        panic!("out of memory");
    }

    let (result_area, remain) = area[padding..].split_at_mut(layout.size());

    // SAFETY: result_area is exactly layout.size() bytes and aligned for T;
    // MaybeUninit<T> imposes no validity requirement on the bytes. The
    // pointer comes from a live slice, so it is non-null.
    let result = unsafe {
        result_area
            .as_mut_ptr()
            .cast::<MaybeUninit<T>>()
            .as_mut()
            .unwrap()
    };

    (result, remain)
}
|
||||||
|
|
||||||
|
/// Carve space for an array of `len` elements of `T` from the front of
/// `area`, padding as needed to satisfy `T`'s alignment. Returns the
/// (uninitialized) array and the remainder of the area.
///
/// # Panics
/// Panics if the area is too small, or if the total array size overflows.
pub fn alloc_array_from_slice<T>(
    area: &mut [MaybeUninit<u8>],
    len: usize,
) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
    let layout = std::alloc::Layout::new::<T>();

    let area_start = area.as_mut_ptr();

    // pad to satisfy alignment requirements
    let padding = area_start.align_offset(layout.align());

    // Checked arithmetic: `size * len` (and the subsequent addition) could
    // wrap in release builds for a huge `len`, silently under-allocating
    // and making the returned slice alias the remainder.
    let array_size = layout
        .size()
        .checked_mul(len)
        .expect("array size overflows usize");
    match padding.checked_add(array_size) {
        Some(needed) if needed <= area.len() => {}
        _ => panic!("out of memory"),
    }

    let area = &mut area[padding..];
    let (result_area, remain) = area.split_at_mut(array_size);

    let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
    // SAFETY: result_area is exactly `len * size_of::<T>()` bytes, aligned
    // for T (by the padding above), non-null, and exclusively borrowed.
    let result = unsafe { std::slice::from_raw_parts_mut(result_ptr, len) };

    (result, remain)
}
|
||||||
990
pgxn/neon/communicator/src/integrated_cache.rs
Normal file
990
pgxn/neon/communicator/src/integrated_cache.rs
Normal file
@@ -0,0 +1,990 @@
|
|||||||
|
//! Integrated communicator cache
|
||||||
|
//!
|
||||||
|
//! It tracks:
|
||||||
|
//! - Relation sizes and existence
|
||||||
|
//! - Last-written LSN
|
||||||
|
//! - Block cache (also known as LFC)
|
||||||
|
//!
|
||||||
|
//! TODO: limit the size
|
||||||
|
//! TODO: concurrency
|
||||||
|
//!
|
||||||
|
//! Note: This deals with "relations" which is really just one "relation fork" in Postgres
|
||||||
|
//! terms. RelFileLocator + ForkNumber is the key.
|
||||||
|
|
||||||
|
//
|
||||||
|
// TODO: Thoughts on eviction:
|
||||||
|
//
|
||||||
|
// There are two things we need to track, and evict if we run out of space:
|
||||||
|
// - blocks in the file cache's file. If the file grows too large, need to evict something.
|
||||||
|
// Also if the cache is resized
|
||||||
|
//
|
||||||
|
// - entries in the cache map. If we run out of memory in the shmem area, need to evict
|
||||||
|
// something
|
||||||
|
//
|
||||||
|
|
||||||
|
use std::mem::MaybeUninit;
|
||||||
|
use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, AtomicUsize, Ordering};
|
||||||
|
|
||||||
|
use utils::lsn::{AtomicLsn, Lsn};
|
||||||
|
|
||||||
|
use crate::file_cache::INVALID_CACHE_BLOCK;
|
||||||
|
use crate::file_cache::{CacheBlock, FileCache};
|
||||||
|
use crate::init::alloc_from_slice;
|
||||||
|
use pageserver_page_api::RelTag;
|
||||||
|
|
||||||
|
use measured::metric;
|
||||||
|
use measured::metric::MetricEncoding;
|
||||||
|
use measured::metric::counter::CounterState;
|
||||||
|
use measured::metric::gauge::GaugeState;
|
||||||
|
use measured::{Counter, Gauge, MetricGroup};
|
||||||
|
|
||||||
|
use neon_shmem::hash::{HashMapInit, entry::Entry};
|
||||||
|
use neon_shmem::shmem::ShmemHandle;
|
||||||
|
|
||||||
|
// in # of entries
|
||||||
|
const RELSIZE_CACHE_SIZE: u32 = 64 * 1024;
|
||||||
|
|
||||||
|
/// This struct is initialized at postmaster startup, and passed to all the processes via fork().
pub struct IntegratedCacheInitStruct<'t> {
    // Small shared header (currently just the global last-written LSN).
    shared: &'t IntegratedCacheShared,
    // Attach handle for the fixed-size relation-size cache hash map.
    relsize_cache_handle: HashMapInit<'t, RelKey, RelEntry>,
    // Attach handle for the resizable block (LFC) mapping hash map.
    block_map_handle: HashMapInit<'t, BlockKey, BlockEntry>,
}
|
||||||
|
|
||||||
|
/// This struct is allocated in the (fixed-size) shared memory area at postmaster startup.
/// It is accessible by all the backends and the communicator process.
#[derive(Debug)]
pub struct IntegratedCacheShared {
    // Fallback last-written LSN, used when there is no per-block or
    // per-relation entry in the cache. Stored as a raw u64 (Lsn.0).
    global_lw_lsn: AtomicU64,
}
|
||||||
|
|
||||||
|
/// Represents write-access to the integrated cache. This is used by the communicator process.
pub struct IntegratedCacheWriteAccess<'t> {
    shared: &'t IntegratedCacheShared,
    // Relation fork -> size/LSN cache (fixed-size shmem hash map).
    relsize_cache: neon_shmem::hash::HashMapAccess<'t, RelKey, RelEntry>,
    // (relation, block) -> BlockEntry mapping for the Local File Cache.
    block_map: neon_shmem::hash::HashMapAccess<'t, BlockKey, BlockEntry>,

    // Backing file for cached page contents; None if the file cache is
    // disabled (several paths currently unwrap() this — see FIXMEs below).
    pub(crate) file_cache: Option<FileCache>,

    // Fields for eviction
    // Next victim bucket index for the clock eviction algorithm
    // (advanced modulo the bucket count).
    clock_hand: AtomicUsize,

    metrics: IntegratedCacheMetricGroup,
}
|
||||||
|
|
||||||
|
// NOTE: the `///` doc comments on the fields below double as the metric help
// text emitted by `#[derive(MetricGroup)]` — treat them as user-visible and
// edit with care.
#[derive(MetricGroup)]
#[metric(new())]
struct IntegratedCacheMetricGroup {
    /// Page evictions from the Local File Cache
    cache_page_evictions_counter: Counter,

    /// Block entry evictions from the integrated cache
    block_entry_evictions_counter: Counter,

    /// Number of times the clock hand has moved
    clock_iterations_counter: Counter,

    // metrics from the hash map
    /// Allocated size of the block cache hash map
    block_map_num_buckets: Gauge,

    /// Number of buckets in use in the block cache hash map
    block_map_num_buckets_in_use: Gauge,

    /// Allocated size of the relsize cache hash map
    relsize_cache_num_buckets: Gauge,

    /// Number of buckets in use in the relsize cache hash map
    relsize_cache_num_buckets_in_use: Gauge,
}
|
||||||
|
|
||||||
|
/// Represents read-only access to the integrated cache. Backend processes have this.
pub struct IntegratedCacheReadAccess<'t> {
    shared: &'t IntegratedCacheShared,
    // Read-only views of the same shared hash maps the communicator writes.
    relsize_cache: neon_shmem::hash::HashMapAccess<'t, RelKey, RelEntry>,
    block_map: neon_shmem::hash::HashMapAccess<'t, BlockKey, BlockEntry>,
}
|
||||||
|
|
||||||
|
impl<'t> IntegratedCacheInitStruct<'t> {
    /// Return the desired size in bytes of the fixed-size shared memory area to reserve for the
    /// integrated cache.
    pub fn shmem_size() -> usize {
        // The relsize cache is fixed-size. The block map is allocated in a separate resizable
        // area.
        let mut sz = 0;
        sz += std::mem::size_of::<IntegratedCacheShared>();
        sz += HashMapInit::<RelKey, RelEntry>::estimate_size(RELSIZE_CACHE_SIZE);

        sz
    }

    /// Initialize the shared memory segment. This runs once in postmaster. Returns a struct which
    /// will be inherited by all processes through fork.
    ///
    /// `shmem_area` is the fixed-size area reserved per [`Self::shmem_size`].
    /// The block map lives in its own resizable shmem segment instead.
    pub fn shmem_init(
        shmem_area: &'t mut [MaybeUninit<u8>],
        initial_file_cache_size: u64,
        max_file_cache_size: u64,
    ) -> IntegratedCacheInitStruct<'t> {
        // Initialize the shared struct
        let (shared, remain_shmem_area) = alloc_from_slice::<IntegratedCacheShared>(shmem_area);
        let shared = shared.write(IntegratedCacheShared {
            global_lw_lsn: AtomicU64::new(0),
        });

        // Use the remaining part of the fixed-size area for the relsize cache
        let relsize_cache_handle =
            neon_shmem::hash::HashMapInit::with_fixed(RELSIZE_CACHE_SIZE, remain_shmem_area);

        // NOTE(review): the cache sizes are passed to estimate_size()/with_shmem()
        // as entry counts via `as u32` — presumably the caller passes a number of
        // pages, not bytes, and it fits in u32 (the cast truncates silently).
        // TODO confirm at the call site.
        let max_bytes =
            HashMapInit::<BlockKey, BlockEntry>::estimate_size(max_file_cache_size as u32);

        // Initialize the block map in a separate resizable shared memory area
        let shmem_handle = ShmemHandle::new("block mapping", 0, max_bytes).unwrap();

        let block_map_handle =
            neon_shmem::hash::HashMapInit::with_shmem(initial_file_cache_size as u32, shmem_handle);
        IntegratedCacheInitStruct {
            shared,
            relsize_cache_handle,
            block_map_handle,
        }
    }

    /// Initialize access to the integrated cache for the communicator worker process
    ///
    /// Seeds the global last-written LSN with `lsn` and attaches writable
    /// views of the shared hash maps.
    pub fn worker_process_init(
        self,
        lsn: Lsn,
        file_cache: Option<FileCache>,
    ) -> IntegratedCacheWriteAccess<'t> {
        let IntegratedCacheInitStruct {
            shared,
            relsize_cache_handle,
            block_map_handle,
        } = self;

        shared.global_lw_lsn.store(lsn.0, Ordering::Relaxed);

        IntegratedCacheWriteAccess {
            shared,
            relsize_cache: relsize_cache_handle.attach_writer(),
            block_map: block_map_handle.attach_writer(),
            file_cache,
            clock_hand: AtomicUsize::new(0),
            metrics: IntegratedCacheMetricGroup::new(),
        }
    }

    /// Initialize access to the integrated cache for a backend process
    pub fn backend_init(self) -> IntegratedCacheReadAccess<'t> {
        let IntegratedCacheInitStruct {
            shared,
            relsize_cache_handle,
            block_map_handle,
        } = self;

        IntegratedCacheReadAccess {
            shared,
            relsize_cache: relsize_cache_handle.attach_reader(),
            block_map: block_map_handle.attach_reader(),
        }
    }
}
|
||||||
|
|
||||||
|
/// Value stored in the cache mapping hash table.
struct BlockEntry {
    // Last-written LSN of this block.
    lw_lsn: AtomicLsn,
    // Block number within the file cache file, or INVALID_CACHE_BLOCK if the
    // page contents are not materialized in the file cache.
    cache_block: AtomicU64,

    // Pin count; while > 0 the entry must not be evicted (see try_evict_entry).
    pinned: AtomicU64,

    // 'referenced' bit for the clock algorithm
    referenced: AtomicBool,
}
|
||||||
|
|
||||||
|
/// Value stored in the relsize cache hash table.
struct RelEntry {
    /// cached size of the relation
    /// u32::MAX means 'not known' (that's InvalidBlockNumber in Postgres)
    nblocks: AtomicU32,

    /// This is the last time the "metadata" of this relation changed, not
    /// the contents of the blocks. That is, the size of the relation.
    lw_lsn: AtomicLsn,
}
|
||||||
|
|
||||||
|
impl std::fmt::Debug for RelEntry {
|
||||||
|
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||||
|
fmt.debug_struct("Rel")
|
||||||
|
.field("nblocks", &self.nblocks.load(Ordering::Relaxed))
|
||||||
|
.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
impl std::fmt::Debug for BlockEntry {
|
||||||
|
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||||
|
fmt.debug_struct("Block")
|
||||||
|
.field("lw_lsn", &self.lw_lsn.load())
|
||||||
|
.field("cache_block", &self.cache_block.load(Ordering::Relaxed))
|
||||||
|
.field("pinned", &self.pinned.load(Ordering::Relaxed))
|
||||||
|
.field("referenced", &self.referenced.load(Ordering::Relaxed))
|
||||||
|
.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Key of the relsize cache hash table: one entry per relation fork.
#[derive(Clone, Debug, PartialEq, PartialOrd, Eq, Hash, Ord)]
struct RelKey(RelTag);

impl From<&RelTag> for RelKey {
    fn from(val: &RelTag) -> RelKey {
        RelKey(*val)
    }
}
|
||||||
|
|
||||||
|
// Key of the block map hash table: one entry per (relation fork, block).
#[derive(Clone, Debug, PartialEq, PartialOrd, Eq, Hash, Ord)]
struct BlockKey {
    rel: RelTag,
    block_number: u32,
}

impl From<(&RelTag, u32)> for BlockKey {
    fn from(val: (&RelTag, u32)) -> BlockKey {
        BlockKey {
            rel: *val.0,
            block_number: val.1,
        }
    }
}
|
||||||
|
|
||||||
|
/// Return type used in the cache's get_*() functions. 'Found' means that the page, or other
/// information that was queried, exists in the cache.
pub enum CacheResult<V> {
    /// The queried page or other information existed in the cache.
    Found(V),

    /// The cache doesn't contain the page (or other queried information, like relation size). The
    /// Lsn is the 'not_modified_since' LSN that should be used in the request to the pageserver to
    /// read the page.
    NotFound(Lsn),
}
|
||||||
|
|
||||||
|
/// Return type of [`IntegratedCacheWriteAccess::try_evict_entry`]
enum EvictResult {
    /// Could not evict page because it was pinned
    Pinned,

    /// The victim bucket was already vacant
    Vacant,

    /// Evicted an entry. If it had a cache block associated with it, it's returned
    /// here, otherwise None
    Evicted(Option<CacheBlock>),
}
|
||||||
|
|
||||||
|
impl<'t> IntegratedCacheWriteAccess<'t> {
|
||||||
|
pub fn get_rel_size(&'t self, rel: &RelTag) -> CacheResult<u32> {
|
||||||
|
if let Some(nblocks) = get_rel_size(&self.relsize_cache, rel) {
|
||||||
|
CacheResult::Found(nblocks)
|
||||||
|
} else {
|
||||||
|
let lsn = Lsn(self.shared.global_lw_lsn.load(Ordering::Relaxed));
|
||||||
|
CacheResult::NotFound(lsn)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
    /// Read the given page from the Local File Cache into `dst`, if cached.
    ///
    /// Returns `Found(())` after the page has been read into `dst`, or
    /// `NotFound(lsn)` where `lsn` is the 'not_modified_since' LSN to use when
    /// requesting the page from the pageserver.
    ///
    /// The block entry is pinned for the duration of the file-cache read so it
    /// cannot be evicted mid-IO; the `DeferredUnpin` guard (presumably)
    /// decrements the pin count on drop — TODO confirm its Drop impl.
    ///
    /// Panics if a cache block exists but `self.file_cache` is `None`.
    pub async fn get_page(
        &'t self,
        rel: &RelTag,
        block_number: u32,
        dst: impl uring_common::buf::IoBufMut + Send + Sync,
    ) -> Result<CacheResult<()>, std::io::Error> {
        let x = if let Some(block_entry) = self.block_map.get(&BlockKey::from((rel, block_number)))
        {
            // Mark for the clock algorithm: this entry was recently used.
            block_entry.referenced.store(true, Ordering::Relaxed);

            let cache_block = block_entry.cache_block.load(Ordering::Relaxed);
            if cache_block != INVALID_CACHE_BLOCK {
                // pin it and release lock
                block_entry.pinned.fetch_add(1, Ordering::Relaxed);

                (cache_block, DeferredUnpin(block_entry.pinned.as_ptr()))
            } else {
                // Entry known, but page contents not materialized: use the
                // per-block last-written LSN.
                return Ok(CacheResult::NotFound(block_entry.lw_lsn.load()));
            }
        } else {
            // No entry at all: fall back to the global last-written LSN.
            let lsn = Lsn(self.shared.global_lw_lsn.load(Ordering::Relaxed));
            return Ok(CacheResult::NotFound(lsn));
        };

        let (cache_block, _deferred_pin) = x;
        self.file_cache
            .as_ref()
            .unwrap()
            .read_block(cache_block, dst)
            .await?;

        // unpin the entry (by implicitly dropping deferred_pin)
        Ok(CacheResult::Found(()))
    }
|
||||||
|
|
||||||
|
pub async fn page_is_cached(
|
||||||
|
&'t self,
|
||||||
|
rel: &RelTag,
|
||||||
|
block_number: u32,
|
||||||
|
) -> Result<CacheResult<()>, std::io::Error> {
|
||||||
|
if let Some(block_entry) = self.block_map.get(&BlockKey::from((rel, block_number))) {
|
||||||
|
// This is used for prefetch requests. Treat the probe as an 'access', to keep it
|
||||||
|
// in cache.
|
||||||
|
block_entry.referenced.store(true, Ordering::Relaxed);
|
||||||
|
|
||||||
|
let cache_block = block_entry.cache_block.load(Ordering::Relaxed);
|
||||||
|
|
||||||
|
if cache_block != INVALID_CACHE_BLOCK {
|
||||||
|
Ok(CacheResult::Found(()))
|
||||||
|
} else {
|
||||||
|
Ok(CacheResult::NotFound(block_entry.lw_lsn.load()))
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let lsn = Lsn(self.shared.global_lw_lsn.load(Ordering::Relaxed));
|
||||||
|
Ok(CacheResult::NotFound(lsn))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Does the relation exists? CacheResult::NotFound means that the cache doesn't contain that
|
||||||
|
/// information, i.e. we don't know if the relation exists or not.
|
||||||
|
pub fn get_rel_exists(&'t self, rel: &RelTag) -> CacheResult<bool> {
|
||||||
|
// we don't currently cache negative entries, so if the relation is in the cache, it exists
|
||||||
|
if let Some(_rel_entry) = self.relsize_cache.get(&RelKey::from(rel)) {
|
||||||
|
CacheResult::Found(true)
|
||||||
|
} else {
|
||||||
|
let lsn = Lsn(self.shared.global_lw_lsn.load(Ordering::Relaxed));
|
||||||
|
CacheResult::NotFound(lsn)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
    /// Database sizes are not cached yet: always returns `NotFound` with the
    /// global last-written LSN, directing the caller to the pageserver.
    pub fn get_db_size(&'t self, _db_oid: u32) -> CacheResult<u64> {
        // TODO: it would be nice to cache database sizes too. Getting the database size
        // is not a very common operation, but when you do it, it's often interactive, with
        // e.g. psql \l+ command, so the user will feel the latency.

        // fixme: is this right lsn?
        let lsn = Lsn(self.shared.global_lw_lsn.load(Ordering::Relaxed));
        CacheResult::NotFound(lsn)
    }
|
||||||
|
|
||||||
|
    /// Insert or update the cached size of a relation fork.
    ///
    /// `lsn` is recorded as the relation's metadata last-written LSN. Panics
    /// ("out of memory") if a new entry cannot be inserted into the fixed-size
    /// relsize cache.
    pub fn remember_rel_size(&'t self, rel: &RelTag, nblocks: u32, lsn: Lsn) {
        match self.relsize_cache.entry(RelKey::from(rel)) {
            Entry::Vacant(e) => {
                tracing::trace!("inserting rel entry for {rel:?}, {nblocks} blocks");
                // FIXME: what to do if we run out of memory? Evict other relation entries?
                _ = e
                    .insert(RelEntry {
                        nblocks: AtomicU32::new(nblocks),
                        lw_lsn: AtomicLsn::new(lsn.0),
                    })
                    .expect("out of memory");
            }
            Entry::Occupied(e) => {
                tracing::trace!("updating rel entry for {rel:?}, {nblocks} blocks");
                // Entry exists: update both fields in place through the atomics.
                e.get().nblocks.store(nblocks, Ordering::Relaxed);
                e.get().lw_lsn.store(lsn);
            }
        };
    }
|
||||||
|
|
||||||
|
    /// Remember the given page contents in the cache.
    ///
    /// `is_write == true` means the caller just wrote/modified the page (an
    /// existing cache block may be overwritten in place); `false` means the
    /// page was read from the pageserver and is assumed not to have a cache
    /// block yet. In both modes the caller is expected to hold the
    /// io-in-progress lock for the page, so no concurrent IO on it occurs.
    ///
    /// Panics if the file cache is disabled (see FIXME below), or if the
    /// no-concurrent-IO assumptions are violated.
    pub async fn remember_page(
        &'t self,
        rel: &RelTag,
        block_number: u32,
        src: impl uring_common::buf::IoBuf + Send + Sync,
        lw_lsn: Lsn,
        is_write: bool,
    ) {
        let key = BlockKey::from((rel, block_number));

        // FIXME: make this work when file cache is disabled. Or make it mandatory
        let file_cache = self.file_cache.as_ref().unwrap();

        if is_write {
            // there should be no concurrent IOs. If a backend tries to read the page
            // at the same time, they may get a torn write. That's the same as with
            // regular POSIX filesystem read() and write()

            // First check if we have a block in cache already
            let mut old_cache_block = None;
            let mut found_existing = false;

            // NOTE(quantumish): honoring original semantics here (used to be update_with_fn)
            // but I don't see any reason why this has to take a write lock.
            if let Entry::Occupied(e) = self.block_map.entry(key.clone()) {
                let block_entry = e.get();
                found_existing = true;

                // Prevent this entry from being evicted
                let pin_count = block_entry.pinned.fetch_add(1, Ordering::Relaxed);
                if pin_count > 0 {
                    // this is unexpected, because the caller has obtained the io-in-progress lock,
                    // so no one else should try to modify the page at the same time.
                    // XXX: and I think a read should not be happening either, because the postgres
                    // buffer is held locked. TODO: check these conditions and tidy this up a little. Seems fragile to just panic.
                    panic!("block entry was unexpectedly pinned");
                }

                let cache_block = block_entry.cache_block.load(Ordering::Relaxed);
                old_cache_block = if cache_block != INVALID_CACHE_BLOCK {
                    Some(cache_block)
                } else {
                    None
                };
            }

            // Allocate a new block if required. If the free list is empty,
            // alternate with eviction until a block is obtained.
            let cache_block = old_cache_block.unwrap_or_else(|| {
                loop {
                    if let Some(x) = file_cache.alloc_block() {
                        break x;
                    }
                    if let Some(x) = self.try_evict_cache_block() {
                        break x;
                    }
                }
            });

            // Write the page to the cache file
            file_cache
                .write_block(cache_block, src)
                .await
                .expect("error writing to cache");
            // FIXME: handle errors gracefully.
            // FIXME: unpin the block entry on error

            // Update the block entry. Loop because an insert can fail when the
            // hash map is full; each retry evicts an entry first.
            loop {
                let entry = self.block_map.entry(key.clone());
                // The entry is pinned (or held under io-in-progress), so its
                // occupancy cannot have changed since the check above.
                assert_eq!(found_existing, matches!(entry, Entry::Occupied(_)));
                match entry {
                    Entry::Occupied(e) => {
                        let block_entry = e.get();
                        // Update the cache block. Either it was unset (we
                        // allocated above) or it already pointed at our block.
                        let old_blk = block_entry.cache_block.compare_exchange(
                            INVALID_CACHE_BLOCK,
                            cache_block,
                            Ordering::Relaxed,
                            Ordering::Relaxed,
                        );
                        assert!(old_blk == Ok(INVALID_CACHE_BLOCK) || old_blk == Err(cache_block));

                        block_entry.lw_lsn.store(lw_lsn);

                        block_entry.referenced.store(true, Ordering::Relaxed);

                        // Drop the pin taken at the top of the function.
                        let pin_count = block_entry.pinned.fetch_sub(1, Ordering::Relaxed);
                        assert!(pin_count > 0);
                        break;
                    }
                    Entry::Vacant(e) => {
                        if e.insert(BlockEntry {
                            lw_lsn: AtomicLsn::new(lw_lsn.0),
                            cache_block: AtomicU64::new(cache_block),
                            pinned: AtomicU64::new(0),
                            referenced: AtomicBool::new(true),
                        })
                        .is_ok()
                        {
                            break;
                        } else {
                            // The hash map was full. Evict an entry and retry.
                        }
                    }
                }
                self.try_evict_block_entry();
            }
        } else {
            // !is_write
            //
            // We can assume that it doesn't already exist, because the
            // caller is assumed to have already checked it, and holds
            // the io-in-progress lock. (The BlockEntry might exist, but no cache block)

            // Allocate a new block first
            let cache_block = {
                loop {
                    if let Some(x) = file_cache.alloc_block() {
                        break x;
                    }
                    if let Some(x) = self.try_evict_cache_block() {
                        break x;
                    }
                }
            };

            // Write the page to the cache file
            file_cache
                .write_block(cache_block, src)
                .await
                .expect("error writing to cache");
            // FIXME: handle errors gracefully.

            // Publish the block in the map; loop to evict-and-retry when full.
            loop {
                match self.block_map.entry(key.clone()) {
                    Entry::Occupied(e) => {
                        let block_entry = e.get();
                        // FIXME: could there be concurrent readers?
                        assert!(block_entry.pinned.load(Ordering::Relaxed) == 0);

                        let old_cache_block =
                            block_entry.cache_block.swap(cache_block, Ordering::Relaxed);
                        if old_cache_block != INVALID_CACHE_BLOCK {
                            panic!(
                                "remember_page called in !is_write mode, but page is already cached at blk {old_cache_block}"
                            );
                        }
                        break;
                    }
                    Entry::Vacant(e) => {
                        if e.insert(BlockEntry {
                            lw_lsn: AtomicLsn::new(lw_lsn.0),
                            cache_block: AtomicU64::new(cache_block),
                            pinned: AtomicU64::new(0),
                            referenced: AtomicBool::new(true),
                        })
                        .is_ok()
                        {
                            break;
                        } else {
                            // The hash map was full. Evict an entry and retry.
                        }
                    }
                };

                self.try_evict_block_entry();
            }
        }
    }
|
||||||
|
|
||||||
|
/// Forget information about given relation in the cache. (For DROP TABLE and such)
|
||||||
|
pub fn forget_rel(&'t self, rel: &RelTag, _nblocks: Option<u32>, flush_lsn: Lsn) {
|
||||||
|
tracing::trace!("forgetting rel entry for {rel:?}");
|
||||||
|
self.relsize_cache.remove(&RelKey::from(rel));
|
||||||
|
|
||||||
|
// update with flush LSN
|
||||||
|
let _ = self
|
||||||
|
.shared
|
||||||
|
.global_lw_lsn
|
||||||
|
.fetch_max(flush_lsn.0, Ordering::Relaxed);
|
||||||
|
|
||||||
|
// also forget all cached blocks for the relation
|
||||||
|
// FIXME
|
||||||
|
/*
|
||||||
|
let mut iter = MapIterator::new(&key_range_for_rel_blocks(rel));
|
||||||
|
let r = self.cache_tree.start_read();
|
||||||
|
while let Some((k, _v)) = iter.next(&r) {
|
||||||
|
let w = self.cache_tree.start_write();
|
||||||
|
|
||||||
|
let mut evicted_cache_block = None;
|
||||||
|
|
||||||
|
let res = w.update_with_fn(&k, |e| {
|
||||||
|
if let Some(e) = e {
|
||||||
|
let block_entry = if let MapEntry::Block(e) = e {
|
||||||
|
e
|
||||||
|
} else {
|
||||||
|
panic!("unexpected map entry type for block key");
|
||||||
|
};
|
||||||
|
let cache_block = block_entry
|
||||||
|
.cache_block
|
||||||
|
.swap(INVALID_CACHE_BLOCK, Ordering::Relaxed);
|
||||||
|
if cache_block != INVALID_CACHE_BLOCK {
|
||||||
|
evicted_cache_block = Some(cache_block);
|
||||||
|
}
|
||||||
|
UpdateAction::Remove
|
||||||
|
} else {
|
||||||
|
UpdateAction::Nothing
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// FIXME: It's pretty surprising to run out of memory while removing. But
|
||||||
|
// maybe it can happen because of trying to shrink a node?
|
||||||
|
res.expect("out of memory");
|
||||||
|
|
||||||
|
if let Some(evicted_cache_block) = evicted_cache_block {
|
||||||
|
self.file_cache
|
||||||
|
.as_ref()
|
||||||
|
.unwrap()
|
||||||
|
.dealloc_block(evicted_cache_block);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
|
||||||
|
// Maintenance routines
|
||||||
|
|
||||||
|
    /// Evict one block entry from the cache.
    ///
    /// This is called when the hash map is full, to make an entry available for a new
    /// insertion. There's no guarantee that the entry is free by the time this function
    /// returns anymore; it can be taken by a concurrent thread at any time. So you need to
    /// call this and retry repeatedly until you succeed.
    ///
    /// Victim selection uses a clock (second-chance) sweep: each pass clears a
    /// bucket's 'referenced' bit and only evicts entries whose bit was already
    /// clear.
    fn try_evict_block_entry(&self) {
        let num_buckets = self.block_map.get_num_buckets();
        loop {
            self.metrics.clock_iterations_counter.inc();
            // Advance the shared clock hand; wraps around the bucket array.
            let victim_bucket = self.clock_hand.fetch_add(1, Ordering::Relaxed) % num_buckets;

            let evict_this = match self.block_map.get_at_bucket(victim_bucket).as_deref() {
                None => {
                    // The caller wants to have a free bucket. If there's one already, we're good.
                    return;
                }
                Some((_, blk_entry)) => {
                    // Clear the 'referenced' flag. If it was already clear,
                    // release the lock (by exiting this scope), and try to
                    // evict it.
                    !blk_entry.referenced.swap(false, Ordering::Relaxed)
                }
            };
            if evict_this {
                match self.try_evict_entry(victim_bucket) {
                    EvictResult::Pinned => {
                        // keep looping
                    }
                    EvictResult::Vacant => {
                        // This was released by someone else. Return so that
                        // the caller will try to use it. (Chances are that it
                        // will be reused by someone else, but let's try.)
                        return;
                    }
                    EvictResult::Evicted(None) => {
                        // This is now free.
                        return;
                    }
                    EvictResult::Evicted(Some(cache_block)) => {
                        // This is now free. We must not leak the cache block, so put it to the freelist
                        self.file_cache.as_ref().unwrap().dealloc_block(cache_block);
                        return;
                    }
                }
            }
            // TODO: add some kind of a backstop to error out if we loop
            // too many times without finding any unpinned entries
        }
    }
|
||||||
|
|
||||||
|
/// Evict one block from the file cache. This is called when the file cache fills up,
|
||||||
|
/// to release a cache block.
|
||||||
|
///
|
||||||
|
/// Returns the evicted block. It's not put to the free list, so it's available for
|
||||||
|
/// the caller to use immediately.
|
||||||
|
fn try_evict_cache_block(&self) -> Option<CacheBlock> {
|
||||||
|
let num_buckets = self.block_map.get_num_buckets();
|
||||||
|
let mut iterations = 0;
|
||||||
|
while iterations < 100 {
|
||||||
|
self.metrics.clock_iterations_counter.inc();
|
||||||
|
let victim_bucket = self.clock_hand.fetch_add(1, Ordering::Relaxed) % num_buckets;
|
||||||
|
|
||||||
|
let evict_this = match self.block_map.get_at_bucket(victim_bucket).as_deref() {
|
||||||
|
None => {
|
||||||
|
// This bucket was unused. It's no use for finding a free cache block
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Some((_, blk_entry)) => {
|
||||||
|
// Clear the 'referenced' flag. If it was already clear,
|
||||||
|
// release the lock (by exiting this scope), and try to
|
||||||
|
// evict it.
|
||||||
|
!blk_entry.referenced.swap(false, Ordering::Relaxed)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if evict_this {
|
||||||
|
match self.try_evict_entry(victim_bucket) {
|
||||||
|
EvictResult::Pinned => {
|
||||||
|
// keep looping
|
||||||
|
}
|
||||||
|
EvictResult::Vacant => {
|
||||||
|
// This was released by someone else. Keep looping.
|
||||||
|
}
|
||||||
|
EvictResult::Evicted(None) => {
|
||||||
|
// This is now free, but it didn't have a cache block
|
||||||
|
// associated with it. Keep looping.
|
||||||
|
}
|
||||||
|
EvictResult::Evicted(Some(cache_block)) => {
|
||||||
|
// Reuse this
|
||||||
|
return Some(cache_block);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
iterations += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reached the max iteration count without finding an entry. Return
|
||||||
|
// to give the caller a chance to do other things
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns Err, if the page could not be evicted because it was pinned
|
||||||
|
fn try_evict_entry(&self, victim: usize) -> EvictResult {
|
||||||
|
// grab the write lock
|
||||||
|
if let Some(e) = self.block_map.entry_at_bucket(victim) {
|
||||||
|
let old = e.get();
|
||||||
|
// note: all the accesses to 'pinned' currently happen
|
||||||
|
// within update_with_fn(), or while holding ValueReadGuard, which protects from concurrent
|
||||||
|
// updates. Otherwise, another thread could set the 'pinned'
|
||||||
|
// flag just after we have checked it here.
|
||||||
|
//
|
||||||
|
// FIXME: ^^ outdated comment, update_with_fn() is no more
|
||||||
|
|
||||||
|
if old.pinned.load(Ordering::Relaxed) == 0 {
|
||||||
|
let old_val = e.remove();
|
||||||
|
let _ = self
|
||||||
|
.shared
|
||||||
|
.global_lw_lsn
|
||||||
|
.fetch_max(old_val.lw_lsn.into_inner().0, Ordering::Relaxed);
|
||||||
|
let evicted_cache_block = match old_val.cache_block.into_inner() {
|
||||||
|
INVALID_CACHE_BLOCK => None,
|
||||||
|
n => Some(n),
|
||||||
|
};
|
||||||
|
if evicted_cache_block.is_some() {
|
||||||
|
self.metrics.cache_page_evictions_counter.inc();
|
||||||
|
}
|
||||||
|
self.metrics.block_entry_evictions_counter.inc();
|
||||||
|
EvictResult::Evicted(evicted_cache_block)
|
||||||
|
} else {
|
||||||
|
EvictResult::Pinned
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
EvictResult::Vacant
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Resize the local file cache.
|
||||||
|
pub fn resize_file_cache(&self, num_blocks: u32) {
|
||||||
|
let old_num_blocks = self.block_map.get_num_buckets() as u32;
|
||||||
|
|
||||||
|
if old_num_blocks < num_blocks {
|
||||||
|
if let Err(err) = self.block_map.grow(num_blocks) {
|
||||||
|
tracing::warn!(
|
||||||
|
"could not grow file cache to {} blocks (old size {}): {}",
|
||||||
|
num_blocks,
|
||||||
|
old_num_blocks,
|
||||||
|
err
|
||||||
|
);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// TODO: Shrinking not implemented yet
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn dump_map(&self, _dst: &mut dyn std::io::Write) {
|
||||||
|
//FIXME self.cache_map.start_read().dump(dst);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: metric::group::Encoding> MetricGroup<T> for IntegratedCacheWriteAccess<'_>
|
||||||
|
where
|
||||||
|
CounterState: MetricEncoding<T>,
|
||||||
|
GaugeState: MetricEncoding<T>,
|
||||||
|
{
|
||||||
|
fn collect_group_into(&self, enc: &mut T) -> Result<(), <T as metric::group::Encoding>::Err> {
|
||||||
|
// Update gauges
|
||||||
|
self.metrics
|
||||||
|
.block_map_num_buckets
|
||||||
|
.set(self.block_map.get_num_buckets() as i64);
|
||||||
|
self.metrics
|
||||||
|
.block_map_num_buckets_in_use
|
||||||
|
.set(self.block_map.get_num_buckets_in_use() as i64);
|
||||||
|
self.metrics
|
||||||
|
.relsize_cache_num_buckets
|
||||||
|
.set(self.relsize_cache.get_num_buckets() as i64);
|
||||||
|
self.metrics
|
||||||
|
.relsize_cache_num_buckets_in_use
|
||||||
|
.set(self.relsize_cache.get_num_buckets_in_use() as i64);
|
||||||
|
|
||||||
|
if let Some(file_cache) = &self.file_cache {
|
||||||
|
file_cache.collect_group_into(enc)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
self.metrics.collect_group_into(enc)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read relation size from the cache.
|
||||||
|
///
|
||||||
|
/// This is in a separate function so that it can be shared by
|
||||||
|
/// IntegratedCacheReadAccess::get_rel_size() and IntegratedCacheWriteAccess::get_rel_size()
|
||||||
|
fn get_rel_size(
|
||||||
|
r: &neon_shmem::hash::HashMapAccess<RelKey, RelEntry>,
|
||||||
|
rel: &RelTag,
|
||||||
|
) -> Option<u32> {
|
||||||
|
if let Some(rel_entry) = r.get(&RelKey::from(rel)) {
|
||||||
|
let nblocks = rel_entry.nblocks.load(Ordering::Relaxed);
|
||||||
|
if nblocks != u32::MAX {
|
||||||
|
Some(nblocks)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum GetBucketResult {
|
||||||
|
Occupied(RelTag, u32),
|
||||||
|
Vacant,
|
||||||
|
OutOfBounds,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Accessor for other backends
|
||||||
|
///
|
||||||
|
/// This allows backends to read pages from the cache directly, on their own, without making a
|
||||||
|
/// request to the communicator process.
|
||||||
|
impl<'t> IntegratedCacheReadAccess<'t> {
|
||||||
|
pub fn get_rel_size(&'t self, rel: &RelTag) -> Option<u32> {
|
||||||
|
get_rel_size(&self.relsize_cache, rel)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn start_read_op(&'t self) -> BackendCacheReadOp<'t> {
|
||||||
|
BackendCacheReadOp {
|
||||||
|
read_guards: Vec::new(),
|
||||||
|
map_access: self,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if LFC contains the given buffer, and update its last-written LSN if not.
|
||||||
|
///
|
||||||
|
/// Returns:
|
||||||
|
/// true if the block is in the LFC
|
||||||
|
/// false if it's not.
|
||||||
|
///
|
||||||
|
/// If the block was not in the LFC (i.e. when this returns false), the last-written LSN
|
||||||
|
/// value on the block is updated to the given 'lsn', so that the next read of the block
|
||||||
|
/// will read the new version. Otherwise the caller is assumed to modify the page and
|
||||||
|
/// to update the last-written LSN later by writing the new page.
|
||||||
|
pub fn update_lw_lsn_for_block_if_not_cached(
|
||||||
|
&'t self,
|
||||||
|
rel: &RelTag,
|
||||||
|
block_number: u32,
|
||||||
|
lsn: Lsn,
|
||||||
|
) -> bool {
|
||||||
|
let key = BlockKey::from((rel, block_number));
|
||||||
|
let entry = self.block_map.entry(key);
|
||||||
|
match entry {
|
||||||
|
Entry::Occupied(e) => {
|
||||||
|
let block_entry = e.get();
|
||||||
|
if block_entry.cache_block.load(Ordering::Relaxed) != INVALID_CACHE_BLOCK {
|
||||||
|
block_entry.referenced.store(true, Ordering::Relaxed);
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
let old_lwlsn = block_entry.lw_lsn.fetch_max(lsn);
|
||||||
|
if old_lwlsn >= lsn {
|
||||||
|
// shouldn't happen
|
||||||
|
tracing::warn!(
|
||||||
|
"attempted to move last-written LSN backwards from {old_lwlsn} to {lsn} for rel {rel} blk {block_number}"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Entry::Vacant(e) => {
|
||||||
|
if e.insert(BlockEntry {
|
||||||
|
lw_lsn: AtomicLsn::new(lsn.0),
|
||||||
|
cache_block: AtomicU64::new(INVALID_CACHE_BLOCK),
|
||||||
|
pinned: AtomicU64::new(0),
|
||||||
|
referenced: AtomicBool::new(true),
|
||||||
|
})
|
||||||
|
.is_ok()
|
||||||
|
{
|
||||||
|
false
|
||||||
|
} else {
|
||||||
|
// The hash table is full.
|
||||||
|
//
|
||||||
|
// TODO: Evict something. But for now, just set the global lw LSN instead.
|
||||||
|
// That's correct, but not very efficient for future reads
|
||||||
|
let _ = self
|
||||||
|
.shared
|
||||||
|
.global_lw_lsn
|
||||||
|
.fetch_max(lsn.0, Ordering::Relaxed);
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_bucket(&self, bucket_no: usize) -> GetBucketResult {
|
||||||
|
match self.block_map.get_at_bucket(bucket_no).as_deref() {
|
||||||
|
None => {
|
||||||
|
// free bucket, or out of bounds
|
||||||
|
if bucket_no >= self.block_map.get_num_buckets() {
|
||||||
|
GetBucketResult::OutOfBounds
|
||||||
|
} else {
|
||||||
|
GetBucketResult::Vacant
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some((key, _)) => GetBucketResult::Occupied(key.rel, key.block_number),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get_num_buckets_in_use(&self) -> usize {
|
||||||
|
self.block_map.get_num_buckets_in_use()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct BackendCacheReadOp<'t> {
|
||||||
|
read_guards: Vec<DeferredUnpin>,
|
||||||
|
map_access: &'t IntegratedCacheReadAccess<'t>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'e> BackendCacheReadOp<'e> {
|
||||||
|
/// Initiate a read of the page from the cache.
|
||||||
|
///
|
||||||
|
/// This returns the "cache block number", i.e. the block number within the cache file, where
|
||||||
|
/// the page's contents is stored. To get the page contents, the caller needs to read that block
|
||||||
|
/// from the cache file. This returns a guard object that you must hold while it performs the
|
||||||
|
/// read. It's possible that while you are performing the read, the cache block is invalidated.
|
||||||
|
/// After you have completed the read, call BackendCacheReadResult::finish() to check if the
|
||||||
|
/// read was in fact valid or not. If it was concurrently invalidated, you need to retry.
|
||||||
|
pub fn get_page(&mut self, rel: &RelTag, block_number: u32) -> Option<u64> {
|
||||||
|
if let Some(block_entry) = self
|
||||||
|
.map_access
|
||||||
|
.block_map
|
||||||
|
.get(&BlockKey::from((rel, block_number)))
|
||||||
|
{
|
||||||
|
block_entry.referenced.store(true, Ordering::Relaxed);
|
||||||
|
|
||||||
|
let cache_block = block_entry.cache_block.load(Ordering::Relaxed);
|
||||||
|
if cache_block != INVALID_CACHE_BLOCK {
|
||||||
|
block_entry.pinned.fetch_add(1, Ordering::Relaxed);
|
||||||
|
self.read_guards
|
||||||
|
.push(DeferredUnpin(block_entry.pinned.as_ptr()));
|
||||||
|
Some(cache_block)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn finish(self) -> bool {
|
||||||
|
// TODO: currently, we hold a pin on the in-memory map, so concurrent invalidations are not
|
||||||
|
// possible. But if we switch to optimistic locking, this would return 'false' if the
|
||||||
|
// optimistic locking failed and you need to retry.
|
||||||
|
true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A hack to decrement an AtomicU64 on drop. This is used to decrement the pin count
|
||||||
|
/// of a BlockEntry. The safety depends on the fact that the BlockEntry is not evicted
|
||||||
|
/// or moved while it's pinned.
|
||||||
|
struct DeferredUnpin(*mut u64);
|
||||||
|
|
||||||
|
unsafe impl Sync for DeferredUnpin {}
|
||||||
|
unsafe impl Send for DeferredUnpin {}
|
||||||
|
|
||||||
|
impl Drop for DeferredUnpin {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
// unpin it
|
||||||
|
unsafe {
|
||||||
|
let pin_ref = AtomicU64::from_ptr(self.0);
|
||||||
|
pin_ref.fetch_sub(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,5 +1,29 @@
|
|||||||
|
//! Three main parts:
|
||||||
|
//! - async tokio communicator core, which receives requests and processes them.
|
||||||
|
//! - Main loop and requests queues, which routes requests from backends to the core
|
||||||
|
//! - the per-backend glue code, which submits requests
|
||||||
|
|
||||||
|
mod backend_comms;
|
||||||
|
|
||||||
|
// mark this 'pub', because these functions are called from C code. Otherwise, the compiler
|
||||||
|
// complains about a bunch of structs and enum variants being unused, because it thinkgs
|
||||||
|
// the functions that use them are never called. There are some C-callable functions in
|
||||||
|
// other modules too, but marking this as pub is currently enough to silence the warnings
|
||||||
|
//
|
||||||
|
// TODO: perhaps collect *all* the extern "C" functions to one module?
|
||||||
|
pub mod backend_interface;
|
||||||
|
|
||||||
|
mod file_cache;
|
||||||
|
mod init;
|
||||||
|
mod integrated_cache;
|
||||||
|
mod neon_request;
|
||||||
mod worker_process;
|
mod worker_process;
|
||||||
|
|
||||||
|
mod global_allocator;
|
||||||
|
|
||||||
/// Name of the Unix Domain Socket that serves the metrics, and other APIs in the
|
/// Name of the Unix Domain Socket that serves the metrics, and other APIs in the
|
||||||
/// future. This is within the Postgres data directory.
|
/// future. This is within the Postgres data directory.
|
||||||
const NEON_COMMUNICATOR_SOCKET_NAME: &str = "neon-communicator.socket";
|
const NEON_COMMUNICATOR_SOCKET_NAME: &str = "neon-communicator.socket";
|
||||||
|
|
||||||
|
// FIXME: get this from postgres headers somehow
|
||||||
|
pub const BLCKSZ: usize = 8192;
|
||||||
|
|||||||
466
pgxn/neon/communicator/src/neon_request.rs
Normal file
466
pgxn/neon/communicator/src/neon_request.rs
Normal file
@@ -0,0 +1,466 @@
|
|||||||
|
// Definitions of some core PostgreSQL datatypes.
|
||||||
|
|
||||||
|
/// XLogRecPtr is defined in "access/xlogdefs.h" as:
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// typedef uint64 XLogRecPtr;
|
||||||
|
/// ```
|
||||||
|
/// cbindgen:no-export
|
||||||
|
pub type XLogRecPtr = u64;
|
||||||
|
|
||||||
|
pub type CLsn = XLogRecPtr;
|
||||||
|
pub type COid = u32;
|
||||||
|
|
||||||
|
// This conveniently matches PG_IOV_MAX
|
||||||
|
pub const MAX_GETPAGEV_PAGES: usize = 32;
|
||||||
|
|
||||||
|
pub const INVALID_BLOCK_NUMBER: u32 = u32::MAX;
|
||||||
|
|
||||||
|
use std::ffi::CStr;
|
||||||
|
|
||||||
|
use pageserver_page_api::{self as page_api, SlruKind};
|
||||||
|
|
||||||
|
/// Request from a Postgres backend to the communicator process
|
||||||
|
#[allow(clippy::large_enum_variant)]
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Copy, Clone, Debug, strum_macros::EnumDiscriminants)]
|
||||||
|
#[strum_discriminants(derive(measured::FixedCardinalityLabel))]
|
||||||
|
pub enum NeonIORequest {
|
||||||
|
Empty,
|
||||||
|
|
||||||
|
// Read requests. These are C-friendly variants of the corresponding structs in
|
||||||
|
// pageserver_page_api.
|
||||||
|
RelSize(CRelSizeRequest),
|
||||||
|
GetPageV(CGetPageVRequest),
|
||||||
|
ReadSlruSegment(CReadSlruSegmentRequest),
|
||||||
|
PrefetchV(CPrefetchVRequest),
|
||||||
|
DbSize(CDbSizeRequest),
|
||||||
|
|
||||||
|
/// This is like GetPageV, but bypasses the LFC and allows specifiying the
|
||||||
|
/// request LSNs directly. For debugging purposes only.
|
||||||
|
GetPageVUncached(CGetPageVUncachedRequest),
|
||||||
|
|
||||||
|
// Write requests. These are needed to keep the relation size cache and LFC up-to-date.
|
||||||
|
// They are not sent to the pageserver.
|
||||||
|
WritePage(CWritePageRequest),
|
||||||
|
RelExtend(CRelExtendRequest),
|
||||||
|
RelZeroExtend(CRelZeroExtendRequest),
|
||||||
|
RelCreate(CRelCreateRequest),
|
||||||
|
RelTruncate(CRelTruncateRequest),
|
||||||
|
RelUnlink(CRelUnlinkRequest),
|
||||||
|
|
||||||
|
// Other requests
|
||||||
|
UpdateCachedRelSize(CUpdateCachedRelSizeRequest),
|
||||||
|
}
|
||||||
|
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Copy, Clone, Debug)]
|
||||||
|
pub enum NeonIOResult {
|
||||||
|
Empty,
|
||||||
|
/// InvalidBlockNumber == 0xffffffff means "rel does not exist"
|
||||||
|
RelSize(u32),
|
||||||
|
|
||||||
|
/// the result pages are written to the shared memory addresses given in the request
|
||||||
|
GetPageV,
|
||||||
|
/// The result is written to the file, path to which is provided
|
||||||
|
/// in the request. The [`u64`] value here is the number of blocks.
|
||||||
|
ReadSlruSegment(u64),
|
||||||
|
|
||||||
|
/// A prefetch request returns as soon as the request has been received by the communicator.
|
||||||
|
/// It is processed in the background.
|
||||||
|
PrefetchVLaunched,
|
||||||
|
|
||||||
|
DbSize(u64),
|
||||||
|
|
||||||
|
// FIXME design compact error codes. Can't easily pass a string or other dynamic data.
|
||||||
|
// currently, this is 'errno'
|
||||||
|
Error(i32),
|
||||||
|
|
||||||
|
Aborted,
|
||||||
|
|
||||||
|
/// used for all write requests
|
||||||
|
WriteOK,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl NeonIORequest {
|
||||||
|
/// All requests include a unique request ID, which can be used to trace the execution
|
||||||
|
/// of a request all the way to the pageservers. The request ID needs to be unique
|
||||||
|
/// within the lifetime of the Postgres instance (but not across servers or across
|
||||||
|
/// restarts of the same server).
|
||||||
|
pub fn request_id(&self) -> u64 {
|
||||||
|
use NeonIORequest::*;
|
||||||
|
match self {
|
||||||
|
Empty => 0,
|
||||||
|
RelSize(req) => req.request_id,
|
||||||
|
GetPageV(req) => req.request_id,
|
||||||
|
GetPageVUncached(req) => req.request_id,
|
||||||
|
ReadSlruSegment(req) => req.request_id,
|
||||||
|
PrefetchV(req) => req.request_id,
|
||||||
|
DbSize(req) => req.request_id,
|
||||||
|
WritePage(req) => req.request_id,
|
||||||
|
RelExtend(req) => req.request_id,
|
||||||
|
RelZeroExtend(req) => req.request_id,
|
||||||
|
RelCreate(req) => req.request_id,
|
||||||
|
RelTruncate(req) => req.request_id,
|
||||||
|
RelUnlink(req) => req.request_id,
|
||||||
|
UpdateCachedRelSize(req) => req.request_id,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Special quick result to a CGetPageVRequest request, indicating that the
|
||||||
|
/// the requested pages are present in the local file cache. The backend can
|
||||||
|
/// read the blocks directly from the given LFC blocks.
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Copy, Clone, Debug)]
|
||||||
|
pub struct CCachedGetPageVResult {
|
||||||
|
pub cache_block_numbers: [u64; MAX_GETPAGEV_PAGES],
|
||||||
|
}
|
||||||
|
|
||||||
|
/// ShmemBuf represents a buffer in shared memory.
|
||||||
|
///
|
||||||
|
/// SAFETY: The pointer must point to an area in shared memory. The functions allow you to liberally
|
||||||
|
/// get a mutable pointer to the contents; it is the caller's responsibility to ensure that you
|
||||||
|
/// don't access a buffer that's you're not allowed to. Inappropriate access to the buffer doesn't
|
||||||
|
/// violate Rust's safety semantics, but it will mess up and crash Postgres.
|
||||||
|
///
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Copy, Clone, Debug)]
|
||||||
|
pub struct ShmemBuf {
|
||||||
|
// Pointer to where the result is written or where to read from. Must point into a buffer in shared memory!
|
||||||
|
pub ptr: *mut u8,
|
||||||
|
}
|
||||||
|
|
||||||
|
unsafe impl Send for ShmemBuf {}
|
||||||
|
unsafe impl Sync for ShmemBuf {}
|
||||||
|
|
||||||
|
unsafe impl uring_common::buf::IoBuf for ShmemBuf {
|
||||||
|
fn stable_ptr(&self) -> *const u8 {
|
||||||
|
self.ptr
|
||||||
|
}
|
||||||
|
|
||||||
|
fn bytes_init(&self) -> usize {
|
||||||
|
crate::BLCKSZ
|
||||||
|
}
|
||||||
|
|
||||||
|
fn bytes_total(&self) -> usize {
|
||||||
|
crate::BLCKSZ
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
unsafe impl uring_common::buf::IoBufMut for ShmemBuf {
|
||||||
|
fn stable_mut_ptr(&mut self) -> *mut u8 {
|
||||||
|
self.ptr
|
||||||
|
}
|
||||||
|
|
||||||
|
unsafe fn set_init(&mut self, pos: usize) {
|
||||||
|
if pos > crate::BLCKSZ {
|
||||||
|
panic!(
|
||||||
|
"set_init called past end of buffer, pos {}, buffer size {}",
|
||||||
|
pos,
|
||||||
|
crate::BLCKSZ
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ShmemBuf {
|
||||||
|
pub fn as_mut_ptr(&self) -> *mut u8 {
|
||||||
|
self.ptr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Copy, Clone, Debug)]
|
||||||
|
pub struct CRelSizeRequest {
|
||||||
|
pub request_id: u64,
|
||||||
|
pub spc_oid: COid,
|
||||||
|
pub db_oid: COid,
|
||||||
|
pub rel_number: u32,
|
||||||
|
pub fork_number: u8,
|
||||||
|
pub allow_missing: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Copy, Clone, Debug)]
|
||||||
|
pub struct CGetPageVRequest {
|
||||||
|
pub request_id: u64,
|
||||||
|
pub spc_oid: COid,
|
||||||
|
pub db_oid: COid,
|
||||||
|
pub rel_number: u32,
|
||||||
|
pub fork_number: u8,
|
||||||
|
pub block_number: u32,
|
||||||
|
pub nblocks: u8,
|
||||||
|
|
||||||
|
// These fields define where the result is written. Must point into a buffer in shared memory!
|
||||||
|
pub dest: [ShmemBuf; MAX_GETPAGEV_PAGES],
|
||||||
|
}
|
||||||
|
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Copy, Clone, Debug)]
|
||||||
|
pub struct CGetPageVUncachedRequest {
|
||||||
|
pub request_id: u64,
|
||||||
|
pub spc_oid: COid,
|
||||||
|
pub db_oid: COid,
|
||||||
|
pub rel_number: u32,
|
||||||
|
pub fork_number: u8,
|
||||||
|
pub block_number: u32,
|
||||||
|
pub nblocks: u8,
|
||||||
|
|
||||||
|
pub request_lsn: CLsn,
|
||||||
|
pub not_modified_since: CLsn,
|
||||||
|
|
||||||
|
// These fields define where the result is written. Must point into a buffer in shared memory!
|
||||||
|
pub dest: [ShmemBuf; MAX_GETPAGEV_PAGES],
|
||||||
|
}
|
||||||
|
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Copy, Clone, Debug)]
|
||||||
|
pub struct CReadSlruSegmentRequest {
|
||||||
|
pub request_id: u64,
|
||||||
|
pub slru_kind: SlruKind,
|
||||||
|
pub segment_number: u32,
|
||||||
|
pub request_lsn: CLsn,
|
||||||
|
/// Must be a null-terminated C string containing the file path
|
||||||
|
/// where the communicator will write the SLRU segment.
|
||||||
|
pub destination_file_path: ShmemBuf,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CReadSlruSegmentRequest {
|
||||||
|
/// Returns the file path where the communicator will write the
|
||||||
|
/// SLRU segment.
|
||||||
|
pub(crate) fn destination_file_path(&self) -> String {
|
||||||
|
unsafe { CStr::from_ptr(self.destination_file_path.as_mut_ptr() as *const _) }
|
||||||
|
.to_string_lossy()
|
||||||
|
.into_owned()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Copy, Clone, Debug)]
|
||||||
|
pub struct CPrefetchVRequest {
|
||||||
|
pub request_id: u64,
|
||||||
|
pub spc_oid: COid,
|
||||||
|
pub db_oid: COid,
|
||||||
|
pub rel_number: u32,
|
||||||
|
pub fork_number: u8,
|
||||||
|
pub block_number: u32,
|
||||||
|
pub nblocks: u8,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Copy, Clone, Debug)]
|
||||||
|
pub struct CDbSizeRequest {
|
||||||
|
pub request_id: u64,
|
||||||
|
pub db_oid: COid,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Copy, Clone, Debug)]
|
||||||
|
pub struct CWritePageRequest {
|
||||||
|
pub request_id: u64,
|
||||||
|
pub spc_oid: COid,
|
||||||
|
pub db_oid: COid,
|
||||||
|
pub rel_number: u32,
|
||||||
|
pub fork_number: u8,
|
||||||
|
pub block_number: u32,
|
||||||
|
pub lsn: CLsn,
|
||||||
|
|
||||||
|
// `src` defines the new page contents. Must point into a buffer in shared memory!
|
||||||
|
pub src: ShmemBuf,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Copy, Clone, Debug)]
|
||||||
|
pub struct CRelExtendRequest {
|
||||||
|
pub request_id: u64,
|
||||||
|
pub spc_oid: COid,
|
||||||
|
pub db_oid: COid,
|
||||||
|
pub rel_number: u32,
|
||||||
|
pub fork_number: u8,
|
||||||
|
pub block_number: u32,
|
||||||
|
pub lsn: CLsn,
|
||||||
|
|
||||||
|
// `src` defines the new page contents. Must point into a buffer in shared memory!
|
||||||
|
pub src: ShmemBuf,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Copy, Clone, Debug)]
|
||||||
|
pub struct CRelZeroExtendRequest {
|
||||||
|
pub request_id: u64,
|
||||||
|
pub spc_oid: COid,
|
||||||
|
pub db_oid: COid,
|
||||||
|
pub rel_number: u32,
|
||||||
|
pub fork_number: u8,
|
||||||
|
pub block_number: u32,
|
||||||
|
pub nblocks: u32,
|
||||||
|
pub lsn: CLsn,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Copy, Clone, Debug)]
|
||||||
|
pub struct CRelCreateRequest {
|
||||||
|
pub request_id: u64,
|
||||||
|
pub spc_oid: COid,
|
||||||
|
pub db_oid: COid,
|
||||||
|
pub rel_number: u32,
|
||||||
|
pub fork_number: u8,
|
||||||
|
pub lsn: CLsn,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Copy, Clone, Debug)]
|
||||||
|
pub struct CRelTruncateRequest {
|
||||||
|
pub request_id: u64,
|
||||||
|
pub spc_oid: COid,
|
||||||
|
pub db_oid: COid,
|
||||||
|
pub rel_number: u32,
|
||||||
|
pub fork_number: u8,
|
||||||
|
pub nblocks: u32,
|
||||||
|
pub lsn: CLsn,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Copy, Clone, Debug)]
|
||||||
|
pub struct CRelUnlinkRequest {
|
||||||
|
pub request_id: u64,
|
||||||
|
pub spc_oid: COid,
|
||||||
|
pub db_oid: COid,
|
||||||
|
pub rel_number: u32,
|
||||||
|
pub fork_number: u8,
|
||||||
|
pub lsn: CLsn,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CRelSizeRequest {
|
||||||
|
pub fn reltag(&self) -> page_api::RelTag {
|
||||||
|
page_api::RelTag {
|
||||||
|
spcnode: self.spc_oid,
|
||||||
|
dbnode: self.db_oid,
|
||||||
|
relnode: self.rel_number,
|
||||||
|
forknum: self.fork_number,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CGetPageVRequest {
|
||||||
|
pub fn reltag(&self) -> page_api::RelTag {
|
||||||
|
page_api::RelTag {
|
||||||
|
spcnode: self.spc_oid,
|
||||||
|
dbnode: self.db_oid,
|
||||||
|
relnode: self.rel_number,
|
||||||
|
forknum: self.fork_number,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CGetPageVUncachedRequest {
|
||||||
|
pub fn reltag(&self) -> page_api::RelTag {
|
||||||
|
page_api::RelTag {
|
||||||
|
spcnode: self.spc_oid,
|
||||||
|
dbnode: self.db_oid,
|
||||||
|
relnode: self.rel_number,
|
||||||
|
forknum: self.fork_number,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CPrefetchVRequest {
|
||||||
|
pub fn reltag(&self) -> page_api::RelTag {
|
||||||
|
page_api::RelTag {
|
||||||
|
spcnode: self.spc_oid,
|
||||||
|
dbnode: self.db_oid,
|
||||||
|
relnode: self.rel_number,
|
||||||
|
forknum: self.fork_number,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CWritePageRequest {
|
||||||
|
pub fn reltag(&self) -> page_api::RelTag {
|
||||||
|
page_api::RelTag {
|
||||||
|
spcnode: self.spc_oid,
|
||||||
|
dbnode: self.db_oid,
|
||||||
|
relnode: self.rel_number,
|
||||||
|
forknum: self.fork_number,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CRelExtendRequest {
|
||||||
|
pub fn reltag(&self) -> page_api::RelTag {
|
||||||
|
page_api::RelTag {
|
||||||
|
spcnode: self.spc_oid,
|
||||||
|
dbnode: self.db_oid,
|
||||||
|
relnode: self.rel_number,
|
||||||
|
forknum: self.fork_number,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CRelZeroExtendRequest {
|
||||||
|
pub fn reltag(&self) -> page_api::RelTag {
|
||||||
|
page_api::RelTag {
|
||||||
|
spcnode: self.spc_oid,
|
||||||
|
dbnode: self.db_oid,
|
||||||
|
relnode: self.rel_number,
|
||||||
|
forknum: self.fork_number,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CRelCreateRequest {
|
||||||
|
pub fn reltag(&self) -> page_api::RelTag {
|
||||||
|
page_api::RelTag {
|
||||||
|
spcnode: self.spc_oid,
|
||||||
|
dbnode: self.db_oid,
|
||||||
|
relnode: self.rel_number,
|
||||||
|
forknum: self.fork_number,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CRelTruncateRequest {
|
||||||
|
pub fn reltag(&self) -> page_api::RelTag {
|
||||||
|
page_api::RelTag {
|
||||||
|
spcnode: self.spc_oid,
|
||||||
|
dbnode: self.db_oid,
|
||||||
|
relnode: self.rel_number,
|
||||||
|
forknum: self.fork_number,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CRelUnlinkRequest {
|
||||||
|
pub fn reltag(&self) -> page_api::RelTag {
|
||||||
|
page_api::RelTag {
|
||||||
|
spcnode: self.spc_oid,
|
||||||
|
dbnode: self.db_oid,
|
||||||
|
relnode: self.rel_number,
|
||||||
|
forknum: self.fork_number,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[repr(C)]
|
||||||
|
#[derive(Copy, Clone, Debug)]
|
||||||
|
pub struct CUpdateCachedRelSizeRequest {
|
||||||
|
pub request_id: u64,
|
||||||
|
pub spc_oid: COid,
|
||||||
|
pub db_oid: COid,
|
||||||
|
pub rel_number: u32,
|
||||||
|
pub fork_number: u8,
|
||||||
|
pub nblocks: u32,
|
||||||
|
pub lsn: CLsn,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CUpdateCachedRelSizeRequest {
|
||||||
|
pub fn reltag(&self) -> page_api::RelTag {
|
||||||
|
page_api::RelTag {
|
||||||
|
spcnode: self.spc_oid,
|
||||||
|
dbnode: self.db_oid,
|
||||||
|
relnode: self.rel_number,
|
||||||
|
forknum: self.fork_number,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -4,10 +4,13 @@
|
|||||||
//!
|
//!
|
||||||
//! These are called from the communicator threads! Careful what you do, most Postgres
|
//! These are called from the communicator threads! Careful what you do, most Postgres
|
||||||
//! functions are not safe to call in that context.
|
//! functions are not safe to call in that context.
|
||||||
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
#[cfg(not(test))]
|
#[cfg(not(test))]
|
||||||
unsafe extern "C" {
|
unsafe extern "C" {
|
||||||
|
pub fn notify_proc_unsafe(procno: std::ffi::c_int);
|
||||||
pub fn callback_set_my_latch_unsafe();
|
pub fn callback_set_my_latch_unsafe();
|
||||||
|
pub fn callback_get_request_lsn_unsafe() -> crate::neon_request::CLsn;
|
||||||
pub fn callback_get_lfc_metrics_unsafe() -> LfcMetrics;
|
pub fn callback_get_lfc_metrics_unsafe() -> LfcMetrics;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -16,20 +19,36 @@ unsafe extern "C" {
|
|||||||
// package, but the code coverage build still builds these and tries to link with the
|
// package, but the code coverage build still builds these and tries to link with the
|
||||||
// external C code.)
|
// external C code.)
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
unsafe fn notify_proc_unsafe(_procno: std::ffi::c_int) {
|
||||||
|
panic!("not usable in unit tests");
|
||||||
|
}
|
||||||
|
#[cfg(test)]
|
||||||
unsafe fn callback_set_my_latch_unsafe() {
|
unsafe fn callback_set_my_latch_unsafe() {
|
||||||
panic!("not usable in unit tests");
|
panic!("not usable in unit tests");
|
||||||
}
|
}
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
unsafe fn callback_get_request_lsn_unsafe() -> crate::neon_request::CLsn {
|
||||||
|
panic!("not usable in unit tests");
|
||||||
|
}
|
||||||
|
#[cfg(test)]
|
||||||
unsafe fn callback_get_lfc_metrics_unsafe() -> LfcMetrics {
|
unsafe fn callback_get_lfc_metrics_unsafe() -> LfcMetrics {
|
||||||
panic!("not usable in unit tests");
|
panic!("not usable in unit tests");
|
||||||
}
|
}
|
||||||
|
|
||||||
// safe wrappers
|
// safe wrappers
|
||||||
|
|
||||||
|
pub(super) fn notify_proc(procno: std::ffi::c_int) {
|
||||||
|
unsafe { notify_proc_unsafe(procno) };
|
||||||
|
}
|
||||||
|
|
||||||
pub(super) fn callback_set_my_latch() {
|
pub(super) fn callback_set_my_latch() {
|
||||||
unsafe { callback_set_my_latch_unsafe() };
|
unsafe { callback_set_my_latch_unsafe() };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(super) fn get_request_lsn() -> Lsn {
|
||||||
|
Lsn(unsafe { callback_get_request_lsn_unsafe() })
|
||||||
|
}
|
||||||
|
|
||||||
pub(super) fn callback_get_lfc_metrics() -> LfcMetrics {
|
pub(super) fn callback_get_lfc_metrics() -> LfcMetrics {
|
||||||
unsafe { callback_get_lfc_metrics_unsafe() }
|
unsafe { callback_get_lfc_metrics_unsafe() }
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,71 +19,105 @@ use http::StatusCode;
|
|||||||
use http::header::CONTENT_TYPE;
|
use http::header::CONTENT_TYPE;
|
||||||
|
|
||||||
use measured::MetricGroup;
|
use measured::MetricGroup;
|
||||||
|
use measured::metric::MetricEncoding;
|
||||||
|
use measured::metric::gauge::GaugeState;
|
||||||
|
use measured::metric::group::Encoding;
|
||||||
use measured::text::BufferedTextEncoder;
|
use measured::text::BufferedTextEncoder;
|
||||||
|
|
||||||
use std::io::ErrorKind;
|
use std::io::ErrorKind;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
use tokio::net::UnixListener;
|
use tokio::net::UnixListener;
|
||||||
|
|
||||||
use crate::NEON_COMMUNICATOR_SOCKET_NAME;
|
use crate::NEON_COMMUNICATOR_SOCKET_NAME;
|
||||||
|
use crate::worker_process::lfc_metrics::LfcMetricsCollector;
|
||||||
use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
|
use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
|
||||||
|
|
||||||
impl CommunicatorWorkerProcessStruct {
|
enum ControlSocketState<'a> {
|
||||||
/// Launch the listener
|
Full(&'a CommunicatorWorkerProcessStruct<'a>),
|
||||||
pub(crate) async fn launch_control_socket_listener(
|
Legacy(LegacyControlSocketState),
|
||||||
&'static self,
|
}
|
||||||
) -> Result<(), std::io::Error> {
|
|
||||||
use axum::routing::get;
|
|
||||||
let app = Router::new()
|
|
||||||
.route("/metrics", get(get_metrics))
|
|
||||||
.route("/autoscaling_metrics", get(get_autoscaling_metrics))
|
|
||||||
.route("/debug/panic", get(handle_debug_panic))
|
|
||||||
.with_state(self);
|
|
||||||
|
|
||||||
// If the server is restarted, there might be an old socket still
|
struct LegacyControlSocketState {
|
||||||
// lying around. Remove it first.
|
pub(crate) lfc_metrics: LfcMetricsCollector,
|
||||||
match std::fs::remove_file(NEON_COMMUNICATOR_SOCKET_NAME) {
|
}
|
||||||
Ok(()) => {
|
|
||||||
tracing::warn!("removed stale control socket");
|
|
||||||
}
|
|
||||||
Err(e) if e.kind() == ErrorKind::NotFound => {}
|
|
||||||
Err(e) => {
|
|
||||||
tracing::error!("could not remove stale control socket: {e:#}");
|
|
||||||
// Try to proceed anyway. It will likely fail below though.
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Create the unix domain socket and start listening on it
|
|
||||||
let listener = UnixListener::bind(NEON_COMMUNICATOR_SOCKET_NAME)?;
|
|
||||||
|
|
||||||
tokio::spawn(async {
|
|
||||||
tracing::info!("control socket listener spawned");
|
|
||||||
axum::serve(listener, app)
|
|
||||||
.await
|
|
||||||
.expect("axum::serve never returns")
|
|
||||||
});
|
|
||||||
|
|
||||||
|
impl<T> MetricGroup<T> for LegacyControlSocketState
|
||||||
|
where
|
||||||
|
T: Encoding,
|
||||||
|
GaugeState: MetricEncoding<T>,
|
||||||
|
{
|
||||||
|
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
|
||||||
|
self.lfc_metrics.collect_group_into(enc)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Launch the listener
|
||||||
|
pub(crate) async fn launch_listener(
|
||||||
|
worker: Option<&'static CommunicatorWorkerProcessStruct<'static>>,
|
||||||
|
) -> Result<(), std::io::Error> {
|
||||||
|
use axum::routing::get;
|
||||||
|
|
||||||
|
let state = match worker {
|
||||||
|
Some(worker) => ControlSocketState::Full(worker),
|
||||||
|
None => ControlSocketState::Legacy(LegacyControlSocketState {
|
||||||
|
lfc_metrics: LfcMetricsCollector,
|
||||||
|
}),
|
||||||
|
};
|
||||||
|
|
||||||
|
let app = Router::new()
|
||||||
|
.route("/metrics", get(get_metrics))
|
||||||
|
.route("/autoscaling_metrics", get(get_autoscaling_metrics))
|
||||||
|
.route("/debug/panic", get(handle_debug_panic))
|
||||||
|
.route("/debug/dump_cache_map", get(dump_cache_map))
|
||||||
|
.with_state(Arc::new(state));
|
||||||
|
|
||||||
|
// If the server is restarted, there might be an old socket still
|
||||||
|
// lying around. Remove it first.
|
||||||
|
match std::fs::remove_file(NEON_COMMUNICATOR_SOCKET_NAME) {
|
||||||
|
Ok(()) => {
|
||||||
|
tracing::warn!("removed stale control socket");
|
||||||
|
}
|
||||||
|
Err(e) if e.kind() == ErrorKind::NotFound => {}
|
||||||
|
Err(e) => {
|
||||||
|
tracing::error!("could not remove stale control socket: {e:#}");
|
||||||
|
// Try to proceed anyway. It will likely fail below though.
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Create the unix domain socket and start listening on it
|
||||||
|
let listener = UnixListener::bind(NEON_COMMUNICATOR_SOCKET_NAME)?;
|
||||||
|
|
||||||
|
tokio::spawn(async {
|
||||||
|
tracing::info!("control socket listener spawned");
|
||||||
|
axum::serve(listener, app)
|
||||||
|
.await
|
||||||
|
.expect("axum::serve never returns")
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
/// Expose all Prometheus metrics.
|
/// Expose all Prometheus metrics.
|
||||||
async fn get_metrics(State(state): State<&CommunicatorWorkerProcessStruct>) -> Response {
|
async fn get_metrics(State(state): State<Arc<ControlSocketState<'_>>>) -> Response {
|
||||||
tracing::trace!("/metrics requested");
|
match state.as_ref() {
|
||||||
metrics_to_response(&state).await
|
ControlSocketState::Full(worker) => metrics_to_response(&worker).await,
|
||||||
|
ControlSocketState::Legacy(legacy) => metrics_to_response(&legacy).await,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Expose Prometheus metrics, for use by the autoscaling agent.
|
/// Expose Prometheus metrics, for use by the autoscaling agent.
|
||||||
///
|
///
|
||||||
/// This is a subset of all the metrics.
|
/// This is a subset of all the metrics.
|
||||||
async fn get_autoscaling_metrics(
|
async fn get_autoscaling_metrics(State(state): State<Arc<ControlSocketState<'_>>>) -> Response {
|
||||||
State(state): State<&CommunicatorWorkerProcessStruct>,
|
match state.as_ref() {
|
||||||
) -> Response {
|
ControlSocketState::Full(worker) => metrics_to_response(&worker.lfc_metrics).await,
|
||||||
tracing::trace!("/metrics requested");
|
ControlSocketState::Legacy(legacy) => metrics_to_response(&legacy.lfc_metrics).await,
|
||||||
metrics_to_response(&state.lfc_metrics).await
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_debug_panic(State(_state): State<&CommunicatorWorkerProcessStruct>) -> Response {
|
async fn handle_debug_panic(State(_state): State<Arc<ControlSocketState<'_>>>) -> Response {
|
||||||
panic!("test HTTP handler task panic");
|
panic!("test HTTP handler task panic");
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -100,3 +134,23 @@ async fn metrics_to_response(metrics: &(dyn MetricGroup<BufferedTextEncoder> + S
|
|||||||
.body(Body::from(enc.finish()))
|
.body(Body::from(enc.finish()))
|
||||||
.unwrap()
|
.unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn dump_cache_map(State(state): State<Arc<ControlSocketState<'_>>>) -> Response {
|
||||||
|
match state.as_ref() {
|
||||||
|
ControlSocketState::Full(worker) => {
|
||||||
|
let mut buf: Vec<u8> = Vec::new();
|
||||||
|
worker.cache.dump_map(&mut buf);
|
||||||
|
|
||||||
|
Response::builder()
|
||||||
|
.status(StatusCode::OK)
|
||||||
|
.header(CONTENT_TYPE, "application/text")
|
||||||
|
.body(Body::from(buf))
|
||||||
|
.unwrap()
|
||||||
|
}
|
||||||
|
ControlSocketState::Legacy(_) => Response::builder()
|
||||||
|
.status(StatusCode::NOT_FOUND)
|
||||||
|
.header(CONTENT_TYPE, "application/text")
|
||||||
|
.body(Body::from(Vec::new()))
|
||||||
|
.unwrap(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
95
pgxn/neon/communicator/src/worker_process/in_progress_ios.rs
Normal file
95
pgxn/neon/communicator/src/worker_process/in_progress_ios.rs
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
//! Lock table to ensure that only one IO request is in flight for a given
|
||||||
|
//! block (or relation or database metadata) at a time
|
||||||
|
|
||||||
|
use std::cmp::Eq;
|
||||||
|
use std::hash::Hash;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use tokio::sync::{Mutex, OwnedMutexGuard};
|
||||||
|
|
||||||
|
use clashmap::ClashMap;
|
||||||
|
use clashmap::Entry;
|
||||||
|
|
||||||
|
use pageserver_page_api::RelTag;
|
||||||
|
|
||||||
|
#[derive(Clone, Eq, Hash, PartialEq)]
|
||||||
|
pub enum RequestInProgressKey {
|
||||||
|
Db(u32),
|
||||||
|
Rel(RelTag),
|
||||||
|
Block(RelTag, u32),
|
||||||
|
}
|
||||||
|
|
||||||
|
type RequestId = u64;
|
||||||
|
|
||||||
|
pub type RequestInProgressTable = MutexHashMap<RequestInProgressKey, RequestId>;
|
||||||
|
|
||||||
|
// more primitive locking thingie:
|
||||||
|
|
||||||
|
pub struct MutexHashMap<K, V>
|
||||||
|
where
|
||||||
|
K: Clone + Eq + Hash,
|
||||||
|
{
|
||||||
|
lock_table: ClashMap<K, (V, Arc<Mutex<()>>)>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct MutexHashMapGuard<'a, K, V>
|
||||||
|
where
|
||||||
|
K: Clone + Eq + Hash,
|
||||||
|
{
|
||||||
|
pub key: K,
|
||||||
|
map: &'a MutexHashMap<K, V>,
|
||||||
|
mutex: Arc<Mutex<()>>,
|
||||||
|
_guard: OwnedMutexGuard<()>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a, K, V> Drop for MutexHashMapGuard<'a, K, V>
|
||||||
|
where
|
||||||
|
K: Clone + Eq + Hash,
|
||||||
|
{
|
||||||
|
fn drop(&mut self) {
|
||||||
|
let (_old_key, old_val) = self.map.lock_table.remove(&self.key).unwrap();
|
||||||
|
assert!(Arc::ptr_eq(&old_val.1, &self.mutex));
|
||||||
|
|
||||||
|
// the guard will be dropped as we return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<K, V> MutexHashMap<K, V>
|
||||||
|
where
|
||||||
|
K: Clone + Eq + Hash,
|
||||||
|
V: std::fmt::Display + Copy,
|
||||||
|
{
|
||||||
|
pub fn new() -> MutexHashMap<K, V> {
|
||||||
|
MutexHashMap {
|
||||||
|
lock_table: ClashMap::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn lock<'a>(&'a self, key: K, val: V) -> MutexHashMapGuard<'a, K, V> {
|
||||||
|
let my_mutex = Arc::new(Mutex::new(()));
|
||||||
|
let my_guard = Arc::clone(&my_mutex).lock_owned().await;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
let (request_id, lock) = match self.lock_table.entry(key.clone()) {
|
||||||
|
Entry::Occupied(e) => {
|
||||||
|
let e = e.get();
|
||||||
|
(e.0, Arc::clone(&e.1))
|
||||||
|
}
|
||||||
|
Entry::Vacant(e) => {
|
||||||
|
e.insert((val, Arc::clone(&my_mutex)));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
tracing::info!("waiting for conflicting IO {request_id} to complete");
|
||||||
|
let _ = lock.lock().await;
|
||||||
|
tracing::info!("conflicting IO {request_id} completed");
|
||||||
|
}
|
||||||
|
|
||||||
|
MutexHashMapGuard {
|
||||||
|
key,
|
||||||
|
map: self,
|
||||||
|
mutex: my_mutex,
|
||||||
|
_guard: my_guard,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,34 +1,126 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
use std::os::fd::AsRawFd;
|
||||||
|
use std::os::fd::OwnedFd;
|
||||||
|
use std::path::PathBuf;
|
||||||
use std::str::FromStr as _;
|
use std::str::FromStr as _;
|
||||||
|
|
||||||
|
use crate::backend_comms::NeonIORequestSlot;
|
||||||
|
use crate::file_cache::FileCache;
|
||||||
|
use crate::global_allocator::MyAllocatorCollector;
|
||||||
|
use crate::init::CommunicatorInitStruct;
|
||||||
|
use crate::integrated_cache::{CacheResult, IntegratedCacheWriteAccess};
|
||||||
|
use crate::neon_request::{CGetPageVRequest, CGetPageVUncachedRequest, CPrefetchVRequest};
|
||||||
|
use crate::neon_request::{INVALID_BLOCK_NUMBER, NeonIORequest, NeonIOResult};
|
||||||
|
use crate::worker_process::control_socket;
|
||||||
|
use crate::worker_process::in_progress_ios::{RequestInProgressKey, RequestInProgressTable};
|
||||||
use crate::worker_process::lfc_metrics::LfcMetricsCollector;
|
use crate::worker_process::lfc_metrics::LfcMetricsCollector;
|
||||||
|
use pageserver_client_grpc::{PageserverClient, ShardSpec, ShardStripeSize};
|
||||||
|
use pageserver_page_api as page_api;
|
||||||
|
|
||||||
|
use tokio::io::AsyncReadExt;
|
||||||
|
use tokio_pipe::PipeRead;
|
||||||
|
use uring_common::buf::IoBuf;
|
||||||
|
|
||||||
use measured::MetricGroup;
|
use measured::MetricGroup;
|
||||||
use measured::metric::MetricEncoding;
|
use measured::metric::MetricEncoding;
|
||||||
|
use measured::metric::counter::CounterState;
|
||||||
use measured::metric::gauge::GaugeState;
|
use measured::metric::gauge::GaugeState;
|
||||||
use measured::metric::group::Encoding;
|
use measured::metric::group::Encoding;
|
||||||
|
use measured::{Gauge, GaugeVec};
|
||||||
use utils::id::{TenantId, TimelineId};
|
use utils::id::{TenantId, TimelineId};
|
||||||
|
|
||||||
pub struct CommunicatorWorkerProcessStruct {
|
use super::callbacks::{get_request_lsn, notify_proc};
|
||||||
|
|
||||||
|
use tracing::{error, info, info_span, trace};
|
||||||
|
|
||||||
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
|
pub struct CommunicatorWorkerProcessStruct<'a> {
|
||||||
|
/// Tokio runtime that the main loop and any other related tasks runs in.
|
||||||
runtime: tokio::runtime::Runtime,
|
runtime: tokio::runtime::Runtime,
|
||||||
|
|
||||||
|
/// Client to communicate with the pageserver
|
||||||
|
client: PageserverClient,
|
||||||
|
|
||||||
|
/// Request slots that backends use to send IO requests to the communicator.
|
||||||
|
neon_request_slots: &'a [NeonIORequestSlot],
|
||||||
|
|
||||||
|
/// Notification pipe. Backends use this to notify the communicator that a request is waiting to
|
||||||
|
/// be processed in one of the request slots.
|
||||||
|
submission_pipe_read_fd: OwnedFd,
|
||||||
|
|
||||||
|
/// Locking table for all in-progress IO requests.
|
||||||
|
in_progress_table: RequestInProgressTable,
|
||||||
|
|
||||||
|
/// Local File Cache, relation size tracking, last-written LSN tracking
|
||||||
|
pub(crate) cache: IntegratedCacheWriteAccess<'a>,
|
||||||
|
|
||||||
/*** Metrics ***/
|
/*** Metrics ***/
|
||||||
pub(crate) lfc_metrics: LfcMetricsCollector,
|
pub(crate) lfc_metrics: LfcMetricsCollector,
|
||||||
|
|
||||||
|
request_counters: GaugeVec<RequestTypeLabelGroupSet>,
|
||||||
|
|
||||||
|
getpage_cache_misses_counter: Gauge,
|
||||||
|
getpage_cache_hits_counter: Gauge,
|
||||||
|
|
||||||
|
// For the requests that affect multiple blocks, have separate counters for the # of blocks affected
|
||||||
|
request_nblocks_counters: GaugeVec<RequestTypeLabelGroupSet>,
|
||||||
|
|
||||||
|
allocator_metrics: MyAllocatorCollector,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Define a label group, consisting of 1 or more label values
|
||||||
|
#[derive(measured::LabelGroup)]
|
||||||
|
#[label(set = RequestTypeLabelGroupSet)]
|
||||||
|
struct RequestTypeLabelGroup {
|
||||||
|
request_type: crate::neon_request::NeonIORequestDiscriminants,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RequestTypeLabelGroup {
|
||||||
|
fn from_req(req: &NeonIORequest) -> Self {
|
||||||
|
RequestTypeLabelGroup {
|
||||||
|
request_type: req.into(),
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Launch the communicator process's Rust subsystems
|
/// Launch the communicator process's Rust subsystems
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
pub(super) fn init_legacy() -> Result<(), String> {
|
||||||
|
let runtime = tokio::runtime::Builder::new_multi_thread()
|
||||||
|
.enable_all()
|
||||||
|
.thread_name("communicator thread")
|
||||||
|
.build()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Start the listener on the control socket
|
||||||
|
runtime
|
||||||
|
.block_on(control_socket::launch_listener(None))
|
||||||
|
.map_err(|e| e.to_string())?;
|
||||||
|
|
||||||
|
Box::leak(Box::new(runtime));
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Launch the communicator process's Rust subsystems
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
pub(super) fn init(
|
pub(super) fn init(
|
||||||
tenant_id: Option<&str>,
|
cis: CommunicatorInitStruct,
|
||||||
timeline_id: Option<&str>,
|
tenant_id: &str,
|
||||||
) -> Result<&'static CommunicatorWorkerProcessStruct, String> {
|
timeline_id: &str,
|
||||||
|
auth_token: Option<&str>,
|
||||||
|
shard_map: HashMap<utils::shard::ShardIndex, String>,
|
||||||
|
stripe_size: Option<ShardStripeSize>,
|
||||||
|
initial_file_cache_size: u64,
|
||||||
|
file_cache_path: Option<PathBuf>,
|
||||||
|
) -> Result<&'static CommunicatorWorkerProcessStruct<'static>, String> {
|
||||||
// The caller validated these already
|
// The caller validated these already
|
||||||
let _tenant_id = tenant_id
|
let tenant_id = TenantId::from_str(tenant_id).map_err(|e| format!("invalid tenant ID: {e}"))?;
|
||||||
.map(TenantId::from_str)
|
let timeline_id =
|
||||||
.transpose()
|
TimelineId::from_str(timeline_id).map_err(|e| format!("invalid timeline ID: {e}"))?;
|
||||||
.map_err(|e| format!("invalid tenant ID: {e}"))?;
|
let shard_spec =
|
||||||
let _timeline_id = timeline_id
|
ShardSpec::new(shard_map, stripe_size).map_err(|e| format!("invalid shard spec: {e}:"))?;
|
||||||
.map(TimelineId::from_str)
|
|
||||||
.transpose()
|
|
||||||
.map_err(|e| format!("invalid timeline ID: {e}"))?;
|
|
||||||
|
|
||||||
let runtime = tokio::runtime::Builder::new_multi_thread()
|
let runtime = tokio::runtime::Builder::new_multi_thread()
|
||||||
.enable_all()
|
.enable_all()
|
||||||
@@ -36,31 +128,716 @@ pub(super) fn init(
|
|||||||
.build()
|
.build()
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
|
let last_lsn = get_request_lsn();
|
||||||
|
|
||||||
|
let file_cache = if let Some(path) = file_cache_path {
|
||||||
|
Some(FileCache::new(&path, initial_file_cache_size).expect("could not create cache file"))
|
||||||
|
} else {
|
||||||
|
// FIXME: temporarily for testing, use LFC even if disabled
|
||||||
|
Some(
|
||||||
|
FileCache::new(&PathBuf::from("new_filecache"), 1000)
|
||||||
|
.expect("could not create cache file"),
|
||||||
|
)
|
||||||
|
};
|
||||||
|
|
||||||
|
// Initialize subsystems
|
||||||
|
let cache = cis
|
||||||
|
.integrated_cache_init_struct
|
||||||
|
.worker_process_init(last_lsn, file_cache);
|
||||||
|
|
||||||
|
let client = {
|
||||||
|
let _guard = runtime.enter();
|
||||||
|
PageserverClient::new(
|
||||||
|
tenant_id,
|
||||||
|
timeline_id,
|
||||||
|
shard_spec,
|
||||||
|
auth_token.map(|s| s.to_string()),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.expect("could not create client")
|
||||||
|
};
|
||||||
|
|
||||||
let worker_struct = CommunicatorWorkerProcessStruct {
|
let worker_struct = CommunicatorWorkerProcessStruct {
|
||||||
// Note: it's important to not drop the runtime, or all the tasks are dropped
|
// Note: it's important to not drop the runtime, or all the tasks are dropped
|
||||||
// too. Including it in the returned struct is one way to keep it around.
|
// too. Including it in the returned struct is one way to keep it around.
|
||||||
runtime,
|
runtime,
|
||||||
|
neon_request_slots: cis.neon_request_slots,
|
||||||
|
client,
|
||||||
|
cache,
|
||||||
|
submission_pipe_read_fd: cis.submission_pipe_read_fd,
|
||||||
|
in_progress_table: RequestInProgressTable::new(),
|
||||||
|
|
||||||
// metrics
|
// metrics
|
||||||
lfc_metrics: LfcMetricsCollector,
|
lfc_metrics: LfcMetricsCollector,
|
||||||
|
|
||||||
|
request_counters: GaugeVec::new(),
|
||||||
|
|
||||||
|
getpage_cache_misses_counter: Gauge::new(),
|
||||||
|
getpage_cache_hits_counter: Gauge::new(),
|
||||||
|
|
||||||
|
request_nblocks_counters: GaugeVec::new(),
|
||||||
|
|
||||||
|
allocator_metrics: MyAllocatorCollector::new(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let worker_struct = Box::leak(Box::new(worker_struct));
|
let worker_struct = Box::leak(Box::new(worker_struct));
|
||||||
|
|
||||||
|
let main_loop_handle = worker_struct.runtime.spawn(worker_struct.run());
|
||||||
|
worker_struct.runtime.spawn(async {
|
||||||
|
let err = main_loop_handle.await.unwrap_err();
|
||||||
|
error!("error: {err:?}");
|
||||||
|
});
|
||||||
|
|
||||||
// Start the listener on the control socket
|
// Start the listener on the control socket
|
||||||
worker_struct
|
worker_struct
|
||||||
.runtime
|
.runtime
|
||||||
.block_on(worker_struct.launch_control_socket_listener())
|
.block_on(control_socket::launch_listener(Some(worker_struct)))
|
||||||
.map_err(|e| e.to_string())?;
|
.map_err(|e| e.to_string())?;
|
||||||
|
|
||||||
Ok(worker_struct)
|
Ok(worker_struct)
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T> MetricGroup<T> for CommunicatorWorkerProcessStruct
|
impl<'t> CommunicatorWorkerProcessStruct<'t> {
|
||||||
|
/// Update the configuration
|
||||||
|
pub(super) fn update_shard_map(
|
||||||
|
&self,
|
||||||
|
new_shard_map: HashMap<utils::shard::ShardIndex, String>,
|
||||||
|
stripe_size: Option<ShardStripeSize>,
|
||||||
|
) {
|
||||||
|
let shard_spec = ShardSpec::new(new_shard_map, stripe_size).expect("invalid shard spec");
|
||||||
|
|
||||||
|
{
|
||||||
|
let _in_runtime = self.runtime.enter();
|
||||||
|
if let Err(err) = self.client.update_shards(shard_spec) {
|
||||||
|
tracing::error!("could not update shard map: {err:?}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Main loop of the worker process. Receive requests from the backends and process them.
|
||||||
|
pub(super) async fn run(&'static self) {
|
||||||
|
let mut idxbuf: [u8; 4] = [0; 4];
|
||||||
|
|
||||||
|
let mut submission_pipe_read =
|
||||||
|
PipeRead::try_from(self.submission_pipe_read_fd.as_raw_fd()).expect("invalid pipe fd");
|
||||||
|
|
||||||
|
loop {
|
||||||
|
// Wait for a backend to ring the doorbell
|
||||||
|
match submission_pipe_read.read(&mut idxbuf).await {
|
||||||
|
Ok(4) => {}
|
||||||
|
Ok(nbytes) => panic!("short read ({nbytes} bytes) on communicator pipe"),
|
||||||
|
Err(e) => panic!("error reading from communicator pipe: {e}"),
|
||||||
|
}
|
||||||
|
let slot_idx = u32::from_ne_bytes(idxbuf) as usize;
|
||||||
|
|
||||||
|
// Read the IO request from the slot indicated in the wakeup
|
||||||
|
let Some(slot) = self.neon_request_slots[slot_idx].start_processing_request() else {
|
||||||
|
// This currently should not happen. But if we had multiple threads picking up
|
||||||
|
// requests, and without waiting for the notifications, it could.
|
||||||
|
panic!("no request in slot");
|
||||||
|
};
|
||||||
|
|
||||||
|
// Ok, we have ownership of this request now. We must process it now, there's no going
|
||||||
|
// back.
|
||||||
|
//
|
||||||
|
// Spawn a separate task for every request. That's a little excessive for requests that
|
||||||
|
// can be quickly satisfied from the cache, but we expect that to be rare, because the
|
||||||
|
// requesting backend would have already checked the cache.
|
||||||
|
tokio::spawn(async move {
|
||||||
|
use tracing::Instrument;
|
||||||
|
|
||||||
|
let request_id = slot.get_request().request_id();
|
||||||
|
let owner_procno = slot.get_owner_procno();
|
||||||
|
|
||||||
|
let span = info_span!(
|
||||||
|
"processing",
|
||||||
|
request_id = request_id,
|
||||||
|
slot_idx = slot_idx,
|
||||||
|
procno = owner_procno,
|
||||||
|
);
|
||||||
|
async {
|
||||||
|
// FIXME: as a temporary hack, abort the request if we don't get a response
|
||||||
|
// promptly.
|
||||||
|
//
|
||||||
|
// Lots of regression tests are getting stuck and failing at the moment,
|
||||||
|
// this makes them fail a little faster, which it faster to iterate.
|
||||||
|
// This needs to be removed once more regression tests are passing.
|
||||||
|
// See also similar hack in the backend code, in wait_request_completion()
|
||||||
|
let result = tokio::time::timeout(
|
||||||
|
tokio::time::Duration::from_secs(60),
|
||||||
|
self.handle_request(slot.get_request()),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.unwrap_or_else(|_elapsed| {
|
||||||
|
info!("request {request_id} timed out");
|
||||||
|
NeonIOResult::Error(libc::ETIMEDOUT)
|
||||||
|
});
|
||||||
|
trace!("request {request_id} at slot {slot_idx} completed");
|
||||||
|
|
||||||
|
// Ok, we have completed the IO. Mark the request as completed. After that,
|
||||||
|
// we no longer have ownership of the slot, and must not modify it.
|
||||||
|
slot.completed(result);
|
||||||
|
|
||||||
|
// Notify the backend about the completion. (Note that the backend might see
|
||||||
|
// the completed status even before this; this is just a wakeup)
|
||||||
|
notify_proc(owner_procno);
|
||||||
|
}
|
||||||
|
.instrument(span)
|
||||||
|
.await
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute the 'request_lsn' to use for a pageserver request
|
||||||
|
fn request_lsns(&self, not_modified_since_lsn: Lsn) -> page_api::ReadLsn {
|
||||||
|
let mut request_lsn = get_request_lsn();
|
||||||
|
|
||||||
|
// Is it possible that the last-written LSN is ahead of last flush LSN? Generally not, we
|
||||||
|
// shouldn't evict a page from the buffer cache before all its modifications have been
|
||||||
|
// safely flushed. That's the "WAL before data" rule. However, there are a few exceptions:
|
||||||
|
//
|
||||||
|
// - when creation an index: _bt_blwritepage logs the full page without flushing WAL before
|
||||||
|
// smgrextend (files are fsynced before build ends).
|
||||||
|
//
|
||||||
|
// XXX: If we make a request LSN greater than the current WAL flush LSN, the pageserver would
|
||||||
|
// block waiting for the WAL arrive, until we flush it and it propagates through the
|
||||||
|
// safekeepers to the pageserver. If there's nothing that forces the WAL to be flushed,
|
||||||
|
// the pageserver would get stuck waiting forever. To avoid that, all the write-
|
||||||
|
// functions in communicator_new.c call XLogSetAsyncXactLSN(). That nudges the WAL writer to
|
||||||
|
// perform the flush relatively soon.
|
||||||
|
//
|
||||||
|
// It would perhaps be nicer to do the WAL flush here, but it's tricky to call back into
|
||||||
|
// Postgres code to do that from here. That's why we rely on communicator_new.c to do the
|
||||||
|
// calls "pre-emptively".
|
||||||
|
//
|
||||||
|
// FIXME: Because of the above, it can still happen that the flush LSN is ahead of
|
||||||
|
// not_modified_since, if the WAL writer hasn't done the flush yet. It would be nice to know
|
||||||
|
// if there are other cases like that that we have mised, but unfortunately we cannot turn
|
||||||
|
// this into an assertion because of that legit case.
|
||||||
|
//
|
||||||
|
// See also the old logic in neon_get_request_lsns() C function
|
||||||
|
if not_modified_since_lsn > request_lsn {
|
||||||
|
tracing::info!(
|
||||||
|
"not_modified_since_lsn {} is ahead of last flushed LSN {}",
|
||||||
|
not_modified_since_lsn,
|
||||||
|
request_lsn
|
||||||
|
);
|
||||||
|
request_lsn = not_modified_since_lsn;
|
||||||
|
}
|
||||||
|
|
||||||
|
page_api::ReadLsn {
|
||||||
|
request_lsn,
|
||||||
|
not_modified_since_lsn: Some(not_modified_since_lsn),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Handle one IO request
|
||||||
|
async fn handle_request(&'static self, request: &'_ NeonIORequest) -> NeonIOResult {
|
||||||
|
self.request_counters
|
||||||
|
.inc(RequestTypeLabelGroup::from_req(request));
|
||||||
|
match request {
|
||||||
|
NeonIORequest::Empty => {
|
||||||
|
error!("unexpected Empty IO request");
|
||||||
|
NeonIOResult::Error(0)
|
||||||
|
}
|
||||||
|
NeonIORequest::RelSize(req) => {
|
||||||
|
let rel = req.reltag();
|
||||||
|
|
||||||
|
let _in_progress_guard = self
|
||||||
|
.in_progress_table
|
||||||
|
.lock(RequestInProgressKey::Rel(rel), req.request_id)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
// Check the cache first
|
||||||
|
let not_modified_since = match self.cache.get_rel_size(&rel) {
|
||||||
|
CacheResult::Found(nblocks) => {
|
||||||
|
tracing::trace!("found relsize for {:?} in cache: {}", rel, nblocks);
|
||||||
|
return NeonIOResult::RelSize(nblocks);
|
||||||
|
}
|
||||||
|
// XXX: we don't cache negative entries, so if there's no entry in the cache, it could mean
|
||||||
|
// that the relation doesn't exist or that we don't have it cached.
|
||||||
|
CacheResult::NotFound(lsn) => lsn,
|
||||||
|
};
|
||||||
|
|
||||||
|
let read_lsn = self.request_lsns(not_modified_since);
|
||||||
|
match self
|
||||||
|
.client
|
||||||
|
.get_rel_size(page_api::GetRelSizeRequest {
|
||||||
|
read_lsn,
|
||||||
|
rel,
|
||||||
|
allow_missing: req.allow_missing,
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(Some(nblocks)) => {
|
||||||
|
// update the cache
|
||||||
|
tracing::trace!(
|
||||||
|
"updated relsize for {:?} in cache: {}, lsn {}",
|
||||||
|
rel,
|
||||||
|
nblocks,
|
||||||
|
read_lsn
|
||||||
|
);
|
||||||
|
self.cache
|
||||||
|
.remember_rel_size(&rel, nblocks, not_modified_since);
|
||||||
|
|
||||||
|
NeonIOResult::RelSize(nblocks)
|
||||||
|
}
|
||||||
|
Ok(None) => {
|
||||||
|
// TODO: cache negative entry?
|
||||||
|
NeonIOResult::RelSize(INVALID_BLOCK_NUMBER)
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
// FIXME: Could we map the tonic StatusCode to a libc errno in a more fine-grained way? Or pass the error message to the backend
|
||||||
|
info!("tonic error: {err:?}");
|
||||||
|
NeonIOResult::Error(libc::EIO)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
NeonIORequest::GetPageV(req) => match self.handle_get_pagev_request(req).await {
|
||||||
|
Ok(()) => NeonIOResult::GetPageV,
|
||||||
|
Err(errno) => NeonIOResult::Error(errno),
|
||||||
|
},
|
||||||
|
NeonIORequest::GetPageVUncached(req) => {
|
||||||
|
match self.handle_get_pagev_uncached_request(req).await {
|
||||||
|
Ok(()) => NeonIOResult::GetPageV,
|
||||||
|
Err(errno) => NeonIOResult::Error(errno),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
NeonIORequest::ReadSlruSegment(req) => {
|
||||||
|
let lsn = Lsn(req.request_lsn);
|
||||||
|
let file_path = req.destination_file_path();
|
||||||
|
|
||||||
|
match self
|
||||||
|
.client
|
||||||
|
.get_slru_segment(page_api::GetSlruSegmentRequest {
|
||||||
|
read_lsn: self.request_lsns(lsn),
|
||||||
|
kind: req.slru_kind,
|
||||||
|
segno: req.segment_number,
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(slru_bytes) => {
|
||||||
|
if let Err(e) = tokio::fs::write(&file_path, &slru_bytes).await {
|
||||||
|
error!("could not write slru segment to file {file_path}: {e}");
|
||||||
|
return NeonIOResult::Error(e.raw_os_error().unwrap_or(libc::EIO));
|
||||||
|
}
|
||||||
|
|
||||||
|
let blocks_count = slru_bytes.len() / crate::BLCKSZ;
|
||||||
|
|
||||||
|
NeonIOResult::ReadSlruSegment(blocks_count as _)
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
// FIXME: Could we map the tonic StatusCode to a libc errno in a more fine-grained way? Or pass the error message to the backend
|
||||||
|
info!("tonic error: {err:?}");
|
||||||
|
NeonIOResult::Error(libc::EIO)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
NeonIORequest::PrefetchV(req) => {
|
||||||
|
self.request_nblocks_counters
|
||||||
|
.inc_by(RequestTypeLabelGroup::from_req(request), req.nblocks as i64);
|
||||||
|
let req = *req;
|
||||||
|
// FIXME: handle_request() runs in a separate task already, do we really need to spawn a new one here?
|
||||||
|
tokio::spawn(async move { self.handle_prefetchv_request(&req).await });
|
||||||
|
NeonIOResult::PrefetchVLaunched
|
||||||
|
}
|
||||||
|
NeonIORequest::DbSize(req) => {
|
||||||
|
let _in_progress_guard = self
|
||||||
|
.in_progress_table
|
||||||
|
.lock(RequestInProgressKey::Db(req.db_oid), req.request_id)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
// Check the cache first
|
||||||
|
let not_modified_since = match self.cache.get_db_size(req.db_oid) {
|
||||||
|
CacheResult::Found(db_size) => {
|
||||||
|
// get_page already copied the block content to the destination
|
||||||
|
return NeonIOResult::DbSize(db_size);
|
||||||
|
}
|
||||||
|
CacheResult::NotFound(lsn) => lsn,
|
||||||
|
};
|
||||||
|
|
||||||
|
match self
|
||||||
|
.client
|
||||||
|
.get_db_size(page_api::GetDbSizeRequest {
|
||||||
|
read_lsn: self.request_lsns(not_modified_since),
|
||||||
|
db_oid: req.db_oid,
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(db_size) => NeonIOResult::DbSize(db_size),
|
||||||
|
Err(err) => {
|
||||||
|
// FIXME: Could we map the tonic StatusCode to a libc errno in a more fine-grained way? Or pass the error message to the backend
|
||||||
|
info!("tonic error: {err:?}");
|
||||||
|
NeonIOResult::Error(libc::EIO)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write requests
|
||||||
|
NeonIORequest::WritePage(req) => {
|
||||||
|
let rel = req.reltag();
|
||||||
|
let _in_progress_guard = self
|
||||||
|
.in_progress_table
|
||||||
|
.lock(
|
||||||
|
RequestInProgressKey::Block(rel, req.block_number),
|
||||||
|
req.request_id,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
// We must at least update the last-written LSN on the page, but also store the page
|
||||||
|
// image in the LFC while we still have it
|
||||||
|
self.cache
|
||||||
|
.remember_page(&rel, req.block_number, req.src, Lsn(req.lsn), true)
|
||||||
|
.await;
|
||||||
|
NeonIOResult::WriteOK
|
||||||
|
}
|
||||||
|
NeonIORequest::RelExtend(req) => {
|
||||||
|
let rel = req.reltag();
|
||||||
|
let _in_progress_guard = self
|
||||||
|
.in_progress_table
|
||||||
|
.lock(
|
||||||
|
RequestInProgressKey::Block(rel, req.block_number),
|
||||||
|
req.request_id,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
// We must at least update the last-written LSN on the page and the relation size,
|
||||||
|
// but also store the page image in the LFC while we still have it
|
||||||
|
self.cache
|
||||||
|
.remember_page(&rel, req.block_number, req.src, Lsn(req.lsn), true)
|
||||||
|
.await;
|
||||||
|
self.cache
|
||||||
|
.remember_rel_size(&req.reltag(), req.block_number + 1, Lsn(req.lsn));
|
||||||
|
NeonIOResult::WriteOK
|
||||||
|
}
|
||||||
|
NeonIORequest::RelZeroExtend(req) => {
|
||||||
|
self.request_nblocks_counters
|
||||||
|
.inc_by(RequestTypeLabelGroup::from_req(request), req.nblocks as i64);
|
||||||
|
|
||||||
|
// TODO: need to grab an io-in-progress lock for this? I guess not
|
||||||
|
// TODO: We could put the empty pages to the cache. Maybe have
|
||||||
|
// a marker on the block entries for all-zero pages, instead of
|
||||||
|
// actually storing the empty pages.
|
||||||
|
self.cache.remember_rel_size(
|
||||||
|
&req.reltag(),
|
||||||
|
req.block_number + req.nblocks,
|
||||||
|
Lsn(req.lsn),
|
||||||
|
);
|
||||||
|
NeonIOResult::WriteOK
|
||||||
|
}
|
||||||
|
NeonIORequest::RelCreate(req) => {
|
||||||
|
// TODO: need to grab an io-in-progress lock for this? I guess not
|
||||||
|
self.cache.remember_rel_size(&req.reltag(), 0, Lsn(req.lsn));
|
||||||
|
NeonIOResult::WriteOK
|
||||||
|
}
|
||||||
|
NeonIORequest::RelTruncate(req) => {
|
||||||
|
// TODO: need to grab an io-in-progress lock for this? I guess not
|
||||||
|
self.cache
|
||||||
|
.remember_rel_size(&req.reltag(), req.nblocks, Lsn(req.lsn));
|
||||||
|
NeonIOResult::WriteOK
|
||||||
|
}
|
||||||
|
NeonIORequest::RelUnlink(req) => {
|
||||||
|
// TODO: need to grab an io-in-progress lock for this? I guess not
|
||||||
|
self.cache.forget_rel(&req.reltag(), None, Lsn(req.lsn));
|
||||||
|
NeonIOResult::WriteOK
|
||||||
|
}
|
||||||
|
NeonIORequest::UpdateCachedRelSize(req) => {
|
||||||
|
// TODO: need to grab an io-in-progress lock for this? I guess not
|
||||||
|
self.cache
|
||||||
|
.remember_rel_size(&req.reltag(), req.nblocks, Lsn(req.lsn));
|
||||||
|
NeonIOResult::WriteOK
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Subroutine to handle a GetPageV request, since it's a little more complicated than
|
||||||
|
/// others.
|
||||||
|
async fn handle_get_pagev_request(&'t self, req: &CGetPageVRequest) -> Result<(), i32> {
|
||||||
|
let rel = req.reltag();
|
||||||
|
|
||||||
|
// Check the cache first
|
||||||
|
//
|
||||||
|
// Note: Because the backends perform a direct lookup in the cache before sending
|
||||||
|
// the request to the communicator process, we expect the pages to almost never
|
||||||
|
// be already in cache. It could happen if:
|
||||||
|
// 1. two backends try to read the same page at the same time, but that should never
|
||||||
|
// happen because there's higher level locking in the Postgres buffer manager, or
|
||||||
|
// 2. a prefetch request finished at the same time as a backend requested the
|
||||||
|
// page. That's much more likely.
|
||||||
|
let mut cache_misses = Vec::with_capacity(req.nblocks as usize);
|
||||||
|
for i in 0..req.nblocks {
|
||||||
|
let blkno = req.block_number + i as u32;
|
||||||
|
|
||||||
|
// note: this is deadlock-safe even though we hold multiple locks at the same time,
|
||||||
|
// because they're always acquired in the same order.
|
||||||
|
let in_progress_guard = self
|
||||||
|
.in_progress_table
|
||||||
|
.lock(RequestInProgressKey::Block(rel, blkno), req.request_id)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let dest = req.dest[i as usize];
|
||||||
|
let not_modified_since = match self.cache.get_page(&rel, blkno, dest).await {
|
||||||
|
Ok(CacheResult::Found(_)) => {
|
||||||
|
// get_page already copied the block content to the destination
|
||||||
|
trace!("found blk {} in rel {:?} in LFC", blkno, rel);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Ok(CacheResult::NotFound(lsn)) => lsn,
|
||||||
|
Err(_io_error) => return Err(libc::EIO), // FIXME print the error?
|
||||||
|
};
|
||||||
|
cache_misses.push((blkno, not_modified_since, dest, in_progress_guard));
|
||||||
|
}
|
||||||
|
self.getpage_cache_misses_counter
|
||||||
|
.inc_by(cache_misses.len() as i64);
|
||||||
|
self.getpage_cache_hits_counter
|
||||||
|
.inc_by(req.nblocks as i64 - cache_misses.len() as i64);
|
||||||
|
|
||||||
|
if cache_misses.is_empty() {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
let not_modified_since = cache_misses
|
||||||
|
.iter()
|
||||||
|
.map(|(_blkno, lsn, _dest, _guard)| *lsn)
|
||||||
|
.max()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Construct a pageserver request for the cache misses
|
||||||
|
let block_numbers: Vec<u32> = cache_misses
|
||||||
|
.iter()
|
||||||
|
.map(|(blkno, _lsn, _dest, _guard)| *blkno)
|
||||||
|
.collect();
|
||||||
|
let read_lsn = self.request_lsns(not_modified_since);
|
||||||
|
trace!(
|
||||||
|
"sending getpage request for blocks {:?} in rel {:?} lsns {}",
|
||||||
|
block_numbers, rel, read_lsn
|
||||||
|
);
|
||||||
|
match self
|
||||||
|
.client
|
||||||
|
.get_page(page_api::GetPageRequest {
|
||||||
|
request_id: req.request_id.into(),
|
||||||
|
request_class: page_api::GetPageClass::Normal,
|
||||||
|
read_lsn,
|
||||||
|
rel,
|
||||||
|
block_numbers: block_numbers.clone(),
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(resp) => {
|
||||||
|
// Write the received page images directly to the shared memory location
|
||||||
|
// that the backend requested.
|
||||||
|
if resp.pages.len() != block_numbers.len() {
|
||||||
|
error!(
|
||||||
|
"received unexpected response with {} page images from pageserver for a request for {} pages",
|
||||||
|
resp.pages.len(),
|
||||||
|
block_numbers.len(),
|
||||||
|
);
|
||||||
|
return Err(libc::EIO);
|
||||||
|
}
|
||||||
|
|
||||||
|
trace!(
|
||||||
|
"received getpage response for blocks {:?} in rel {:?} lsns {}",
|
||||||
|
block_numbers, rel, read_lsn
|
||||||
|
);
|
||||||
|
|
||||||
|
for (page, (blkno, _lsn, dest, _guard)) in resp.pages.into_iter().zip(cache_misses)
|
||||||
|
{
|
||||||
|
let src: &[u8] = page.image.as_ref();
|
||||||
|
let len = std::cmp::min(src.len(), dest.bytes_total());
|
||||||
|
unsafe {
|
||||||
|
std::ptr::copy_nonoverlapping(src.as_ptr(), dest.as_mut_ptr(), len);
|
||||||
|
};
|
||||||
|
|
||||||
|
// Also store it in the LFC while we have it
|
||||||
|
self.cache
|
||||||
|
.remember_page(
|
||||||
|
&rel,
|
||||||
|
blkno,
|
||||||
|
page.image,
|
||||||
|
read_lsn.not_modified_since_lsn.unwrap(),
|
||||||
|
false,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
// FIXME: Could we map the tonic StatusCode to a libc errno in a more fine-grained way? Or pass the error message to the backend
|
||||||
|
info!("tonic error: {err:?}");
|
||||||
|
return Err(libc::EIO);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Subroutine to handle an GetPageVUncached request.
|
||||||
|
///
|
||||||
|
/// Note: this bypasses the cache, in-progress IO locking, and all other side-effects.
|
||||||
|
/// This request type is only used in tests.
|
||||||
|
async fn handle_get_pagev_uncached_request(
|
||||||
|
&'t self,
|
||||||
|
req: &CGetPageVUncachedRequest,
|
||||||
|
) -> Result<(), i32> {
|
||||||
|
let rel = req.reltag();
|
||||||
|
|
||||||
|
// Construct a pageserver request
|
||||||
|
let block_numbers: Vec<u32> =
|
||||||
|
(req.block_number..(req.block_number + (req.nblocks as u32))).collect();
|
||||||
|
let read_lsn = page_api::ReadLsn {
|
||||||
|
request_lsn: Lsn(req.request_lsn),
|
||||||
|
not_modified_since_lsn: Some(Lsn(req.not_modified_since)),
|
||||||
|
};
|
||||||
|
trace!(
|
||||||
|
"sending (uncached) getpage request for blocks {:?} in rel {:?} lsns {}",
|
||||||
|
block_numbers, rel, read_lsn
|
||||||
|
);
|
||||||
|
match self
|
||||||
|
.client
|
||||||
|
.get_page(page_api::GetPageRequest {
|
||||||
|
request_id: req.request_id.into(),
|
||||||
|
request_class: page_api::GetPageClass::Normal,
|
||||||
|
read_lsn,
|
||||||
|
rel,
|
||||||
|
block_numbers: block_numbers.clone(),
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(resp) => {
|
||||||
|
// Write the received page images directly to the shared memory location
|
||||||
|
// that the backend requested.
|
||||||
|
if resp.pages.len() != block_numbers.len() {
|
||||||
|
error!(
|
||||||
|
"received unexpected response with {} page images from pageserver for a request for {} pages",
|
||||||
|
resp.pages.len(),
|
||||||
|
block_numbers.len(),
|
||||||
|
);
|
||||||
|
return Err(libc::EIO);
|
||||||
|
}
|
||||||
|
|
||||||
|
trace!(
|
||||||
|
"received getpage response for blocks {:?} in rel {:?} lsns {}",
|
||||||
|
block_numbers, rel, read_lsn
|
||||||
|
);
|
||||||
|
|
||||||
|
for (page, dest) in resp.pages.into_iter().zip(req.dest) {
|
||||||
|
let src: &[u8] = page.image.as_ref();
|
||||||
|
let len = std::cmp::min(src.len(), dest.bytes_total());
|
||||||
|
unsafe {
|
||||||
|
std::ptr::copy_nonoverlapping(src.as_ptr(), dest.as_mut_ptr(), len);
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
// FIXME: Could we map the tonic StatusCode to a libc errno in a more fine-grained way? Or pass the error message to the backend
|
||||||
|
info!("tonic error: {err:?}");
|
||||||
|
return Err(libc::EIO);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Subroutine to handle a PrefetchV request, since it's a little more complicated than
|
||||||
|
/// others.
|
||||||
|
///
|
||||||
|
/// This is very similar to a GetPageV request, but the results are only stored in the cache.
|
||||||
|
async fn handle_prefetchv_request(&'static self, req: &CPrefetchVRequest) -> Result<(), i32> {
|
||||||
|
let rel = req.reltag();
|
||||||
|
|
||||||
|
// Check the cache first
|
||||||
|
let mut cache_misses = Vec::with_capacity(req.nblocks as usize);
|
||||||
|
for i in 0..req.nblocks {
|
||||||
|
let blkno = req.block_number + i as u32;
|
||||||
|
|
||||||
|
// note: this is deadlock-safe even though we hold multiple locks at the same time,
|
||||||
|
// because they're always acquired in the same order.
|
||||||
|
let in_progress_guard = self
|
||||||
|
.in_progress_table
|
||||||
|
.lock(RequestInProgressKey::Block(rel, blkno), req.request_id)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
let not_modified_since = match self.cache.page_is_cached(&rel, blkno).await {
|
||||||
|
Ok(CacheResult::Found(_)) => {
|
||||||
|
trace!("found blk {} in rel {:?} in LFC", blkno, rel);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Ok(CacheResult::NotFound(lsn)) => lsn,
|
||||||
|
Err(_io_error) => return Err(libc::EIO), // FIXME print the error?
|
||||||
|
};
|
||||||
|
cache_misses.push((blkno, not_modified_since, in_progress_guard));
|
||||||
|
}
|
||||||
|
if cache_misses.is_empty() {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
let not_modified_since = cache_misses
|
||||||
|
.iter()
|
||||||
|
.map(|(_blkno, lsn, _guard)| *lsn)
|
||||||
|
.max()
|
||||||
|
.unwrap();
|
||||||
|
let block_numbers: Vec<u32> = cache_misses
|
||||||
|
.iter()
|
||||||
|
.map(|(blkno, _lsn, _guard)| *blkno)
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// TODO: spawn separate tasks for these. Use the integrated cache to keep track of the
|
||||||
|
// in-flight requests
|
||||||
|
|
||||||
|
match self
|
||||||
|
.client
|
||||||
|
.get_page(page_api::GetPageRequest {
|
||||||
|
request_id: req.request_id.into(),
|
||||||
|
request_class: page_api::GetPageClass::Prefetch,
|
||||||
|
read_lsn: self.request_lsns(not_modified_since),
|
||||||
|
rel,
|
||||||
|
block_numbers: block_numbers.clone(),
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(resp) => {
|
||||||
|
trace!(
|
||||||
|
"prefetch completed, remembering blocks {:?} in rel {:?} in LFC",
|
||||||
|
block_numbers, rel
|
||||||
|
);
|
||||||
|
if resp.pages.len() != block_numbers.len() {
|
||||||
|
error!(
|
||||||
|
"received unexpected response with {} page images from pageserver for a request for {} pages",
|
||||||
|
resp.pages.len(),
|
||||||
|
block_numbers.len(),
|
||||||
|
);
|
||||||
|
return Err(libc::EIO);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (page, (blkno, _lsn, _guard)) in resp.pages.into_iter().zip(cache_misses) {
|
||||||
|
self.cache
|
||||||
|
.remember_page(&rel, blkno, page.image, not_modified_since, false)
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
// FIXME: Could we map the tonic StatusCode to a libc errno in a more fine-grained way? Or pass the error message to the backend
|
||||||
|
info!("tonic error: {err:?}");
|
||||||
|
return Err(libc::EIO);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T> MetricGroup<T> for CommunicatorWorkerProcessStruct<'_>
|
||||||
where
|
where
|
||||||
T: Encoding,
|
T: Encoding,
|
||||||
|
CounterState: MetricEncoding<T>,
|
||||||
GaugeState: MetricEncoding<T>,
|
GaugeState: MetricEncoding<T>,
|
||||||
{
|
{
|
||||||
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
|
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
|
||||||
self.lfc_metrics.collect_group_into(enc)
|
use measured::metric::MetricFamilyEncoding;
|
||||||
|
use measured::metric::name::MetricName;
|
||||||
|
|
||||||
|
self.lfc_metrics.collect_group_into(enc)?;
|
||||||
|
self.cache.collect_group_into(enc)?;
|
||||||
|
self.request_counters
|
||||||
|
.collect_family_into(MetricName::from_str("request_counters"), enc)?;
|
||||||
|
self.request_nblocks_counters
|
||||||
|
.collect_family_into(MetricName::from_str("request_nblocks_counters"), enc)?;
|
||||||
|
self.allocator_metrics.collect_group_into(enc)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,9 +4,9 @@
|
|||||||
//! - launch the main loop,
|
//! - launch the main loop,
|
||||||
//! - receive IO requests from backends and process them,
|
//! - receive IO requests from backends and process them,
|
||||||
//! - write results back to backends.
|
//! - write results back to backends.
|
||||||
|
|
||||||
mod callbacks;
|
mod callbacks;
|
||||||
mod control_socket;
|
mod control_socket;
|
||||||
|
mod in_progress_ios;
|
||||||
mod lfc_metrics;
|
mod lfc_metrics;
|
||||||
mod logging;
|
mod logging;
|
||||||
mod main_loop;
|
mod main_loop;
|
||||||
|
|||||||
@@ -1,14 +1,21 @@
|
|||||||
//! Functions called from the C code in the worker process
|
//! Functions called from the C code in the worker process
|
||||||
|
|
||||||
|
use std::collections::HashMap;
|
||||||
use std::ffi::{CStr, CString, c_char};
|
use std::ffi::{CStr, CString, c_char};
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use crate::init::CommunicatorInitStruct;
|
||||||
use crate::worker_process::main_loop;
|
use crate::worker_process::main_loop;
|
||||||
use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
|
use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
|
||||||
|
|
||||||
|
use pageserver_client_grpc::ShardStripeSize;
|
||||||
|
|
||||||
/// Launch the communicator's tokio tasks, which do most of the work.
|
/// Launch the communicator's tokio tasks, which do most of the work.
|
||||||
///
|
///
|
||||||
/// The caller has initialized the process as a regular PostgreSQL background worker
|
/// The caller has initialized the process as a regular PostgreSQL background worker
|
||||||
/// process.
|
/// process. The shared memory segment used to communicate with the backends has been
|
||||||
|
/// allocated and initialized earlier, at postmaster startup, in
|
||||||
|
/// rcommunicator_shmem_init().
|
||||||
///
|
///
|
||||||
/// Inputs:
|
/// Inputs:
|
||||||
/// `tenant_id` and `timeline_id` can be NULL, if we're been launched in "non-Neon" mode,
|
/// `tenant_id` and `timeline_id` can be NULL, if we're been launched in "non-Neon" mode,
|
||||||
@@ -23,27 +30,63 @@ use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
|
|||||||
/// This is called only once in the process, so the returned struct, and error message in
|
/// This is called only once in the process, so the returned struct, and error message in
|
||||||
/// case of failure, are simply leaked.
|
/// case of failure, are simply leaked.
|
||||||
#[unsafe(no_mangle)]
|
#[unsafe(no_mangle)]
|
||||||
pub extern "C" fn communicator_worker_launch(
|
pub extern "C" fn communicator_worker_process_launch(
|
||||||
|
cis: Box<CommunicatorInitStruct>,
|
||||||
tenant_id: *const c_char,
|
tenant_id: *const c_char,
|
||||||
timeline_id: *const c_char,
|
timeline_id: *const c_char,
|
||||||
|
auth_token: *const c_char,
|
||||||
|
shard_map: *mut *mut c_char,
|
||||||
|
nshards: u32,
|
||||||
|
stripe_size: u32,
|
||||||
|
file_cache_path: *const c_char,
|
||||||
|
initial_file_cache_size: u64,
|
||||||
error_p: *mut *const c_char,
|
error_p: *mut *const c_char,
|
||||||
) -> Option<&'static CommunicatorWorkerProcessStruct> {
|
) -> Option<&'static CommunicatorWorkerProcessStruct<'static>> {
|
||||||
|
tracing::warn!("starting threads in rust code");
|
||||||
// Convert the arguments into more convenient Rust types
|
// Convert the arguments into more convenient Rust types
|
||||||
let tenant_id = if tenant_id.is_null() {
|
let tenant_id = {
|
||||||
|
let cstr = unsafe { CStr::from_ptr(tenant_id) };
|
||||||
|
cstr.to_str().expect("assume UTF-8")
|
||||||
|
};
|
||||||
|
let timeline_id = {
|
||||||
|
let cstr = unsafe { CStr::from_ptr(timeline_id) };
|
||||||
|
cstr.to_str().expect("assume UTF-8")
|
||||||
|
};
|
||||||
|
let auth_token = if auth_token.is_null() {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
let cstr = unsafe { CStr::from_ptr(tenant_id) };
|
let cstr = unsafe { CStr::from_ptr(auth_token) };
|
||||||
Some(cstr.to_str().expect("assume UTF-8"))
|
Some(cstr.to_str().expect("assume UTF-8"))
|
||||||
};
|
};
|
||||||
let timeline_id = if timeline_id.is_null() {
|
let file_cache_path = {
|
||||||
None
|
if file_cache_path.is_null() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
let c_str = unsafe { CStr::from_ptr(file_cache_path) };
|
||||||
|
Some(PathBuf::from(c_str.to_str().unwrap()))
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let shard_map = shard_map_to_hash(nshards, shard_map);
|
||||||
|
// FIXME: distinguish between unsharded, and sharded with 1 shard
|
||||||
|
// Also, we might go from unsharded to sharded while the system
|
||||||
|
// is running.
|
||||||
|
let stripe_size = if stripe_size > 0 && nshards > 1 {
|
||||||
|
Some(ShardStripeSize(stripe_size))
|
||||||
} else {
|
} else {
|
||||||
let cstr = unsafe { CStr::from_ptr(timeline_id) };
|
None
|
||||||
Some(cstr.to_str().expect("assume UTF-8"))
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// The `init` function does all the work.
|
// The `init` function does all the work.
|
||||||
let result = main_loop::init(tenant_id, timeline_id);
|
let result = main_loop::init(
|
||||||
|
*cis,
|
||||||
|
tenant_id,
|
||||||
|
timeline_id,
|
||||||
|
auth_token,
|
||||||
|
shard_map,
|
||||||
|
stripe_size,
|
||||||
|
initial_file_cache_size,
|
||||||
|
file_cache_path,
|
||||||
|
);
|
||||||
|
|
||||||
// On failure, return the error message to the C caller in *error_p.
|
// On failure, return the error message to the C caller in *error_p.
|
||||||
match result {
|
match result {
|
||||||
@@ -58,3 +101,66 @@ pub extern "C" fn communicator_worker_launch(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[unsafe(no_mangle)]
|
||||||
|
pub extern "C" fn communicator_worker_process_launch_legacy(error_p: *mut *const c_char) -> bool {
|
||||||
|
// The `init` function does all the work.
|
||||||
|
let result = main_loop::init_legacy();
|
||||||
|
|
||||||
|
// On failure, return the error message to the C caller in *error_p.
|
||||||
|
match result {
|
||||||
|
Ok(()) => true,
|
||||||
|
Err(errmsg) => {
|
||||||
|
let errmsg = CString::new(errmsg).expect("no nuls within error message");
|
||||||
|
let errmsg = Box::leak(errmsg.into_boxed_c_str());
|
||||||
|
let p: *const c_char = errmsg.as_ptr();
|
||||||
|
|
||||||
|
unsafe { *error_p = p };
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convert the "shard map" from an array of C strings, indexed by shard no to a rust HashMap
|
||||||
|
fn shard_map_to_hash(
|
||||||
|
nshards: u32,
|
||||||
|
shard_map: *mut *mut c_char,
|
||||||
|
) -> HashMap<utils::shard::ShardIndex, String> {
|
||||||
|
use utils::shard::*;
|
||||||
|
|
||||||
|
assert!(nshards <= u8::MAX as u32);
|
||||||
|
|
||||||
|
let mut result: HashMap<ShardIndex, String> = HashMap::new();
|
||||||
|
let mut p = shard_map;
|
||||||
|
|
||||||
|
for i in 0..nshards {
|
||||||
|
let c_str = unsafe { CStr::from_ptr(*p) };
|
||||||
|
|
||||||
|
p = unsafe { p.add(1) };
|
||||||
|
|
||||||
|
let s = c_str.to_str().unwrap();
|
||||||
|
let k = if nshards > 1 {
|
||||||
|
ShardIndex::new(ShardNumber(i as u8), ShardCount(nshards as u8))
|
||||||
|
} else {
|
||||||
|
ShardIndex::unsharded()
|
||||||
|
};
|
||||||
|
result.insert(k, s.into());
|
||||||
|
}
|
||||||
|
result
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Inform the rust code about a configuration change
|
||||||
|
#[unsafe(no_mangle)]
|
||||||
|
pub extern "C" fn communicator_worker_config_reload(
|
||||||
|
proc_handle: &'static CommunicatorWorkerProcessStruct<'static>,
|
||||||
|
file_cache_size: u64,
|
||||||
|
shard_map: *mut *mut c_char,
|
||||||
|
nshards: u32,
|
||||||
|
stripe_size: u32,
|
||||||
|
) {
|
||||||
|
proc_handle.cache.resize_file_cache(file_cache_size as u32);
|
||||||
|
|
||||||
|
let shard_map = shard_map_to_hash(nshards, shard_map);
|
||||||
|
let stripe_size = (nshards > 1).then_some(ShardStripeSize(stripe_size));
|
||||||
|
proc_handle.update_shard_map(shard_map, stripe_size);
|
||||||
|
}
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user