optimise some future sizes

more compact code and more compact futures
remove typesafe transaction type as we already ensure rollback is performed
2026-05-22 23:50:39 +00:00 · 2025-07-23 08:58:09 +01:00 · 2025-07-23 08:58:09 +01:00 · 2025-07-23 08:58:09 +01:00 · 2025-07-23 07:04:36 +01:00 · 2025-07-23 07:04:36 +01:00
151 changed files with 3148 additions and 1200 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1097,7 +1097,7 @@ checksum = "975982cdb7ad6a142be15bdf84aea7ec6a9e5d4d797c004d43185b24cfe4e684"
 dependencies = [
 "clap",
 "heck 0.5.0",
- "indexmap 2.9.0",
+ "indexmap 2.10.0",
 "log",
 "proc-macro2",
 "quote",
@@ -1296,8 +1296,14 @@ dependencies = [
 name = "communicator"
 version = "0.1.0"
 dependencies = [
+ "axum",
 "cbindgen",
- "neon-shmem",
+ "http 1.3.1",
+ "measured",
+ "tokio",
+ "tracing",
+ "tracing-subscriber",
+ "utils",
 "workspace_hack",
 ]

@@ -1307,7 +1313,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "chrono",
- "indexmap 2.9.0",
+ "indexmap 2.10.0",
 "jsonwebtoken",
 "regex",
 "remote_storage",
@@ -1341,7 +1347,10 @@ dependencies = [
 "futures",
 "hostname-validator",
 "http 1.3.1",
- "indexmap 2.9.0",
+ "http-body-util",
+ "hyper 1.4.1",
+ "hyper-util",
+ "indexmap 2.10.0",
 "itertools 0.10.5",
 "jsonwebtoken",
 "metrics",
@@ -1363,6 +1372,7 @@ dependencies = [
 "ring",
 "rlimit",
 "rust-ini",
+ "scopeguard",
 "serde",
 "serde_json",
 "serde_with",
@@ -1373,7 +1383,7 @@ dependencies = [
 "tokio-postgres",
 "tokio-stream",
 "tokio-util",
- "tonic 0.13.1",
+ "tonic",
 "tower 0.5.2",
 "tower-http",
 "tower-otel",
@@ -1451,7 +1461,7 @@ name = "consumption_metrics"
 version = "0.1.0"
 dependencies = [
 "chrono",
- "rand 0.8.5",
+ "rand 0.9.1",
 "serde",
 ]

@@ -1854,7 +1864,7 @@ dependencies = [
 "bytes",
 "hex",
 "parking_lot 0.12.1",
- "rand 0.8.5",
+ "rand 0.9.1",
 "smallvec",
 "tracing",
 "utils",
@@ -2099,7 +2109,7 @@ dependencies = [
 "itertools 0.10.5",
 "jsonwebtoken",
 "prometheus",
- "rand 0.8.5",
+ "rand 0.9.1",
 "remote_storage",
 "serde",
 "serde_json",
@@ -2649,7 +2659,7 @@ dependencies = [
 "futures-sink",
 "futures-util",
 "http 0.2.9",
- "indexmap 2.9.0",
+ "indexmap 2.10.0",
 "slab",
 "tokio",
 "tokio-util",
@@ -2668,7 +2678,7 @@ dependencies = [
 "futures-sink",
 "futures-util",
 "http 1.3.1",
- "indexmap 2.9.0",
+ "indexmap 2.10.0",
 "slab",
 "tokio",
 "tokio-util",
@@ -2927,7 +2937,7 @@ dependencies = [
 "pprof",
 "regex",
 "routerify",
- "rustls 0.23.27",
+ "rustls 0.23.29",
 "rustls-pemfile 2.1.1",
 "serde",
 "serde_json",
@@ -3264,9 +3274,9 @@ dependencies = [

 [[package]]
 name = "indexmap"
-version = "2.9.0"
+version = "2.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e"
+checksum = "fe4cd85333e22411419a0bcae1297d25e58c9443848b11dc6a86fefe8c78a661"
 dependencies = [
 "equivalent",
 "hashbrown 0.15.2",
@@ -3292,7 +3302,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88"
 dependencies = [
 "ahash",
- "indexmap 2.9.0",
+ "indexmap 2.10.0",
 "is-terminal",
 "itoa",
 "log",
@@ -3315,7 +3325,7 @@ dependencies = [
 "crossbeam-utils",
 "dashmap 6.1.0",
 "env_logger",
- "indexmap 2.9.0",
+ "indexmap 2.10.0",
 "itoa",
 "log",
 "num-format",
@@ -3782,8 +3792,8 @@ dependencies = [
 "once_cell",
 "procfs",
 "prometheus",
- "rand 0.8.5",
- "rand_distr 0.4.3",
+ "rand 0.9.1",
+ "rand_distr",
 "twox-hash",
 ]

@@ -3875,7 +3885,7 @@ dependencies = [
 "lock_api",
 "nix 0.30.1",
 "rand 0.9.1",
- "rand_distr 0.5.1",
+ "rand_distr",
 "rustc-hash 2.1.1",
 "tempfile",
 "thiserror 1.0.69",
@@ -4152,23 +4162,23 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"

 [[package]]
 name = "opentelemetry"
-version = "0.27.1"
+version = "0.30.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ab70038c28ed37b97d8ed414b6429d343a8bbf44c9f79ec854f3a643029ba6d7"
+checksum = "aaf416e4cb72756655126f7dd7bb0af49c674f4c1b9903e80c009e0c37e552e6"
 dependencies = [
 "futures-core",
 "futures-sink",
 "js-sys",
 "pin-project-lite",
- "thiserror 1.0.69",
+ "thiserror 2.0.11",
 "tracing",
 ]

 [[package]]
 name = "opentelemetry-http"
-version = "0.27.0"
+version = "0.30.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "10a8a7f5f6ba7c1b286c2fbca0454eaba116f63bbe69ed250b642d36fbb04d80"
+checksum = "50f6639e842a97dbea8886e3439710ae463120091e2e064518ba8e716e6ac36d"
 dependencies = [
 "async-trait",
 "bytes",
@@ -4179,12 +4189,10 @@ dependencies = [

 [[package]]
 name = "opentelemetry-otlp"
-version = "0.27.0"
+version = "0.30.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "91cf61a1868dacc576bf2b2a1c3e9ab150af7272909e80085c3173384fe11f76"
+checksum = "dbee664a43e07615731afc539ca60c6d9f1a9425e25ca09c57bc36c87c55852b"
 dependencies = [
- "async-trait",
- "futures-core",
 "http 1.3.1",
 "opentelemetry",
 "opentelemetry-http",
@@ -4192,46 +4200,43 @@ dependencies = [
 "opentelemetry_sdk",
 "prost 0.13.5",
 "reqwest",
- "thiserror 1.0.69",
+ "thiserror 2.0.11",
 ]

 [[package]]
 name = "opentelemetry-proto"
-version = "0.27.0"
+version = "0.30.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6"
+checksum = "2e046fd7660710fe5a05e8748e70d9058dc15c94ba914e7c4faa7c728f0e8ddc"
 dependencies = [
 "opentelemetry",
 "opentelemetry_sdk",
 "prost 0.13.5",
- "tonic 0.12.3",
+ "tonic",
 ]

 [[package]]
 name = "opentelemetry-semantic-conventions"
-version = "0.27.0"
+version = "0.30.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc1b6902ff63b32ef6c489e8048c5e253e2e4a803ea3ea7e783914536eb15c52"
+checksum = "83d059a296a47436748557a353c5e6c5705b9470ef6c95cfc52c21a8814ddac2"

 [[package]]
 name = "opentelemetry_sdk"
-version = "0.27.1"
+version = "0.30.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "231e9d6ceef9b0b2546ddf52335785ce41252bc7474ee8ba05bfad277be13ab8"
+checksum = "11f644aa9e5e31d11896e024305d7e3c98a88884d9f8919dbf37a9991bc47a4b"
 dependencies = [
- "async-trait",
 "futures-channel",
 "futures-executor",
 "futures-util",
- "glob",
 "opentelemetry",
 "percent-encoding",
- "rand 0.8.5",
+ "rand 0.9.1",
 "serde_json",
- "thiserror 1.0.69",
+ "thiserror 2.0.11",
 "tokio",
 "tokio-stream",
- "tracing",
 ]

 [[package]]
@@ -4351,14 +4356,14 @@ dependencies = [
 "pageserver_client_grpc",
 "pageserver_page_api",
 "pprof",
- "rand 0.8.5",
+ "rand 0.9.1",
 "reqwest",
 "serde",
 "serde_json",
 "tokio",
 "tokio-stream",
 "tokio-util",
- "tonic 0.13.1",
+ "tonic",
 "tracing",
 "url",
 "utils",
@@ -4448,14 +4453,14 @@ dependencies = [
 "pprof",
 "pq_proto",
 "procfs",
- "rand 0.8.5",
+ "rand 0.9.1",
 "range-set-blaze",
 "regex",
 "remote_storage",
 "reqwest",
 "rpds",
 "rstest",
- "rustls 0.23.27",
+ "rustls 0.23.29",
 "scopeguard",
 "send-future",
 "serde",
@@ -4479,7 +4484,7 @@ dependencies = [
 "tokio-tar",
 "tokio-util",
 "toml_edit",
- "tonic 0.13.1",
+ "tonic",
 "tonic-reflection",
 "tower 0.5.2",
 "tracing",
@@ -4515,7 +4520,7 @@ dependencies = [
 "postgres_ffi_types",
 "postgres_versioninfo",
 "posthog_client_lite",
- "rand 0.8.5",
+ "rand 0.9.1",
 "remote_storage",
 "reqwest",
 "serde",
@@ -4565,7 +4570,7 @@ dependencies = [
 "tokio",
 "tokio-stream",
 "tokio-util",
- "tonic 0.13.1",
+ "tonic",
 "tracing",
 "utils",
 "workspace_hack",
@@ -4585,7 +4590,7 @@ dependencies = [
 "once_cell",
 "pageserver_api",
 "pin-project-lite",
- "rand 0.8.5",
+ "rand 0.9.1",
 "svg_fmt",
 "tokio",
 "tracing",
@@ -4610,7 +4615,7 @@ dependencies = [
 "thiserror 1.0.69",
 "tokio",
 "tokio-util",
- "tonic 0.13.1",
+ "tonic",
 "tonic-build",
 "utils",
 "workspace_hack",
@@ -4958,7 +4963,7 @@ dependencies = [
 "fallible-iterator",
 "hmac",
 "memchr",
- "rand 0.8.5",
+ "rand 0.9.1",
 "sha2",
 "stringprep",
 "tokio",
@@ -4992,7 +4997,7 @@ dependencies = [
 "bytes",
 "once_cell",
 "pq_proto",
- "rustls 0.23.27",
+ "rustls 0.23.29",
 "rustls-pemfile 2.1.1",
 "serde",
 "thiserror 1.0.69",
@@ -5150,7 +5155,7 @@ dependencies = [
 "bytes",
 "itertools 0.10.5",
 "postgres-protocol",
- "rand 0.8.5",
+ "rand 0.9.1",
 "serde",
 "thiserror 1.0.69",
 "tokio",
@@ -5391,7 +5396,7 @@ dependencies = [
 "hyper 0.14.30",
 "hyper 1.4.1",
 "hyper-util",
- "indexmap 2.9.0",
+ "indexmap 2.10.0",
 "ipnet",
 "itertools 0.10.5",
 "itoa",
@@ -5414,8 +5419,9 @@ dependencies = [
 "postgres-protocol2",
 "postgres_backend",
 "pq_proto",
- "rand 0.8.5",
- "rand_distr 0.4.3",
+ "rand 0.9.1",
+ "rand_core 0.6.4",
+ "rand_distr",
 "rcgen",
 "redis",
 "regex",
@@ -5427,7 +5433,7 @@ dependencies = [
 "rsa",
 "rstest",
 "rustc-hash 2.1.1",
- "rustls 0.23.27",
+ "rustls 0.23.29",
 "rustls-native-certs 0.8.0",
 "rustls-pemfile 2.1.1",
 "scopeguard",
@@ -5617,16 +5623,6 @@ dependencies = [
 "getrandom 0.3.3",
 ]

-[[package]]
-name = "rand_distr"
-version = "0.4.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
-dependencies = [
- "num-traits",
- "rand 0.8.5",
-]
-
 [[package]]
 name = "rand_distr"
 version = "0.5.1"
@@ -5716,7 +5712,7 @@ dependencies = [
 "num-bigint",
 "percent-encoding",
 "pin-project-lite",
- "rustls 0.23.27",
+ "rustls 0.23.29",
 "rustls-native-certs 0.8.0",
 "ryu",
 "sha1_smol",
@@ -5840,7 +5836,7 @@ dependencies = [
 "metrics",
 "once_cell",
 "pin-project-lite",
- "rand 0.8.5",
+ "rand 0.9.1",
 "reqwest",
 "scopeguard",
 "serde",
@@ -5945,9 +5941,9 @@ dependencies = [

 [[package]]
 name = "reqwest-tracing"
-version = "0.5.5"
+version = "0.5.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73e6153390585f6961341b50e5a1931d6be6dee4292283635903c26ef9d980d2"
+checksum = "d70ea85f131b2ee9874f0b160ac5976f8af75f3c9badfe0d955880257d10bd83"
 dependencies = [
 "anyhow",
 "async-trait",
@@ -6172,15 +6168,15 @@ dependencies = [

 [[package]]
 name = "rustls"
-version = "0.23.27"
+version = "0.23.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "730944ca083c1c233a75c09f199e973ca499344a2b7ba9e755c457e86fb4a321"
+checksum = "2491382039b29b9b11ff08b76ff6c97cf287671dbb74f0be44bda389fffe9bd1"
 dependencies = [
 "log",
 "once_cell",
 "ring",
 "rustls-pki-types",
- "rustls-webpki 0.103.3",
+ "rustls-webpki 0.103.4",
 "subtle",
 "zeroize",
 ]
@@ -6244,9 +6240,12 @@ dependencies = [

 [[package]]
 name = "rustls-pki-types"
-version = "1.11.0"
+version = "1.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c"
+checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79"
+dependencies = [
+ "zeroize",
+]

 [[package]]
 name = "rustls-webpki"
@@ -6271,9 +6270,9 @@ dependencies = [

 [[package]]
 name = "rustls-webpki"
-version = "0.103.3"
+version = "0.103.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435"
+checksum = "0a17884ae0c1b773f1ccd2bd4a8c72f16da897310a98b0e84bf349ad5ead92fc"
 dependencies = [
 "ring",
 "rustls-pki-types",
@@ -6330,11 +6329,11 @@ dependencies = [
 "postgres_versioninfo",
 "pprof",
 "pq_proto",
- "rand 0.8.5",
+ "rand 0.9.1",
 "regex",
 "remote_storage",
 "reqwest",
- "rustls 0.23.27",
+ "rustls 0.23.29",
 "safekeeper_api",
 "safekeeper_client",
 "scopeguard",
@@ -6524,7 +6523,7 @@ checksum = "255914a8e53822abd946e2ce8baa41d4cded6b8e938913b7f7b9da5b7ab44335"
 dependencies = [
 "httpdate",
 "reqwest",
- "rustls 0.23.27",
+ "rustls 0.23.29",
 "sentry-backtrace",
 "sentry-contexts",
 "sentry-core",
@@ -6656,7 +6655,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9d2de91cf02bbc07cde38891769ccd5d4f073d22a40683aa4bc7a95781aaa2c4"
 dependencies = [
 "form_urlencoded",
- "indexmap 2.9.0",
+ "indexmap 2.10.0",
 "itoa",
 "ryu",
 "serde",
@@ -6737,7 +6736,7 @@ dependencies = [
 "chrono",
 "hex",
 "indexmap 1.9.3",
- "indexmap 2.9.0",
+ "indexmap 2.10.0",
 "serde",
 "serde_derive",
 "serde_json",
@@ -6980,10 +6979,10 @@ dependencies = [
 "once_cell",
 "parking_lot 0.12.1",
 "prost 0.13.5",
- "rustls 0.23.27",
+ "rustls 0.23.29",
 "tokio",
 "tokio-rustls 0.26.2",
- "tonic 0.13.1",
+ "tonic",
 "tonic-build",
 "tracing",
 "utils",
@@ -7024,11 +7023,11 @@ dependencies = [
 "pageserver_client",
 "postgres_connection",
 "posthog_client_lite",
- "rand 0.8.5",
+ "rand 0.9.1",
 "regex",
 "reqwest",
 "routerify",
- "rustls 0.23.27",
+ "rustls 0.23.29",
 "rustls-native-certs 0.8.0",
 "safekeeper_api",
 "safekeeper_client",
@@ -7082,7 +7081,7 @@ dependencies = [
 "postgres_ffi",
 "remote_storage",
 "reqwest",
- "rustls 0.23.27",
+ "rustls 0.23.29",
 "rustls-native-certs 0.8.0",
 "serde",
 "serde_json",
@@ -7621,7 +7620,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab"
 dependencies = [
 "ring",
- "rustls 0.23.27",
+ "rustls 0.23.29",
 "tokio",
 "tokio-postgres",
 "tokio-rustls 0.26.2",
@@ -7672,7 +7671,7 @@ version = "0.26.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b"
 dependencies = [
- "rustls 0.23.27",
+ "rustls 0.23.29",
 "tokio",
 ]

@@ -7771,34 +7770,13 @@ version = "0.22.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38"
 dependencies = [
- "indexmap 2.9.0",
+ "indexmap 2.10.0",
 "serde",
 "serde_spanned",
 "toml_datetime",
 "winnow",
 ]

-[[package]]
-name = "tonic"
-version = "0.12.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52"
-dependencies = [
- "async-trait",
- "base64 0.22.1",
- "bytes",
- "http 1.3.1",
- "http-body 1.0.0",
- "http-body-util",
- "percent-encoding",
- "pin-project",
- "prost 0.13.5",
- "tokio-stream",
- "tower-layer",
- "tower-service",
- "tracing",
-]
-
 [[package]]
 name = "tonic"
 version = "0.13.1"
@@ -7856,7 +7834,7 @@ dependencies = [
 "prost-types 0.13.5",
 "tokio",
 "tokio-stream",
- "tonic 0.13.1",
+ "tonic",
 ]

 [[package]]
@@ -7882,7 +7860,7 @@ checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9"
 dependencies = [
 "futures-core",
 "futures-util",
- "indexmap 2.9.0",
+ "indexmap 2.10.0",
 "pin-project-lite",
 "slab",
 "sync_wrapper 1.0.1",
@@ -7920,10 +7898,14 @@ checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"

 [[package]]
 name = "tower-otel"
-version = "0.2.0"
-source = "git+https://github.com/mattiapenati/tower-otel?rev=56a7321053bcb72443888257b622ba0d43a11fcd#56a7321053bcb72443888257b622ba0d43a11fcd"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "345000ea5ae33222624a8ccfdd88892c30db4d413a39c2d4bd714b77e0a4b23c"
 dependencies = [
+ "axum",
+ "cfg-if",
 "http 1.3.1",
+ "http-body 1.0.0",
 "opentelemetry",
 "pin-project",
 "tower-layer",
@@ -8005,9 +7987,9 @@ dependencies = [

 [[package]]
 name = "tracing-opentelemetry"
-version = "0.28.0"
+version = "0.31.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97a971f6058498b5c0f1affa23e7ea202057a7301dbff68e968b2d578bcbd053"
+checksum = "ddcf5959f39507d0d04d6413119c04f33b623f4f951ebcbdddddfad2d0623a9c"
 dependencies = [
 "js-sys",
 "once_cell",
@@ -8215,7 +8197,7 @@ dependencies = [
 "base64 0.22.1",
 "log",
 "once_cell",
- "rustls 0.23.27",
+ "rustls 0.23.29",
 "rustls-pki-types",
 "url",
 "webpki-roots",
@@ -8305,7 +8287,7 @@ dependencies = [
 "postgres_connection",
 "pprof",
 "pq_proto",
- "rand 0.8.5",
+ "rand 0.9.1",
 "regex",
 "scopeguard",
 "sentry",
@@ -8887,7 +8869,7 @@ dependencies = [
 "hyper 0.14.30",
 "hyper 1.4.1",
 "hyper-util",
- "indexmap 2.9.0",
+ "indexmap 2.10.0",
 "itertools 0.12.1",
 "lazy_static",
 "libc",
@@ -8910,14 +8892,14 @@ dependencies = [
 "proc-macro2",
 "prost 0.13.5",
 "quote",
- "rand 0.8.5",
+ "rand 0.9.1",
 "regex",
 "regex-automata 0.4.9",
 "regex-syntax 0.8.5",
 "reqwest",
- "rustls 0.23.27",
+ "rustls 0.23.29",
 "rustls-pki-types",
- "rustls-webpki 0.103.3",
+ "rustls-webpki 0.103.4",
 "scopeguard",
 "sec1 0.7.3",
 "serde",
@@ -8930,6 +8912,7 @@ dependencies = [
 "subtle",
 "syn 2.0.100",
 "sync_wrapper 0.1.2",
+ "thiserror 2.0.11",
 "tikv-jemalloc-ctl",
 "tikv-jemalloc-sys",
 "time",
@@ -8939,6 +8922,7 @@ dependencies = [
 "tokio-stream",
 "tokio-util",
 "toml_edit",
+ "tonic",
 "tower 0.5.2",
 "tracing",
 "tracing-core",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -143,10 +143,10 @@ notify = "6.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.19"
 once_cell = "1.13"
-opentelemetry = "0.27"
-opentelemetry_sdk = "0.27"
-opentelemetry-otlp = { version = "0.27", default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
-opentelemetry-semantic-conventions = "0.27"
+opentelemetry = "0.30"
+opentelemetry_sdk = "0.30"
+opentelemetry-otlp = { version = "0.30", default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-semantic-conventions = "0.30"
 parking_lot = "0.12"
 parquet = { version = "53", default-features = false, features = ["zstd"] }
 parquet_derive = "53"
@@ -158,11 +158,13 @@ procfs = "0.16"
 prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.13.5"
 prost-types = "0.13.5"
-rand = "0.8"
+rand = "0.9"
+# Remove after p256 is updated to 0.14.
+rand_core = "=0.6"
 redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
-reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_27"] }
+reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_30"] }
 reqwest-middleware = "0.4"
 reqwest-retry = "0.7"
 routerify = "3"
@@ -212,15 +214,12 @@ tonic = { version = "0.13.1", default-features = false, features = ["channel", "
 tonic-reflection = { version = "0.13.1", features = ["server"] }
 tower = { version = "0.5.2", default-features = false }
 tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }
-
-# This revision uses opentelemetry 0.27. There's no tag for it.
-tower-otel = { git = "https://github.com/mattiapenati/tower-otel", rev = "56a7321053bcb72443888257b622ba0d43a11fcd" }
-
+tower-otel = { version = "0.6", features = ["axum"] }
 tower-service = "0.3.3"
 tracing = "0.1"
 tracing-error = "0.2"
 tracing-log = "0.2"
-tracing-opentelemetry = "0.28"
+tracing-opentelemetry = "0.31"
 tracing-serde = "0.2.0"
 tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 try-lock = "0.2.5"
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -27,7 +27,10 @@ fail.workspace = true
 flate2.workspace = true
 futures.workspace = true
 http.workspace = true
+http-body-util.workspace = true
 hostname-validator = "1.1"
+hyper.workspace = true
+hyper-util.workspace = true
 indexmap.workspace = true
 itertools.workspace = true
 jsonwebtoken.workspace = true
@@ -44,6 +47,7 @@ postgres.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 ring = "0.17"
+scopeguard.workspace = true
 serde.workspace = true
 serde_with.workspace = true
 serde_json.workspace = true
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -138,6 +138,12 @@ struct Cli {
    /// Run in development mode, skipping VM-specific operations like process termination
    #[arg(long, action = clap::ArgAction::SetTrue)]
    pub dev: bool,
+
+    #[arg(long)]
+    pub pg_init_timeout: Option<u64>,
+
+    #[arg(long, default_value_t = false, action = clap::ArgAction::Set)]
+    pub lakebase_mode: bool,
 }

 impl Cli {
@@ -188,7 +194,7 @@ fn main() -> Result<()> {
        .build()?;
    let _rt_guard = runtime.enter();

-    runtime.block_on(init(cli.dev))?;
+    let tracing_provider = init(cli.dev)?;

    // enable core dumping for all child processes
    setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
@@ -219,6 +225,8 @@ fn main() -> Result<()> {
            installed_extensions_collection_interval: Arc::new(AtomicU64::new(
                cli.installed_extensions_collection_interval,
            )),
+            pg_init_timeout: cli.pg_init_timeout.map(Duration::from_secs),
+            lakebase_mode: cli.lakebase_mode,
        },
        config,
    )?;
@@ -227,11 +235,11 @@ fn main() -> Result<()> {

    scenario.teardown();

-    deinit_and_exit(exit_code);
+    deinit_and_exit(tracing_provider, exit_code);
 }

-async fn init(dev_mode: bool) -> Result<()> {
-    init_tracing_and_logging(DEFAULT_LOG_LEVEL).await?;
+fn init(dev_mode: bool) -> Result<Option<tracing_utils::Provider>> {
+    let provider = init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
    thread::spawn(move || {
@@ -242,7 +250,7 @@ async fn init(dev_mode: bool) -> Result<()> {

    info!("compute build_tag: {}", &BUILD_TAG.to_string());

-    Ok(())
+    Ok(provider)
 }

 fn get_config(cli: &Cli) -> Result<ComputeConfig> {
@@ -267,25 +275,27 @@ fn get_config(cli: &Cli) -> Result<ComputeConfig> {
    }
 }

-fn deinit_and_exit(exit_code: Option<i32>) -> ! {
-    // Shutdown trace pipeline gracefully, so that it has a chance to send any
-    // pending traces before we exit. Shutting down OTEL tracing provider may
-    // hang for quite some time, see, for example:
-    // - https://github.com/open-telemetry/opentelemetry-rust/issues/868
-    // - and our problems with staging https://github.com/neondatabase/cloud/issues/3707#issuecomment-1493983636
-    //
-    // Yet, we want computes to shut down fast enough, as we may need a new one
-    // for the same timeline ASAP. So wait no longer than 2s for the shutdown to
-    // complete, then just error out and exit the main thread.
-    info!("shutting down tracing");
-    let (sender, receiver) = mpsc::channel();
-    let _ = thread::spawn(move || {
-        tracing_utils::shutdown_tracing();
-        sender.send(()).ok()
-    });
-    let shutdown_res = receiver.recv_timeout(Duration::from_millis(2000));
-    if shutdown_res.is_err() {
-        error!("timed out while shutting down tracing, exiting anyway");
+fn deinit_and_exit(tracing_provider: Option<tracing_utils::Provider>, exit_code: Option<i32>) -> ! {
+    if let Some(p) = tracing_provider {
+        // Shutdown trace pipeline gracefully, so that it has a chance to send any
+        // pending traces before we exit. Shutting down OTEL tracing provider may
+        // hang for quite some time, see, for example:
+        // - https://github.com/open-telemetry/opentelemetry-rust/issues/868
+        // - and our problems with staging https://github.com/neondatabase/cloud/issues/3707#issuecomment-1493983636
+        //
+        // Yet, we want computes to shut down fast enough, as we may need a new one
+        // for the same timeline ASAP. So wait no longer than 2s for the shutdown to
+        // complete, then just error out and exit the main thread.
+        info!("shutting down tracing");
+        let (sender, receiver) = mpsc::channel();
+        let _ = thread::spawn(move || {
+            _ = p.shutdown();
+            sender.send(()).ok()
+        });
+        let shutdown_res = receiver.recv_timeout(Duration::from_millis(2000));
+        if shutdown_res.is_err() {
+            error!("timed out while shutting down tracing, exiting anyway");
+        }
    }

    info!("shutting down");
--- a/compute_tools/src/communicator_socket_client.rs
+++ b/compute_tools/src/communicator_socket_client.rs
@@ -0,0 +1,98 @@
+//! Client for making requests to a running Postgres server's communicator control socket.
+//!
+//! The storage communicator process that runs inside Postgres exposes an HTTP endpoint in
+//! a Unix Domain Socket in the Postgres data directory. This provides access to it.
+
+use std::path::Path;
+
+use anyhow::Context;
+use hyper::client::conn::http1::SendRequest;
+use hyper_util::rt::TokioIo;
+
+/// Name of the socket within the Postgres data directory. This must match the name used in
+/// `pgxn/neon/communicator/src/lib.rs`.
+const NEON_COMMUNICATOR_SOCKET_NAME: &str = "neon-communicator.socket";
+
+/// Open a connection to the communicator's control socket in `pgdata` and return a hyper
+/// HTTP/1 request sender for it. Fails if the symlink, connect, or handshake step fails.
+pub async fn connect_communicator_socket<B>(pgdata: &Path) -> anyhow::Result<SendRequest<B>>
+where
+    B: hyper::body::Body + 'static + Send,
+    B::Data: Send,
+    B::Error: Into<Box<dyn std::error::Error + Send + Sync>>,
+{
+    let socket_path = pgdata.join(NEON_COMMUNICATOR_SOCKET_NAME);
+    // Measure the path in bytes as the OS sees it (no lossy UTF-8 conversion or allocation).
+    let socket_path_len = socket_path.as_os_str().len();
+
+    // There is a limit of around 100 bytes (108 on Linux?) on the length of the path to a
+    // Unix Domain socket. The limit is on the connect(2) function used to open the
+    // socket, not on the absolute path itself. Postgres changes the current directory to
+    // the data directory and uses a relative path to bind to the socket, and the relative
+    // path "./neon-communicator.socket" is always short, but when compute_ctl needs to
+    // open the socket, we need to use a full path, which can be arbitrarily long.
+    //
+    // There are a few ways we could work around this:
+    //
+    // 1. Change the current directory to the Postgres data directory and use a relative
+    //    path in the connect(2) call. That's problematic because the current directory
+    //    applies to the whole process. We could change the current directory early in
+    //    compute_ctl startup, and that might be a good idea anyway for other reasons too:
+    //    it would be more robust if the data directory is moved around or unlinked for
+    //    some reason, and you would be less likely to accidentally litter other parts of
+    //    the filesystem with e.g. temporary files. However, that's a pretty invasive
+    //    change.
+    //
+    // 2. On Linux, you could open() the data directory, and refer to the socket inside
+    //    it as "/proc/self/fd/<fd>/neon-communicator.socket". But that's Linux-only.
+    //
+    // 3. Create a symbolic link to the socket with a shorter path, and use that.
+    //
+    // We use the symbolic link approach here. Hopefully the paths we use in production
+    // are shorter, so that we can open the socket directly, so that this hack is needed
+    // only in development.
+    let connect_result = if socket_path_len < 100 {
+        // We can open the path directly with no hacks.
+        tokio::net::UnixStream::connect(socket_path).await
+    } else {
+        // The path to the socket is too long: symlink it under a shorter name. `try_id()`,
+        // unlike `id()`, does not panic when this future runs outside of a tokio task.
+        let task = tokio::task::try_id().map_or_else(|| "none".to_owned(), |id| id.to_string());
+        let pid = std::process::id();
+        let link_name = format!("compute_ctl.short-socket.{pid}.{task}");
+        let short_path = std::env::temp_dir().join(link_name);
+        std::os::unix::fs::symlink(&socket_path, &short_path)?;
+
+        // Delete the symlink as soon as we have connected to it. There's a small chance
+        // of leaking if the process dies before we remove it, so try to keep that window
+        // as small as possible.
+        scopeguard::defer! {
+            if let Err(err) = std::fs::remove_file(&short_path) {
+                tracing::warn!("could not remove symlink \"{}\" created for socket: {}",
+                               short_path.display(), err);
+            }
+        }
+
+        tracing::info!(
+            "created symlink \"{}\" for socket \"{}\", opening it now",
+            short_path.display(),
+            socket_path.display()
+        );
+
+        tokio::net::UnixStream::connect(&short_path).await
+    };
+
+    let stream = connect_result.context("connecting to communicator control socket")?;
+
+    let io = TokioIo::new(stream);
+    let (request_sender, connection) = hyper::client::conn::http1::handshake(io).await?;
+
+    // spawn a task to poll the connection and drive the HTTP state
+    tokio::spawn(async move {
+        if let Err(err) = connection.await {
+            tracing::error!("Error in connection: {err}");
+        }
+    });
+
+    Ok(request_sender)
+}
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -113,6 +113,11 @@ pub struct ComputeNodeParams {

    /// Interval for installed extensions collection
    pub installed_extensions_collection_interval: Arc<AtomicU64>,
+
+    /// Timeout of PG compute startup in the Init state.
+    pub pg_init_timeout: Option<Duration>,
+
+    pub lakebase_mode: bool,
 }

 type TaskHandle = Mutex<Option<JoinHandle<()>>>;
@@ -154,6 +159,7 @@ pub struct RemoteExtensionMetrics {
 #[derive(Clone, Debug)]
 pub struct ComputeState {
    pub start_time: DateTime<Utc>,
+    pub pg_start_time: Option<DateTime<Utc>>,
    pub status: ComputeStatus,
    /// Timestamp of the last Postgres activity. It could be `None` if
    /// compute wasn't used since start.
@@ -191,6 +197,7 @@ impl ComputeState {
    pub fn new() -> Self {
        Self {
            start_time: Utc::now(),
+            pg_start_time: None,
            status: ComputeStatus::Empty,
            last_active: None,
            error: None,
@@ -648,6 +655,9 @@ impl ComputeNode {
            };
            _this_entered = start_compute_span.enter();

+            // Hadron: Record postgres start time (used to enforce pg_init_timeout).
+            state_guard.pg_start_time.replace(Utc::now());
+
            state_guard.set_status(ComputeStatus::Init, &self.state_changed);
            compute_state = state_guard.clone()
        }
@@ -1441,7 +1451,7 @@ impl ComputeNode {
        })?;

        // Update pg_hba.conf received with basebackup.
-        update_pg_hba(pgdata_path)?;
+        update_pg_hba(pgdata_path, None)?;

        // Place pg_dynshmem under /dev/shm. This allows us to use
        // 'dynamic_shared_memory_type = mmap' so that the files are placed in
@@ -1746,6 +1756,7 @@ impl ComputeNode {
        }

        // Run migrations separately to not hold up cold starts
+        let lakebase_mode = self.params.lakebase_mode;
        let params = self.params.clone();
        tokio::spawn(async move {
            let mut conf = conf.as_ref().clone();
@@ -1758,7 +1769,7 @@ impl ComputeNode {
                            eprintln!("connection error: {e}");
                        }
                    });
-                    if let Err(e) = handle_migrations(params, &mut client).await {
+                    if let Err(e) = handle_migrations(params, &mut client, lakebase_mode).await {
                        error!("Failed to run migrations: {}", e);
                    }
                }
--- a/compute_tools/src/hadron_metrics.rs
+++ b/compute_tools/src/hadron_metrics.rs
@@ -0,0 +1,60 @@
+use metrics::{
+    IntCounter, IntGaugeVec, core::Collector, proto::MetricFamily, register_int_counter,
+    register_int_gauge_vec,
+};
+use once_cell::sync::Lazy;
+
+// Counter keeping track of the number of PageStream request errors reported by Postgres.
+// An error is registered every time Postgres calls compute_ctl's /refresh_configuration API.
+// Postgres will invoke this API if it detected trouble with PageStream requests (get_page@lsn,
+// get_base_backup, etc.) it sends to any pageserver. An increase in this counter value typically
+// indicates Postgres downtime, as PageStream requests are critical for Postgres to function.
+pub static POSTGRES_PAGESTREAM_REQUEST_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pg_cctl_pagestream_request_errors_total",
+        "Number of PageStream request errors reported by the postgres process"
+    )
+    .expect("failed to define a metric")
+});
+
+// Counter keeping track of the number of compute configuration errors due to Postgres statement
+// timeouts. An error is registered every time `ComputeNode::reconfigure()` fails due to Postgres
+// error code 57014 (query cancelled). This statement timeout typically occurs when postgres is
+// stuck in a problematic retry loop when the PS is rejecting its connection requests (usually due
+// to PG pointing at the wrong PS). We should investigate the root cause when this counter value
+// increases by checking PG and PS logs.
+pub static COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pg_cctl_configure_statement_timeout_errors_total",
+        "Number of compute configuration errors due to Postgres statement timeouts."
+    )
+    .expect("failed to define a metric")
+});
+
+pub static COMPUTE_ATTACHED: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pg_cctl_attached",
+        "Compute node attached status (1 if attached)",
+        &[
+            "pg_compute_id",
+            "pg_instance_id",
+            "tenant_id",
+            "timeline_id"
+        ]
+    )
+    .expect("failed to define a metric")
+});
+
+pub fn collect() -> Vec<MetricFamily> {
+    let mut metrics = Vec::new();
+    metrics.extend(POSTGRES_PAGESTREAM_REQUEST_ERRORS.collect());
+    metrics.extend(COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS.collect());
+    metrics.extend(COMPUTE_ATTACHED.collect());
+    metrics
+}
+
+pub fn initialize_metrics() {
+    Lazy::force(&POSTGRES_PAGESTREAM_REQUEST_ERRORS);
+    Lazy::force(&COMPUTE_CONFIGURE_STATEMENT_TIMEOUT_ERRORS);
+    Lazy::force(&COMPUTE_ATTACHED);
+}
--- a/compute_tools/src/http/routes/metrics.rs
+++ b/compute_tools/src/http/routes/metrics.rs
@@ -1,10 +1,18 @@
+use std::path::Path;
+use std::sync::Arc;
+
+use anyhow::Context;
 use axum::body::Body;
+use axum::extract::State;
 use axum::response::Response;
-use http::StatusCode;
 use http::header::CONTENT_TYPE;
+use http_body_util::BodyExt;
+use hyper::{Request, StatusCode};
 use metrics::proto::MetricFamily;
 use metrics::{Encoder, TextEncoder};

+use crate::communicator_socket_client::connect_communicator_socket;
+use crate::compute::ComputeNode;
 use crate::http::JsonResponse;
 use crate::metrics::collect;

@@ -31,3 +39,42 @@ pub(in crate::http) async fn get_metrics() -> Response {
        .body(Body::from(buffer))
        .unwrap()
 }
+
+/// Fetch and forward metrics from the Postgres neon extension's metrics
+/// exporter that are used by autoscaling-agent.
+///
+/// The neon extension exposes these metrics over a Unix domain socket
+/// in the data directory. That's not accessible directly from the outside
+/// world, so we have this endpoint in compute_ctl to expose it
+pub(in crate::http) async fn get_autoscaling_metrics(
+    State(compute): State<Arc<ComputeNode>>,
+) -> Result<Response, Response> {
+    let pgdata = Path::new(&compute.params.pgdata);
+
+    // Connect to the communicator process's metrics socket
+    let mut metrics_client = connect_communicator_socket(pgdata)
+        .await
+        .map_err(|e| JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, format!("{e:#}")))?;
+
+    // Make a request for /autoscaling_metrics
+    let request = Request::builder()
+        .method("GET")
+        .uri("/autoscaling_metrics")
+        .header("Host", "localhost") // hyper requires Host, even though the server won't care
+        .body(Body::from(""))
+        .unwrap();
+    let resp = metrics_client
+        .send_request(request)
+        .await
+        .context("fetching metrics from Postgres metrics service")
+        .map_err(|e| JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, format!("{e:#}")))?;
+
+    // Build a response that just forwards the response we got.
+    let mut response = Response::builder();
+    response = response.status(resp.status());
+    if let Some(content_type) = resp.headers().get(CONTENT_TYPE) {
+        response = response.header(CONTENT_TYPE, content_type);
+    }
+    let body = tonic::service::AxumBody::from_stream(resp.into_body().into_data_stream());
+    Ok(response.body(body).unwrap())
+}
--- a/compute_tools/src/http/server.rs
+++ b/compute_tools/src/http/server.rs
@@ -81,8 +81,12 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
            Server::External {
                config, compute_id, ..
            } => {
-                let unauthenticated_router =
-                    Router::<Arc<ComputeNode>>::new().route("/metrics", get(metrics::get_metrics));
+                let unauthenticated_router = Router::<Arc<ComputeNode>>::new()
+                    .route("/metrics", get(metrics::get_metrics))
+                    .route(
+                        "/autoscaling_metrics",
+                        get(metrics::get_autoscaling_metrics),
+                    );

                let authenticated_router = Router::<Arc<ComputeNode>>::new()
                    .route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm))
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -4,6 +4,7 @@
 #![deny(clippy::undocumented_unsafe_blocks)]

 pub mod checker;
+pub mod communicator_socket_client;
 pub mod config;
 pub mod configurator;
 pub mod http;
@@ -15,6 +16,7 @@ pub mod compute_prewarm;
 pub mod compute_promote;
 pub mod disk_quota;
 pub mod extension_server;
+pub mod hadron_metrics;
 pub mod installed_extensions;
 pub mod local_proxy;
 pub mod lsn_lease;
--- a/compute_tools/src/logger.rs
+++ b/compute_tools/src/logger.rs
@@ -13,7 +13,9 @@ use tracing_subscriber::prelude::*;
 /// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See
 /// `tracing-utils` package description.
 ///
-pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
+pub fn init_tracing_and_logging(
+    default_log_level: &str,
+) -> anyhow::Result<Option<tracing_utils::Provider>> {
    // Initialize Logging
    let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
        .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level));
@@ -24,8 +26,9 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result
        .with_writer(std::io::stderr);

    // Initialize OpenTelemetry
-    let otlp_layer =
-        tracing_utils::init_tracing("compute_ctl", tracing_utils::ExportConfig::default()).await;
+    let provider =
+        tracing_utils::init_tracing("compute_ctl", tracing_utils::ExportConfig::default());
+    let otlp_layer = provider.as_ref().map(tracing_utils::layer);

    // Put it all together
    tracing_subscriber::registry()
@@ -37,7 +40,7 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result

    utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();

-    Ok(())
+    Ok(provider)
 }

 /// Replace all newline characters with a special character to make it
--- a/compute_tools/src/migration.rs
+++ b/compute_tools/src/migration.rs
@@ -9,15 +9,20 @@ use crate::metrics::DB_MIGRATION_FAILED;
 pub(crate) struct MigrationRunner<'m> {
    client: &'m mut Client,
    migrations: &'m [&'m str],
+    lakebase_mode: bool,
 }

 impl<'m> MigrationRunner<'m> {
    /// Create a new migration runner
-    pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self {
+    pub fn new(client: &'m mut Client, migrations: &'m [&'m str], lakebase_mode: bool) -> Self {
        // The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64
        assert!(migrations.len() + 1 < i64::MAX as usize);

-        Self { client, migrations }
+        Self {
+            client,
+            migrations,
+            lakebase_mode,
+        }
    }

    /// Get the current value neon_migration.migration_id
@@ -130,8 +135,13 @@ impl<'m> MigrationRunner<'m> {
            // ID is also the next index
            let migration_id = (current_migration + 1) as i64;
            let migration = self.migrations[current_migration];
+            let migration = if self.lakebase_mode {
+                migration.replace("neon_superuser", "databricks_superuser")
+            } else {
+                migration.to_string()
+            };

-            match Self::run_migration(self.client, migration_id, migration).await {
+            match Self::run_migration(self.client, migration_id, &migration).await {
                Ok(_) => {
                    info!("Finished migration id={}", migration_id);
                }
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -11,6 +11,7 @@ use tracing::{Level, error, info, instrument, span};
 use crate::compute::ComputeNode;
 use crate::metrics::{PG_CURR_DOWNTIME_MS, PG_TOTAL_DOWNTIME_MS};

+const PG_DEFAULT_INIT_TIMEOUIT: Duration = Duration::from_secs(60);
 const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);

 /// Struct to store runtime state of the compute monitor thread.
@@ -352,13 +353,47 @@ impl ComputeMonitor {
 // Hang on condition variable waiting until the compute status is `Running`.
 fn wait_for_postgres_start(compute: &ComputeNode) {
    let mut state = compute.state.lock().unwrap();
+    let pg_init_timeout = compute
+        .params
+        .pg_init_timeout
+        .unwrap_or(PG_DEFAULT_INIT_TIMEOUIT);
+
    while state.status != ComputeStatus::Running {
        info!("compute is not running, waiting before monitoring activity");
-        state = compute.state_changed.wait(state).unwrap();
+        if !compute.params.lakebase_mode {
+            state = compute.state_changed.wait(state).unwrap();

-        if state.status == ComputeStatus::Running {
-            break;
+            if state.status == ComputeStatus::Running {
+                break;
+            }
+            continue;
        }
+
+        if state.pg_start_time.is_some()
+            && Utc::now()
+                .signed_duration_since(state.pg_start_time.unwrap())
+                .to_std()
+                .unwrap_or_default()
+                > pg_init_timeout
+        {
+            // If Postgres isn't up and running with working PS/SK connections within `pg_init_timeout`, it is
+            // possible that we started Postgres with a wrong spec (so it is talking to the wrong PS/SK nodes). To prevent
+            // dead ends we simply exit the compute node process so it can restart with the latest spec.
+            //
+            // NB: We skip this check if we have not attempted to start PG yet (indicated by state.pg_start_time == None).
+            // This is to make sure the more appropriate errors are surfaced if we encounter issues before we even attempt
+            // to start PG (e.g., if we can't pull the spec, can't sync safekeepers, or can't get the basebackup).
+            error!(
+                "compute did not enter Running state in {} seconds, exiting",
+                pg_init_timeout.as_secs()
+            );
+            std::process::exit(1);
+        }
+        state = compute
+            .state_changed
+            .wait_timeout(state, Duration::from_secs(5))
+            .unwrap()
+            .0;
    }
 }

--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -11,7 +11,9 @@ use std::time::{Duration, Instant};

 use anyhow::{Result, bail};
 use compute_api::responses::TlsConfig;
-use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
+use compute_api::spec::{
+    Database, DatabricksSettings, GenericOption, GenericOptions, PgIdent, Role,
+};
 use futures::StreamExt;
 use indexmap::IndexMap;
 use ini::Ini;
@@ -184,6 +186,42 @@ impl DatabaseExt for Database {
    }
 }

+pub trait DatabricksSettingsExt {
+    fn as_pg_settings(&self) -> String;
+}
+
+impl DatabricksSettingsExt for DatabricksSettings {
+    fn as_pg_settings(&self) -> String {
+        // Postgres GUCs rendered from DatabricksSettings
+        vec![
+            // ssl_ca_file
+            Some(format!(
+                "ssl_ca_file = '{}'",
+                self.pg_compute_tls_settings.ca_file
+            )),
+            // databricks.workspace_url (always emitted; empty when the workspace host is not known)
+            Some(format!(
+                "databricks.workspace_url = '{}'",
+                &self.databricks_workspace_host
+            )),
+            // todo(vikas.jain): these are not required anymore as they are moved to static
+            // conf but keeping these to avoid image mismatch between hcc and pg.
+            // Once hcc and pg are in sync, we can remove these.
+            //
+            // databricks.enable_databricks_identity_login
+            Some("databricks.enable_databricks_identity_login = true".to_string()),
+            // databricks.enable_sql_restrictions
+            Some("databricks.enable_sql_restrictions = true".to_string()),
+        ]
+        .into_iter()
+        // Removes `None`s
+        .flatten()
+        .collect::<Vec<String>>()
+        .join("\n")
+            + "\n"
+    }
+}
+
 /// Generic trait used to provide quoting / encoding for strings used in the
 /// Postgres SQL queries and DATABASE_URL.
 pub trait Escaping {
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -1,4 +1,6 @@
 use std::fs::File;
+use std::fs::{self, Permissions};
+use std::os::unix::fs::PermissionsExt;
 use std::path::Path;

 use anyhow::{Result, anyhow, bail};
@@ -133,10 +135,25 @@ pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result
 }

 /// Check `pg_hba.conf` and update if needed to allow external connections.
-pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
+pub fn update_pg_hba(pgdata_path: &Path, databricks_pg_hba: Option<&String>) -> Result<()> {
    // XXX: consider making it a part of config.json
    let pghba_path = pgdata_path.join("pg_hba.conf");

+    // Update pg_hba to contain databricks specific settings before adding neon settings.
+    // PG uses the first record that matches to perform authentication, so we need to have
+    // our rules before the default ones from neon.
+    // See https://www.postgresql.org/docs/16/auth-pg-hba-conf.html
+    if let Some(databricks_pg_hba) = databricks_pg_hba {
+        if config::line_in_file(
+            &pghba_path,
+            &format!("include_if_exists {}\n", *databricks_pg_hba),
+        )? {
+            info!("updated pg_hba.conf to include databricks_pg_hba.conf");
+        } else {
+            info!("pg_hba.conf already included databricks_pg_hba.conf");
+        }
+    }
+
    if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? {
        info!("updated pg_hba.conf to allow external connections");
    } else {
@@ -146,6 +163,59 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
    Ok(())
 }

+/// Check `pg_ident.conf` and, when a databricks ident file is given, make sure it is included.
+pub fn update_pg_ident(pgdata_path: &Path, databricks_pg_ident: Option<&String>) -> Result<()> {
+    info!("checking pg_ident.conf");
+    let pg_ident_path = pgdata_path.join("pg_ident.conf");
+
+    // Update pg_ident to contain databricks specific settings; nothing to do when `None`.
+    if let Some(databricks_pg_ident) = databricks_pg_ident {
+        if config::line_in_file(
+            &pg_ident_path,
+            &format!("include_if_exists {}\n", *databricks_pg_ident),
+        )? {
+            info!("updated pg_ident.conf to include databricks_pg_ident.conf");
+        } else {
+            info!("pg_ident.conf already included databricks_pg_ident.conf");
+        }
+    }
+
+    Ok(())
+}
+
+/// Copy the tls key_file and cert_file from the k8s secret mount directory into pgdata and
+/// restrict the private key to owner-only access (mode 0600), as Postgres requires; see
+/// <https://www.postgresql.org/docs/current/ssl-tcp.html>. The explicit copy is needed because
+/// the k8s secrets mount on dblet does not honor the permission and ownership specified in the
+/// Volume or VolumeMount. Returns an error if either path has no file name component.
+pub fn copy_tls_certificates(
+    key_file: &String,
+    cert_file: &String,
+    pgdata_path: &Path,
+) -> Result<()> {
+    for file in [cert_file, key_file] {
+        let source = Path::new(file);
+        let Some(file_name) = source.file_name() else {
+            bail!("tls file path {file:?} has no file name component");
+        };
+        let dest = pgdata_path.join(file_name);
+        if !dest.exists() {
+            std::fs::copy(source, &dest)?;
+            info!(
+                "Copying tls file: {} to {}",
+                &source.display(),
+                &dest.display()
+            );
+        }
+        if file == key_file {
+            // Postgres requires the private key to be readable only by its owner (chmod 600).
+            fs::set_permissions(&dest, Permissions::from_mode(0o600))?;
+            info!("Setting permission on {}.", &dest.display());
+        }
+    }
+    Ok(())
+}
+
 /// Create a standby.signal file
 pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
    // XXX: consider making it a part of config.json
@@ -170,7 +240,11 @@ pub async fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
 }

 #[instrument(skip_all)]
-pub async fn handle_migrations(params: ComputeNodeParams, client: &mut Client) -> Result<()> {
+pub async fn handle_migrations(
+    params: ComputeNodeParams,
+    client: &mut Client,
+    lakebase_mode: bool,
+) -> Result<()> {
    info!("handle migrations");

    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@@ -234,7 +308,7 @@ pub async fn handle_migrations(params: ComputeNodeParams, client: &mut Client) -
        ),
    ];

-    MigrationRunner::new(client, &migrations)
+    MigrationRunner::new(client, &migrations, lakebase_mode)
        .run_migrations()
        .await?;

--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -407,6 +407,12 @@ struct StorageControllerStartCmdArgs {
        help = "Base port for the storage controller instance idenfified by instance-id (defaults to pageserver cplane api)"
    )]
    base_port: Option<u16>,
+
+    #[clap(
+        long,
+        help = "Whether the storage controller should handle pageserver-reported local disk loss events."
+    )]
+    handle_ps_local_disk_loss: Option<bool>,
 }

 #[derive(clap::Args)]
@@ -1809,6 +1815,7 @@ async fn handle_storage_controller(
                instance_id: args.instance_id,
                base_port: args.base_port,
                start_timeout: args.start_timeout,
+                handle_ps_local_disk_loss: args.handle_ps_local_disk_loss,
            };

            if let Err(e) = svc.start(start_args).await {
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -56,6 +56,7 @@ pub struct NeonStorageControllerStartArgs {
    pub instance_id: u8,
    pub base_port: Option<u16>,
    pub start_timeout: humantime::Duration,
+    pub handle_ps_local_disk_loss: Option<bool>,
 }

 impl NeonStorageControllerStartArgs {
@@ -64,6 +65,7 @@ impl NeonStorageControllerStartArgs {
            instance_id: 1,
            base_port: None,
            start_timeout,
+            handle_ps_local_disk_loss: None,
        }
    }
 }
@@ -669,6 +671,10 @@ impl StorageController {

        println!("Starting storage controller at {scheme}://{host}:{listen_port}");

+        if start_args.handle_ps_local_disk_loss.unwrap_or_default() {
+            args.push("--handle-ps-local-disk-loss".to_string());
+        }
+
        background_process::start_process(
            COMMAND,
            &instance_dir,
--- a/endpoint_storage/src/app.rs
+++ b/endpoint_storage/src/app.rs
@@ -233,7 +233,7 @@ mod tests {
                .unwrap()
                .as_millis();
            use rand::Rng;
-            let random = rand::thread_rng().r#gen::<u32>();
+            let random = rand::rng().random::<u32>();

            let s3_config = remote_storage::S3Config {
                bucket_name: var(REAL_S3_BUCKET).unwrap(),
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -416,6 +416,32 @@ pub struct GenericOption {
    pub vartype: String,
 }

+/// Postgres compute TLS settings.
+#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)]
+pub struct PgComputeTlsSettings {
+    // Absolute path to the certificate file for server-side TLS.
+    pub cert_file: String,
+    // Absolute path to the private key file for server-side TLS.
+    pub key_file: String,
+    // Absolute path to the certificate authority file for verifying client certificates.
+    pub ca_file: String,
+}
+
+/// Databricks specific options for compute instance.
+/// This is used to store any other settings that need to be propagated to Compute
+/// but should not be persisted to ComputeSpec in the database.
+#[derive(Clone, Debug, Deserialize, Serialize, PartialEq)]
+pub struct DatabricksSettings {
+    pub pg_compute_tls_settings: PgComputeTlsSettings,
+    // Absolute file path to databricks_pg_hba.conf file.
+    pub databricks_pg_hba: String,
+    // Absolute file path to databricks_pg_ident.conf file.
+    pub databricks_pg_ident: String,
+    // Hostname portion of the Databricks workspace URL of the endpoint, or empty string if not known.
+    // A valid hostname is required for the compute instance to support PAT logins.
+    pub databricks_workspace_host: String,
+}
+
 /// Optional collection of `GenericOption`'s. Type alias allows us to
 /// declare a `trait` on it.
 pub type GenericOptions = Option<Vec<GenericOption>>;
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -90,7 +90,7 @@ impl<'a> IdempotencyKey<'a> {
        IdempotencyKey {
            now: Utc::now(),
            node_id,
-            nonce: rand::thread_rng().gen_range(0..=9999),
+            nonce: rand::rng().random_range(0..=9999),
        }
    }

--- a/libs/desim/src/node_os.rs
+++ b/libs/desim/src/node_os.rs
@@ -41,7 +41,7 @@ impl NodeOs {

    /// Generate a random number in range [0, max).
    pub fn random(&self, max: u64) -> u64 {
-        self.internal.rng.lock().gen_range(0..max)
+        self.internal.rng.lock().random_range(0..max)
    }

    /// Append a new event to the world event log.
--- a/libs/desim/src/options.rs
+++ b/libs/desim/src/options.rs
@@ -32,10 +32,10 @@ impl Delay {
    /// Generate a random delay in range [min, max]. Return None if the
    /// message should be dropped.
    pub fn delay(&self, rng: &mut StdRng) -> Option<u64> {
-        if rng.gen_bool(self.fail_prob) {
+        if rng.random_bool(self.fail_prob) {
            return None;
        }
-        Some(rng.gen_range(self.min..=self.max))
+        Some(rng.random_range(self.min..=self.max))
    }
 }

--- a/libs/desim/src/world.rs
+++ b/libs/desim/src/world.rs
@@ -69,7 +69,7 @@ impl World {
    /// Create a new random number generator.
    pub fn new_rng(&self) -> StdRng {
        let mut rng = self.rng.lock();
-        StdRng::from_rng(rng.deref_mut()).unwrap()
+        StdRng::from_rng(rng.deref_mut())
    }

    /// Create a new node.
--- a/libs/metrics/Cargo.toml
+++ b/libs/metrics/Cargo.toml
@@ -17,5 +17,5 @@ procfs.workspace = true
 measured-process.workspace = true

 [dev-dependencies]
-rand = "0.8"
-rand_distr = "0.4.3"
+rand.workspace = true
+rand_distr = "0.5"
--- a/libs/metrics/src/hll.rs
+++ b/libs/metrics/src/hll.rs
@@ -260,7 +260,7 @@ mod tests {

    #[test]
    fn test_cardinality_small() {
-        let (actual, estimate) = test_cardinality(100, Zipf::new(100, 1.2f64).unwrap());
+        let (actual, estimate) = test_cardinality(100, Zipf::new(100.0, 1.2f64).unwrap());

        assert_eq!(actual, [46, 30, 32]);
        assert!(51.3 < estimate[0] && estimate[0] < 51.4);
@@ -270,7 +270,7 @@ mod tests {

    #[test]
    fn test_cardinality_medium() {
-        let (actual, estimate) = test_cardinality(10000, Zipf::new(10000, 1.2f64).unwrap());
+        let (actual, estimate) = test_cardinality(10000, Zipf::new(10000.0, 1.2f64).unwrap());

        assert_eq!(actual, [2529, 1618, 1629]);
        assert!(2309.1 < estimate[0] && estimate[0] < 2309.2);
@@ -280,7 +280,8 @@ mod tests {

    #[test]
    fn test_cardinality_large() {
-        let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(1_000_000, 1.2f64).unwrap());
+        let (actual, estimate) =
+            test_cardinality(1_000_000, Zipf::new(1_000_000.0, 1.2f64).unwrap());

        assert_eq!(actual, [129077, 79579, 79630]);
        assert!(126067.2 < estimate[0] && estimate[0] < 126067.3);
@@ -290,7 +291,7 @@ mod tests {

    #[test]
    fn test_cardinality_small2() {
-        let (actual, estimate) = test_cardinality(100, Zipf::new(200, 0.8f64).unwrap());
+        let (actual, estimate) = test_cardinality(100, Zipf::new(200.0, 0.8f64).unwrap());

        assert_eq!(actual, [92, 58, 60]);
        assert!(116.1 < estimate[0] && estimate[0] < 116.2);
@@ -300,7 +301,7 @@ mod tests {

    #[test]
    fn test_cardinality_medium2() {
-        let (actual, estimate) = test_cardinality(10000, Zipf::new(20000, 0.8f64).unwrap());
+        let (actual, estimate) = test_cardinality(10000, Zipf::new(20000.0, 0.8f64).unwrap());

        assert_eq!(actual, [8201, 5131, 5051]);
        assert!(6846.4 < estimate[0] && estimate[0] < 6846.5);
@@ -310,7 +311,8 @@ mod tests {

    #[test]
    fn test_cardinality_large2() {
-        let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(2_000_000, 0.8f64).unwrap());
+        let (actual, estimate) =
+            test_cardinality(1_000_000, Zipf::new(2_000_000.0, 0.8f64).unwrap());

        assert_eq!(actual, [777847, 482069, 482246]);
        assert!(699437.4 < estimate[0] && estimate[0] < 699437.5);
--- a/libs/neon-shmem/Cargo.toml
+++ b/libs/neon-shmem/Cargo.toml
@@ -16,5 +16,5 @@ rustc-hash.workspace = true
 tempfile = "3.14.0"

 [dev-dependencies]
-rand = "0.9"
+rand.workspace = true
 rand_distr = "0.5.1"
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -394,7 +394,7 @@ impl From<&OtelExporterConfig> for tracing_utils::ExportConfig {
        tracing_utils::ExportConfig {
            endpoint: Some(val.endpoint.clone()),
            protocol: val.protocol.into(),
-            timeout: val.timeout,
+            timeout: Some(val.timeout),
        }
    }
 }
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -981,12 +981,12 @@ mod tests {
        let mut rng = rand::rngs::StdRng::seed_from_u64(42);

        let key = Key {
-            field1: rng.r#gen(),
-            field2: rng.r#gen(),
-            field3: rng.r#gen(),
-            field4: rng.r#gen(),
-            field5: rng.r#gen(),
-            field6: rng.r#gen(),
+            field1: rng.random(),
+            field2: rng.random(),
+            field3: rng.random(),
+            field4: rng.random(),
+            field5: rng.random(),
+            field6: rng.random(),
        };

        assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -443,9 +443,9 @@ pub struct ImportPgdataIdempotencyKey(pub String);
 impl ImportPgdataIdempotencyKey {
    pub fn random() -> Self {
        use rand::Rng;
-        use rand::distributions::Alphanumeric;
+        use rand::distr::Alphanumeric;
        Self(
-            rand::thread_rng()
+            rand::rng()
                .sample_iter(&Alphanumeric)
                .take(20)
                .map(char::from)
--- a/libs/pageserver_api/src/upcall_api.rs
+++ b/libs/pageserver_api/src/upcall_api.rs
@@ -21,6 +21,14 @@ pub struct ReAttachRequest {
    /// if the node already has a node_id set.
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub register: Option<NodeRegisterRequest>,
+
+    /// Hadron: Optional flag to indicate whether the node is starting with an empty local disk.
+    /// Will be set to true if the node couldn't find any local tenant data on startup, could be
+    /// due to the node starting for the first time or due to a local SSD failure/disk wipe event.
+    /// The flag may be used by the storage controller to update its observed state of the world
+    /// to make sure that it sends explicit location_config calls to the node following the
+    /// re-attach request.
+    pub empty_local_disk: Option<bool>,
 }

 #[derive(Serialize, Deserialize, Debug)]
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -203,12 +203,12 @@ impl fmt::Display for CancelKeyData {
    }
 }

-use rand::distributions::{Distribution, Standard};
-impl Distribution<CancelKeyData> for Standard {
+use rand::distr::{Distribution, StandardUniform};
+impl Distribution<CancelKeyData> for StandardUniform {
    fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> CancelKeyData {
        CancelKeyData {
-            backend_pid: rng.r#gen(),
-            cancel_key: rng.r#gen(),
+            backend_pid: rng.random(),
+            cancel_key: rng.random(),
        }
    }
 }
--- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
+++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
@@ -155,10 +155,10 @@ pub struct ScramSha256 {

 fn nonce() -> String {
    // rand 0.5's ThreadRng is cryptographically secure
-    let mut rng = rand::thread_rng();
+    let mut rng = rand::rng();
    (0..NONCE_LENGTH)
        .map(|_| {
-            let mut v = rng.gen_range(0x21u8..0x7e);
+            let mut v = rng.random_range(0x21u8..0x7e);
            if v == 0x2c {
                v = 0x7e
            }
--- a/libs/proxy/postgres-protocol2/src/message/backend.rs
+++ b/libs/proxy/postgres-protocol2/src/message/backend.rs
@@ -600,6 +600,7 @@ impl ParameterStatusBody {
    }
 }

+#[derive(Clone, Copy)]
 pub struct ReadyForQueryBody {
    status: u8,
 }
--- a/libs/proxy/postgres-protocol2/src/password/mod.rs
+++ b/libs/proxy/postgres-protocol2/src/password/mod.rs
@@ -28,7 +28,7 @@ const SCRAM_DEFAULT_SALT_LEN: usize = 16;
 /// special characters that would require escaping in an SQL command.
 pub async fn scram_sha_256(password: &[u8]) -> String {
    let mut salt: [u8; SCRAM_DEFAULT_SALT_LEN] = [0; SCRAM_DEFAULT_SALT_LEN];
-    let mut rng = rand::thread_rng();
+    let mut rng = rand::rng();
    rng.fill_bytes(&mut salt);
    scram_sha_256_salt(password, salt).await
 }
--- a/libs/proxy/tokio-postgres2/src/client.rs
+++ b/libs/proxy/tokio-postgres2/src/client.rs
@@ -18,10 +18,7 @@ use crate::config::{Host, SslMode};
 use crate::query::RowStream;
 use crate::simple_query::SimpleQueryStream;
 use crate::types::{Oid, Type};
-use crate::{
-    CancelToken, Error, ReadyForQueryStatus, SimpleQueryMessage, Transaction, TransactionBuilder,
-    query, simple_query,
-};
+use crate::{CancelToken, Error, ReadyForQueryStatus, SimpleQueryMessage, query, simple_query};

 pub struct Responses {
    /// new messages from conn
@@ -32,6 +29,9 @@ pub struct Responses {
    waiting: usize,
    /// number of ReadyForQuery messages received.
    received: usize,
+
+    /// The last query status we received.
+    last_status: ReadyForQueryStatus,
 }

 impl Responses {
@@ -42,7 +42,8 @@ impl Responses {
                let received = self.received;

                // increase the query head if this is the last message.
-                if let Message::ReadyForQuery(_) = message {
+                if let Message::ReadyForQuery(ref status) = message {
+                    self.last_status = (*status).into();
                    self.received += 1;
                }

@@ -71,6 +72,15 @@ impl Responses {
    pub async fn next(&mut self) -> Result<Message, Error> {
        future::poll_fn(|cx| self.poll_next(cx)).await
    }
+
+    pub async fn wait_until_ready(&mut self) -> Result<ReadyForQueryStatus, Error> {
+        while self.received < self.waiting {
+            if let Message::ReadyForQuery(status) = self.next().await? {
+                return Ok(status.into());
+            }
+        }
+        Ok(self.last_status)
+    }
 }

 /// A cache of type info and prepared statements for fetching type info
@@ -95,13 +105,6 @@ impl InnerClient {
        Ok(PartialQuery(Some(self)))
    }

-    // pub fn send_with_sync<F>(&mut self, f: F) -> Result<&mut Responses, Error>
-    // where
-    //     F: FnOnce(&mut BytesMut) -> Result<(), Error>,
-    // {
-    //     self.start()?.send_with_sync(f)
-    // }
-
    pub fn send_simple_query(&mut self, query: &str) -> Result<&mut Responses, Error> {
        self.responses.waiting += 1;

@@ -200,6 +203,8 @@ impl Client {
                    cur: BackendMessages::empty(),
                    waiting: 0,
                    received: 0,
+                    // new connections are always idle.
+                    last_status: ReadyForQueryStatus::Idle,
                },
                buffer: Default::default(),
            },
@@ -233,6 +238,11 @@ impl Client {
        rx
    }

+    /// Wait until this connection has no more active queries.
+    pub async fn wait_until_ready(&mut self) -> Result<ReadyForQueryStatus, Error> {
+        self.inner_mut().responses.wait_until_ready().await
+    }
+
    /// Pass text directly to the Postgres backend to allow it to sort out typing itself and
    /// to save a roundtrip
    pub async fn query_raw_txt<S, I>(
@@ -292,52 +302,32 @@ impl Client {
        simple_query::batch_execute(self.inner_mut(), query).await
    }

-    pub async fn discard_all(&mut self) -> Result<ReadyForQueryStatus, Error> {
-        self.batch_execute("discard all").await
-    }
-
-    /// Begins a new database transaction.
+    /// Similar to the SQL command `DISCARD ALL`, but it does not clear any query plans
    ///
-    /// The transaction will roll back by default - use the `commit` method to commit it.
-    pub async fn transaction(&mut self) -> Result<Transaction<'_>, Error> {
-        struct RollbackIfNotDone<'me> {
-            client: &'me mut Client,
-            done: bool,
-        }
+    /// This runs in the background, so it can be executed without `await`ing.
+    pub fn reset_session_background(&mut self) -> Result<(), Error> {
+        // "CLOSE ALL": closes any cursors
+        // "SET SESSION AUTHORIZATION DEFAULT": resets the current_user back to the session_user
+        // "RESET ALL": resets any GUCs back to their session defaults.
+        // "DEALLOCATE ALL": deallocates any prepared statements
+        // "UNLISTEN *": stops listening on all channels
+        // "SELECT pg_advisory_unlock_all();": unlocks all advisory locks
+        // "DISCARD TEMP;": drops all temporary tables
+        // "DISCARD SEQUENCES;": deallocates all cached sequence state

-        impl Drop for RollbackIfNotDone<'_> {
-            fn drop(&mut self) {
-                if self.done {
-                    return;
-                }
+        let _responses = self.inner_mut().send_simple_query(
+            "ROLLBACK;
+            CLOSE ALL;
+            SET SESSION AUTHORIZATION DEFAULT;
+            RESET ALL;
+            DEALLOCATE ALL;
+            UNLISTEN *;
+            SELECT pg_advisory_unlock_all();
+            DISCARD TEMP;
+            DISCARD SEQUENCES;",
+        )?;

-                let _ = self.client.inner.send_simple_query("ROLLBACK");
-            }
-        }
-
-        // This is done, as `Future` created by this method can be dropped after
-        // `RequestMessages` is synchronously send to the `Connection` by
-        // `batch_execute()`, but before `Responses` is asynchronously polled to
-        // completion. In that case `Transaction` won't be created and thus
-        // won't be rolled back.
-        {
-            let mut cleaner = RollbackIfNotDone {
-                client: self,
-                done: false,
-            };
-            cleaner.client.batch_execute("BEGIN").await?;
-            cleaner.done = true;
-        }
-
-        Ok(Transaction::new(self))
-    }
-
-    /// Returns a builder for a transaction with custom settings.
-    ///
-    /// Unlike the `transaction` method, the builder can be used to control the transaction's isolation level and other
-    /// attributes.
-    pub fn build_transaction(&mut self) -> TransactionBuilder<'_> {
-        TransactionBuilder::new(self)
+        Ok(())
    }

    /// Constructs a cancellation token that can later be used to request cancellation of a query running on the
--- a/libs/proxy/tokio-postgres2/src/generic_client.rs
+++ b/libs/proxy/tokio-postgres2/src/generic_client.rs
@@ -1,58 +0,0 @@
-#![allow(async_fn_in_trait)]
-
-use crate::query::RowStream;
-use crate::{Client, Error, Transaction};
-
-mod private {
-    pub trait Sealed {}
-}
-
-/// A trait allowing abstraction over connections and transactions.
-///
-/// This trait is "sealed", and cannot be implemented outside of this crate.
-pub trait GenericClient: private::Sealed {
-    /// Like `Client::query_raw_txt`.
-    async fn query_raw_txt<S, I>(
-        &mut self,
-        statement: &str,
-        params: I,
-    ) -> Result<RowStream<'_>, Error>
-    where
-        S: AsRef<str> + Sync + Send,
-        I: IntoIterator<Item = Option<S>> + Sync + Send,
-        I::IntoIter: ExactSizeIterator + Sync + Send;
-}
-
-impl private::Sealed for Client {}
-
-impl GenericClient for Client {
-    async fn query_raw_txt<S, I>(
-        &mut self,
-        statement: &str,
-        params: I,
-    ) -> Result<RowStream<'_>, Error>
-    where
-        S: AsRef<str> + Sync + Send,
-        I: IntoIterator<Item = Option<S>> + Sync + Send,
-        I::IntoIter: ExactSizeIterator + Sync + Send,
-    {
-        self.query_raw_txt(statement, params).await
-    }
-}
-
-impl private::Sealed for Transaction<'_> {}
-
-impl GenericClient for Transaction<'_> {
-    async fn query_raw_txt<S, I>(
-        &mut self,
-        statement: &str,
-        params: I,
-    ) -> Result<RowStream<'_>, Error>
-    where
-        S: AsRef<str> + Sync + Send,
-        I: IntoIterator<Item = Option<S>> + Sync + Send,
-        I::IntoIter: ExactSizeIterator + Sync + Send,
-    {
-        self.query_raw_txt(statement, params).await
-    }
-}
--- a/libs/proxy/tokio-postgres2/src/lib.rs
+++ b/libs/proxy/tokio-postgres2/src/lib.rs
@@ -9,13 +9,11 @@ pub use crate::config::Config;
 pub use crate::connect_raw::RawConnection;
 pub use crate::connection::Connection;
 pub use crate::error::Error;
-pub use crate::generic_client::GenericClient;
 pub use crate::query::RowStream;
 pub use crate::row::{Row, SimpleQueryRow};
 pub use crate::simple_query::SimpleQueryStream;
 pub use crate::statement::{Column, Statement};
 pub use crate::tls::NoTls;
-pub use crate::transaction::Transaction;
 pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder};

 /// After executing a query, the connection will be in one of these states
@@ -55,7 +53,6 @@ mod connect_socket;
 mod connect_tls;
 mod connection;
 pub mod error;
-mod generic_client;
 pub mod maybe_tls_stream;
 mod prepare;
 mod query;
@@ -63,7 +60,6 @@ pub mod row;
 mod simple_query;
 mod statement;
 pub mod tls;
-mod transaction;
 mod transaction_builder;
 pub mod types;

--- a/libs/proxy/tokio-postgres2/src/transaction.rs
+++ b/libs/proxy/tokio-postgres2/src/transaction.rs
@@ -1,73 +0,0 @@
-use crate::query::RowStream;
-use crate::{CancelToken, Client, Error, ReadyForQueryStatus};
-
-/// A representation of a PostgreSQL database transaction.
-///
-/// Transactions will implicitly roll back when dropped. Use the `commit` method to commit the changes made in the
-/// transaction. Transactions can be nested, with inner transactions implemented via safepoints.
-pub struct Transaction<'a> {
-    client: &'a mut Client,
-    done: bool,
-}
-
-impl Drop for Transaction<'_> {
-    fn drop(&mut self) {
-        if self.done {
-            return;
-        }
-
-        let _ = self.client.inner_mut().send_simple_query("ROLLBACK");
-    }
-}
-
-impl<'a> Transaction<'a> {
-    pub(crate) fn new(client: &'a mut Client) -> Transaction<'a> {
-        Transaction {
-            client,
-            done: false,
-        }
-    }
-
-    /// Consumes the transaction, committing all changes made within it.
-    pub async fn commit(mut self) -> Result<ReadyForQueryStatus, Error> {
-        self.done = true;
-        self.client.batch_execute("COMMIT").await
-    }
-
-    /// Rolls the transaction back, discarding all changes made within it.
-    ///
-    /// This is equivalent to `Transaction`'s `Drop` implementation, but provides any error encountered to the caller.
-    pub async fn rollback(mut self) -> Result<ReadyForQueryStatus, Error> {
-        self.done = true;
-        self.client.batch_execute("ROLLBACK").await
-    }
-
-    /// Like `Client::query_raw_txt`.
-    pub async fn query_raw_txt<S, I>(
-        &mut self,
-        statement: &str,
-        params: I,
-    ) -> Result<RowStream<'_>, Error>
-    where
-        S: AsRef<str>,
-        I: IntoIterator<Item = Option<S>>,
-        I::IntoIter: ExactSizeIterator,
-    {
-        self.client.query_raw_txt(statement, params).await
-    }
-
-    /// Like `Client::cancel_token`.
-    pub fn cancel_token(&self) -> CancelToken {
-        self.client.cancel_token()
-    }
-
-    /// Returns a reference to the underlying `Client`.
-    pub fn client(&self) -> &Client {
-        self.client
-    }
-
-    /// Returns a reference to the underlying `Client`.
-    pub fn client_mut(&mut self) -> &mut Client {
-        self.client
-    }
-}
--- a/libs/proxy/tokio-postgres2/src/transaction_builder.rs
+++ b/libs/proxy/tokio-postgres2/src/transaction_builder.rs
@@ -1,5 +1,3 @@
-use crate::{Client, Error, Transaction};
-
 /// The isolation level of a database transaction.
 #[derive(Debug, Copy, Clone)]
 #[non_exhaustive]
@@ -20,49 +18,17 @@ pub enum IsolationLevel {
 }

 /// A builder for database transactions.
-pub struct TransactionBuilder<'a> {
-    client: &'a mut Client,
-    isolation_level: Option<IsolationLevel>,
-    read_only: Option<bool>,
-    deferrable: Option<bool>,
+pub struct TransactionBuilder {
+    pub isolation_level: Option<IsolationLevel>,
+    pub read_only: Option<bool>,
+    pub deferrable: Option<bool>,
 }

-impl<'a> TransactionBuilder<'a> {
-    pub(crate) fn new(client: &'a mut Client) -> TransactionBuilder<'a> {
-        TransactionBuilder {
-            client,
-            isolation_level: None,
-            read_only: None,
-            deferrable: None,
-        }
-    }
-
-    /// Sets the isolation level of the transaction.
-    pub fn isolation_level(mut self, isolation_level: IsolationLevel) -> Self {
-        self.isolation_level = Some(isolation_level);
-        self
-    }
-
-    /// Sets the access mode of the transaction.
-    pub fn read_only(mut self, read_only: bool) -> Self {
-        self.read_only = Some(read_only);
-        self
-    }
-
-    /// Sets the deferrability of the transaction.
-    ///
-    /// If the transaction is also serializable and read only, creation of the transaction may block, but when it
-    /// completes the transaction is able to run with less overhead and a guarantee that it will not be aborted due to
-    /// serialization failure.
-    pub fn deferrable(mut self, deferrable: bool) -> Self {
-        self.deferrable = Some(deferrable);
-        self
-    }
-
+impl TransactionBuilder {
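-    /// Begins the transaction.
+    /// Formats the `START TRANSACTION` statement for the configured options.
    ///
-    /// The transaction will roll back by default - use the `commit` method to commit it.
+    /// This only builds and returns the SQL text; it does not execute anything.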
-    pub async fn start(self) -> Result<Transaction<'a>, Error> {
+    pub fn format(self) -> String {
        let mut query = "START TRANSACTION".to_string();
        let mut first = true;

@@ -106,8 +72,6 @@ impl<'a> TransactionBuilder<'a> {
            query.push_str(s);
        }

-        self.client.batch_execute(&query).await?;
-
-        Ok(Transaction::new(self.client))
+        query
    }
 }
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -43,7 +43,7 @@ itertools.workspace = true
 sync_wrapper = { workspace = true, features = ["futures"] }

 byteorder = "1.4"
-rand = "0.8.5"
+rand.workspace = true

 [dev-dependencies]
 camino-tempfile.workspace = true
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -81,7 +81,7 @@ impl UnreliableWrapper {
    ///
    fn attempt(&self, op: RemoteOp) -> anyhow::Result<u64> {
        let mut attempts = self.attempts.lock().unwrap();
-        let mut rng = rand::thread_rng();
+        let mut rng = rand::rng();

        match attempts.entry(op) {
            Entry::Occupied(mut e) => {
@@ -94,7 +94,7 @@ impl UnreliableWrapper {
                /* BEGIN_HADRON */
                // If there are more attempts to fail, fail the request by probability.
                if (attempts_before_this < self.attempts_to_fail)
-                    && (rng.gen_range(0..=100) < self.attempt_failure_probability)
+                    && (rng.random_range(0..=100) < self.attempt_failure_probability)
                {
                    let error =
                        anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -208,7 +208,7 @@ async fn create_azure_client(
        .as_millis();

    // because nanos can be the same for two threads so can millis, add randomness
-    let random = rand::thread_rng().r#gen::<u32>();
+    let random = rand::rng().random::<u32>();

    let remote_storage_config = RemoteStorageConfig {
        storage: RemoteStorageKind::AzureContainer(AzureConfig {
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -385,7 +385,7 @@ async fn create_s3_client(
        .as_millis();

    // because nanos can be the same for two threads so can millis, add randomness
-    let random = rand::thread_rng().r#gen::<u32>();
+    let random = rand::rng().random::<u32>();

    let remote_storage_config = RemoteStorageConfig {
        storage: RemoteStorageKind::AwsS3(S3Config {
--- a/libs/tracing-utils/src/lib.rs
+++ b/libs/tracing-utils/src/lib.rs
@@ -1,11 +1,5 @@
 //! Helper functions to set up OpenTelemetry tracing.
 //!
-//! This comes in two variants, depending on whether you have a Tokio runtime available.
-//! If you do, call `init_tracing()`. It sets up the trace processor and exporter to use
-//! the current tokio runtime. If you don't have a runtime available, or you don't want
-//! to share the runtime with the tracing tasks, call `init_tracing_without_runtime()`
-//! instead. It sets up a dedicated single-threaded Tokio runtime for the tracing tasks.
-//!
 //! Example:
 //!
 //! ```rust,no_run
@@ -21,7 +15,8 @@
 //!         .with_writer(std::io::stderr);
 //!
 //!     // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces
-//!     let otlp_layer = tracing_utils::init_tracing("my_application", tracing_utils::ExportConfig::default()).await;
+//!     let provider = tracing_utils::init_tracing("my_application", tracing_utils::ExportConfig::default());
+//!     let otlp_layer = provider.as_ref().map(tracing_utils::layer);
 //!
 //!     // Put it all together
 //!     tracing_subscriber::registry()
@@ -36,16 +31,18 @@
 pub mod http;
 pub mod perf_span;

-use opentelemetry::KeyValue;
 use opentelemetry::trace::TracerProvider;
 use opentelemetry_otlp::WithExportConfig;
 pub use opentelemetry_otlp::{ExportConfig, Protocol};
+use opentelemetry_sdk::trace::SdkTracerProvider;
 use tracing::level_filters::LevelFilter;
 use tracing::{Dispatch, Subscriber};
 use tracing_subscriber::Layer;
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::registry::LookupSpan;

+pub type Provider = SdkTracerProvider;
+
 /// Set up OpenTelemetry exporter, using configuration from environment variables.
 ///
 /// `service_name` is set as the OpenTelemetry 'service.name' resource (see
@@ -70,16 +67,7 @@ use tracing_subscriber::registry::LookupSpan;
 /// If you need some other setting, please test if it works first. And perhaps
 /// add a comment in the list above to save the effort of testing for the next
 /// person.
-///
-/// This doesn't block, but is marked as 'async' to hint that this must be called in
-/// asynchronous execution context.
-pub async fn init_tracing<S>(
-    service_name: &str,
-    export_config: ExportConfig,
-) -> Option<impl Layer<S>>
-where
-    S: Subscriber + for<'span> LookupSpan<'span>,
-{
+pub fn init_tracing(service_name: &str, export_config: ExportConfig) -> Option<Provider> {
    if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
        return None;
    };
@@ -89,52 +77,14 @@ where
    ))
 }

-/// Like `init_tracing`, but creates a separate tokio Runtime for the tracing
-/// tasks.
-pub fn init_tracing_without_runtime<S>(
-    service_name: &str,
-    export_config: ExportConfig,
-) -> Option<impl Layer<S>>
+pub fn layer<S>(p: &Provider) -> impl Layer<S>
 where
    S: Subscriber + for<'span> LookupSpan<'span>,
 {
-    if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
-        return None;
-    };
-
-    // The opentelemetry batch processor and the OTLP exporter needs a Tokio
-    // runtime. Create a dedicated runtime for them. One thread should be
-    // enough.
-    //
-    // (Alternatively, instead of batching, we could use the "simple
-    // processor", which doesn't need Tokio, and use "reqwest-blocking"
-    // feature for the OTLP exporter, which also doesn't need Tokio.  However,
-    // batching is considered best practice, and also I have the feeling that
-    // the non-Tokio codepaths in the opentelemetry crate are less used and
-    // might be more buggy, so better to stay on the well-beaten path.)
-    //
-    // We leak the runtime so that it keeps running after we exit the
-    // function.
-    let runtime = Box::leak(Box::new(
-        tokio::runtime::Builder::new_multi_thread()
-            .enable_all()
-            .thread_name("otlp runtime thread")
-            .worker_threads(1)
-            .build()
-            .unwrap(),
-    ));
-    let _guard = runtime.enter();
-
-    Some(init_tracing_internal(
-        service_name.to_string(),
-        export_config,
-    ))
+    tracing_opentelemetry::layer().with_tracer(p.tracer("global"))
 }

-fn init_tracing_internal<S>(service_name: String, export_config: ExportConfig) -> impl Layer<S>
-where
-    S: Subscriber + for<'span> LookupSpan<'span>,
-{
+fn init_tracing_internal(service_name: String, export_config: ExportConfig) -> Provider {
    // Sets up exporter from the provided [`ExportConfig`] parameter.
    // If the endpoint is not specified, it is loaded from the
    // OTEL_EXPORTER_OTLP_ENDPOINT environment variable.
@@ -153,22 +103,14 @@ where
        opentelemetry_sdk::propagation::TraceContextPropagator::new(),
    );

-    let tracer = opentelemetry_sdk::trace::TracerProvider::builder()
-        .with_batch_exporter(exporter, opentelemetry_sdk::runtime::Tokio)
-        .with_resource(opentelemetry_sdk::Resource::new(vec![KeyValue::new(
-            opentelemetry_semantic_conventions::resource::SERVICE_NAME,
-            service_name,
-        )]))
+    Provider::builder()
+        .with_batch_exporter(exporter)
+        .with_resource(
+            opentelemetry_sdk::Resource::builder()
+                .with_service_name(service_name)
+                .build(),
+        )
        .build()
-        .tracer("global");
-
-    tracing_opentelemetry::layer().with_tracer(tracer)
-}
-
-// Shutdown trace pipeline gracefully, so that it has a chance to send any
-// pending traces before we exit.
-pub fn shutdown_tracing() {
-    opentelemetry::global::shutdown_tracer_provider();
 }

 pub enum OtelEnablement {
@@ -176,17 +118,17 @@ pub enum OtelEnablement {
    Enabled {
        service_name: String,
        export_config: ExportConfig,
-        runtime: &'static tokio::runtime::Runtime,
    },
 }

 pub struct OtelGuard {
+    provider: Provider,
    pub dispatch: Dispatch,
 }

 impl Drop for OtelGuard {
    fn drop(&mut self) {
-        shutdown_tracing();
+        _ = self.provider.shutdown();
    }
 }

@@ -199,22 +141,19 @@ impl Drop for OtelGuard {
-/// The lifetime of the guard should match taht of the application. On drop, it tears down the
+/// The lifetime of the guard should match that of the application. On drop, it tears down the
 /// OTEL infra.
 pub fn init_performance_tracing(otel_enablement: OtelEnablement) -> Option<OtelGuard> {
-    let otel_subscriber = match otel_enablement {
+    match otel_enablement {
        OtelEnablement::Disabled => None,
        OtelEnablement::Enabled {
            service_name,
            export_config,
-            runtime,
        } => {
-            let otel_layer = runtime
-                .block_on(init_tracing(&service_name, export_config))
-                .with_filter(LevelFilter::INFO);
+            let provider = init_tracing(&service_name, export_config)?;
+
+            let otel_layer = layer(&provider).with_filter(LevelFilter::INFO);
            let otel_subscriber = tracing_subscriber::registry().with(otel_layer);
-            let otel_dispatch = Dispatch::new(otel_subscriber);
+            let dispatch = Dispatch::new(otel_subscriber);

-            Some(otel_dispatch)
+            Some(OtelGuard { dispatch, provider })
        }
-    };
-
-    otel_subscriber.map(|dispatch| OtelGuard { dispatch })
+    }
 }
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -104,7 +104,7 @@ impl Id {

    pub fn generate() -> Self {
        let mut tli_buf = [0u8; 16];
-        rand::thread_rng().fill(&mut tli_buf);
+        rand::rng().fill(&mut tli_buf);
        Id::from(tli_buf)
    }

--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -364,42 +364,37 @@ impl MonotonicCounter<Lsn> for RecordLsn {
    }
 }

-/// Implements  [`rand::distributions::uniform::UniformSampler`] so we can sample [`Lsn`]s.
+/// Implements  [`rand::distr::uniform::UniformSampler`] so we can sample [`Lsn`]s.
 ///
 /// This is used by the `pagebench` pageserver benchmarking tool.
-pub struct LsnSampler(<u64 as rand::distributions::uniform::SampleUniform>::Sampler);
+pub struct LsnSampler(<u64 as rand::distr::uniform::SampleUniform>::Sampler);

-impl rand::distributions::uniform::SampleUniform for Lsn {
+impl rand::distr::uniform::SampleUniform for Lsn {
    type Sampler = LsnSampler;
 }

-impl rand::distributions::uniform::UniformSampler for LsnSampler {
+impl rand::distr::uniform::UniformSampler for LsnSampler {
    type X = Lsn;

-    fn new<B1, B2>(low: B1, high: B2) -> Self
+    fn new<B1, B2>(low: B1, high: B2) -> Result<Self, rand::distr::uniform::Error>
    where
-        B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
-        B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
+        B1: rand::distr::uniform::SampleBorrow<Self::X> + Sized,
+        B2: rand::distr::uniform::SampleBorrow<Self::X> + Sized,
    {
-        Self(
-            <u64 as rand::distributions::uniform::SampleUniform>::Sampler::new(
-                low.borrow().0,
-                high.borrow().0,
-            ),
-        )
+        <u64 as rand::distr::uniform::SampleUniform>::Sampler::new(low.borrow().0, high.borrow().0)
+            .map(Self)
    }

-    fn new_inclusive<B1, B2>(low: B1, high: B2) -> Self
+    fn new_inclusive<B1, B2>(low: B1, high: B2) -> Result<Self, rand::distr::uniform::Error>
    where
-        B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
-        B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
+        B1: rand::distr::uniform::SampleBorrow<Self::X> + Sized,
+        B2: rand::distr::uniform::SampleBorrow<Self::X> + Sized,
    {
-        Self(
-            <u64 as rand::distributions::uniform::SampleUniform>::Sampler::new_inclusive(
-                low.borrow().0,
-                high.borrow().0,
-            ),
+        <u64 as rand::distr::uniform::SampleUniform>::Sampler::new_inclusive(
+            low.borrow().0,
+            high.borrow().0,
        )
+        .map(Self)
    }

    fn sample<R: rand::prelude::Rng + ?Sized>(&self, rng: &mut R) -> Self::X {
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -11,7 +11,8 @@ use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::storage_layer::{LayerName, PersistentLayerDesc};
 use pageserver_api::key::Key;
 use pageserver_api::shard::TenantShardId;
-use rand::prelude::{SeedableRng, SliceRandom, StdRng};
+use rand::prelude::{SeedableRng, StdRng};
+use rand::seq::IndexedRandom;
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

--- a/pageserver/compaction/src/bin/compaction-simulator.rs
+++ b/pageserver/compaction/src/bin/compaction-simulator.rs
@@ -89,7 +89,7 @@ async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()>
            let cold_key_range = splitpoint..key_range.end;

            for i in 0..cmd.num_records {
-                let chosen_range = if rand::thread_rng().gen_bool(0.9) {
+                let chosen_range = if rand::rng().random_bool(0.9) {
                    &hot_key_range
                } else {
                    &cold_key_range
--- a/pageserver/compaction/src/simulator.rs
+++ b/pageserver/compaction/src/simulator.rs
@@ -300,9 +300,9 @@ impl MockTimeline {
        key_range: &Range<Key>,
    ) -> anyhow::Result<()> {
        crate::helpers::union_to_keyspace(&mut self.keyspace, vec![key_range.clone()]);
-        let mut rng = rand::thread_rng();
+        let mut rng = rand::rng();
        for _ in 0..num_records {
-            self.ingest_record(rng.gen_range(key_range.clone()), len);
+            self.ingest_record(rng.random_range(key_range.clone()), len);
            self.wal_ingested += len;
        }
        Ok(())
--- a/pageserver/pagebench/src/cmd/basebackup.rs
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -188,9 +188,9 @@ async fn main_impl(
        start_work_barrier.wait().await;
        loop {
            let (timeline, work) = {
-                let mut rng = rand::thread_rng();
+                let mut rng = rand::rng();
                let target = all_targets.choose(&mut rng).unwrap();
-                let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r));
+                let lsn = target.lsn_range.clone().map(|r| rng.random_range(r));
                (target.timeline, Work { lsn })
            };
            let sender = work_senders.get(&timeline).unwrap();
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -326,8 +326,7 @@ async fn main_impl(
            .cloned()
            .collect();
        let weights =
-            rand::distributions::weighted::WeightedIndex::new(ranges.iter().map(|v| v.len()))
-                .unwrap();
+            rand::distr::weighted::WeightedIndex::new(ranges.iter().map(|v| v.len())).unwrap();

        Box::pin(async move {
            let scheme = match Url::parse(&args.page_service_connstring) {
@@ -427,7 +426,7 @@ async fn run_worker(
    cancel: CancellationToken,
    rps_period: Option<Duration>,
    ranges: Vec<KeyRange>,
-    weights: rand::distributions::weighted::WeightedIndex<i128>,
+    weights: rand::distr::weighted::WeightedIndex<i128>,
 ) {
    shared_state.start_work_barrier.wait().await;
    let client_start = Instant::now();
@@ -469,9 +468,9 @@ async fn run_worker(
                }

                // Pick a random page from a random relation.
-                let mut rng = rand::thread_rng();
+                let mut rng = rand::rng();
                let r = &ranges[weights.sample(&mut rng)];
-                let key: i128 = rng.gen_range(r.start..r.end);
+                let key: i128 = rng.random_range(r.start..r.end);
                let (rel_tag, block_no) = key_to_block(key);

                let mut blks = VecDeque::with_capacity(batch_size);
@@ -502,7 +501,7 @@ async fn run_worker(
                // We assume that the entire batch can fit within the relation.
                assert_eq!(blks.len(), batch_size, "incomplete batch");

-                let req_lsn = if rng.gen_bool(args.req_latest_probability) {
+                let req_lsn = if rng.random_bool(args.req_latest_probability) {
                    Lsn::MAX
                } else {
                    r.timeline_lsn
--- a/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
+++ b/pageserver/pagebench/src/cmd/ondemand_download_churn.rs
@@ -7,7 +7,7 @@ use std::time::{Duration, Instant};
 use pageserver_api::models::HistoricLayerInfo;
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
-use rand::seq::SliceRandom;
+use rand::seq::IndexedMutRandom;
 use tokio::sync::{OwnedSemaphorePermit, mpsc};
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
@@ -260,7 +260,7 @@ async fn timeline_actor(

            loop {
                let layer_tx = {
-                    let mut rng = rand::thread_rng();
+                    let mut rng = rand::rng();
                    timeline.layers.choose_mut(&mut rng).expect("no layers")
                };
                match layer_tx.try_send(permit.take().unwrap()) {
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -11,6 +11,7 @@
 //! from data stored in object storage.
 //!
 use std::fmt::Write as FmtWrite;
+use std::sync::Arc;
 use std::time::{Instant, SystemTime};

 use anyhow::{Context, anyhow};
@@ -420,12 +421,16 @@ where
        }

        let mut min_restart_lsn: Lsn = Lsn::MAX;
+
+        let mut dbdir_cnt = 0;
+        let mut rel_cnt = 0;
+
        // Create tablespace directories
        for ((spcnode, dbnode), has_relmap_file) in
            self.timeline.list_dbdirs(self.lsn, self.ctx).await?
        {
            self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
-
+            dbdir_cnt += 1;
            // If full backup is requested, include all relation files.
            // Otherwise only include init forks of unlogged relations.
            let rels = self
@@ -433,6 +438,7 @@ where
                .list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx)
                .await?;
            for &rel in rels.iter() {
+                rel_cnt += 1;
                // Send init fork as main fork to provide well formed empty
                // contents of UNLOGGED relations. Postgres copies it in
                // `reinit.c` during recovery.
@@ -455,6 +461,10 @@ where
            }
        }

+        self.timeline
+            .db_rel_count
+            .store(Some(Arc::new((dbdir_cnt, rel_cnt))));
+
        let start_time = Instant::now();
        let aux_files = self
            .timeline
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -126,7 +126,6 @@ fn main() -> anyhow::Result<()> {
        Some(cfg) => tracing_utils::OtelEnablement::Enabled {
            service_name: "pageserver".to_string(),
            export_config: (&cfg.export_config).into(),
-            runtime: *COMPUTE_REQUEST_RUNTIME,
        },
        None => tracing_utils::OtelEnablement::Disabled,
    };
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -42,6 +42,7 @@ pub trait StorageControllerUpcallApi {
    fn re_attach(
        &self,
        conf: &PageServerConf,
+        empty_local_disk: bool,
    ) -> impl Future<
        Output = Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError>,
    > + Send;
@@ -155,6 +156,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
    async fn re_attach(
        &self,
        conf: &PageServerConf,
+        empty_local_disk: bool,
    ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
        let url = self
            .base_url
@@ -226,6 +228,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
        let request = ReAttachRequest {
            node_id: self.node_id,
            register: register.clone(),
+            empty_local_disk: Some(empty_local_disk),
        };

        let response: ReAttachResponse = self
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -768,6 +768,7 @@ mod test {
        async fn re_attach(
            &self,
            _conf: &PageServerConf,
+            _empty_local_disk: bool,
        ) -> Result<HashMap<TenantShardId, ReAttachResponseTenant>, RetryForeverError> {
            unimplemented!()
        }
--- a/pageserver/src/feature_resolver.rs
+++ b/pageserver/src/feature_resolver.rs
@@ -155,7 +155,9 @@ impl FeatureResolver {
                    );

                    let tenant_properties = PerTenantProperties {
-                        remote_size_mb: Some(rand::thread_rng().gen_range(100.0..1000000.00)),
+                        remote_size_mb: Some(rand::rng().random_range(100.0..1000000.00)),
+                        db_count_max: Some(rand::rng().random_range(1..1000)),
+                        rel_count_max: Some(rand::rng().random_range(1..1000)),
                    }
                    .into_posthog_properties();

@@ -344,6 +346,8 @@ impl FeatureResolver {

 struct PerTenantProperties {
    pub remote_size_mb: Option<f64>,
+    pub db_count_max: Option<usize>,
+    pub rel_count_max: Option<usize>,
 }

 impl PerTenantProperties {
@@ -355,6 +359,18 @@ impl PerTenantProperties {
                PostHogFlagFilterPropertyValue::Number(remote_size_mb),
            );
        }
+        if let Some(db_count) = self.db_count_max {
+            properties.insert(
+                "tenant_db_count_max".to_string(),
+                PostHogFlagFilterPropertyValue::Number(db_count as f64),
+            );
+        }
+        if let Some(rel_count) = self.rel_count_max {
+            properties.insert(
+                "tenant_rel_count_max".to_string(),
+                PostHogFlagFilterPropertyValue::Number(rel_count as f64),
+            );
+        }
        properties
    }
 }
@@ -409,7 +425,11 @@ impl TenantFeatureResolver {

    /// Refresh the cached properties and flags on the critical path.
    pub fn refresh_properties_and_flags(&self, tenant_shard: &TenantShard) {
+        // If any timeline's remote size is none, this property is none.
        let mut remote_size_mb = Some(0.0);
+        // If any timeline's db or rel count is available, this property is available.
+        let mut db_count_max = None;
+        let mut rel_count_max = None;
        for timeline in tenant_shard.list_timelines() {
            let size = timeline.metrics.resident_physical_size_get();
            if size == 0 {
@@ -419,9 +439,25 @@ impl TenantFeatureResolver {
            if let Some(ref mut remote_size_mb) = remote_size_mb {
                *remote_size_mb += size as f64 / 1024.0 / 1024.0;
            }
+            if let Some(data) = timeline.db_rel_count.load_full() {
+                let (db_count, rel_count) = *data.as_ref();
+                if db_count_max.is_none() {
+                    db_count_max = Some(db_count);
+                }
+                if rel_count_max.is_none() {
+                    rel_count_max = Some(rel_count);
+                }
+                db_count_max = db_count_max.map(|max| max.max(db_count));
+                rel_count_max = rel_count_max.map(|max| max.max(rel_count));
+            }
        }
        self.cached_tenant_properties.store(Arc::new(
-            PerTenantProperties { remote_size_mb }.into_posthog_properties(),
+            PerTenantProperties {
+                remote_size_mb,
+                db_count_max,
+                rel_count_max,
+            }
+            .into_posthog_properties(),
        ));

        // BEGIN: Update the feature flag on the critical path.
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -8,6 +8,7 @@
 //!
 use std::collections::{HashMap, HashSet, hash_map};
 use std::ops::{ControlFlow, Range};
+use std::sync::Arc;

 use crate::walingest::{WalIngestError, WalIngestErrorKind};
 use crate::{PERF_TRACE_TARGET, ensure_walingest};
@@ -1254,11 +1255,16 @@ impl Timeline {
        let dbdir = DbDirectory::des(&buf)?;

        let mut total_size: u64 = 0;
+        let mut dbdir_cnt = 0;
+        let mut rel_cnt = 0;
+
        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
+            dbdir_cnt += 1;
            for rel in self
                .list_rels(*spcnode, *dbnode, Version::at(lsn), ctx)
                .await?
            {
+                rel_cnt += 1;
                if self.cancel.is_cancelled() {
                    return Err(CalculateLogicalSizeError::Cancelled);
                }
@@ -1269,6 +1275,10 @@ impl Timeline {
                total_size += relsize as u64;
            }
        }
+
+        self.db_rel_count
+            .store(Some(Arc::new((dbdir_cnt, rel_cnt))));
+
        Ok(total_size * BLCKSZ as u64)
    }

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -6161,11 +6161,11 @@ mod tests {
    use pageserver_api::keyspace::KeySpaceRandomAccum;
    use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings, LsnLease};
    use pageserver_compaction::helpers::overlaps_with;
+    use rand::Rng;
    #[cfg(feature = "testing")]
    use rand::SeedableRng;
    #[cfg(feature = "testing")]
    use rand::rngs::StdRng;
-    use rand::{Rng, thread_rng};
    #[cfg(feature = "testing")]
    use std::ops::Range;
    use storage_layer::{IoConcurrency, PersistentLayerKey};
@@ -6286,8 +6286,8 @@ mod tests {
            while lsn < lsn_range.end {
                let mut key = key_range.start;
                while key < key_range.end {
-                    let gap = random.gen_range(1..=100) <= spec.gap_chance;
-                    let will_init = random.gen_range(1..=100) <= spec.will_init_chance;
+                    let gap = random.random_range(1..=100) <= spec.gap_chance;
+                    let will_init = random.random_range(1..=100) <= spec.will_init_chance;

                    if gap {
                        continue;
@@ -6330,8 +6330,8 @@ mod tests {
            while lsn < lsn_range.end {
                let mut key = key_range.start;
                while key < key_range.end {
-                    let gap = random.gen_range(1..=100) <= spec.gap_chance;
-                    let will_init = random.gen_range(1..=100) <= spec.will_init_chance;
+                    let gap = random.random_range(1..=100) <= spec.gap_chance;
+                    let will_init = random.random_range(1..=100) <= spec.will_init_chance;

                    if gap {
                        continue;
@@ -7808,7 +7808,7 @@ mod tests {
        for _ in 0..50 {
            for _ in 0..NUM_KEYS {
                lsn = Lsn(lsn.0 + 0x10);
-                let blknum = thread_rng().gen_range(0..NUM_KEYS);
+                let blknum = rand::rng().random_range(0..NUM_KEYS);
                test_key.field6 = blknum as u32;
                let mut writer = tline.writer().await;
                writer
@@ -7897,7 +7897,7 @@ mod tests {

            for _ in 0..NUM_KEYS {
                lsn = Lsn(lsn.0 + 0x10);
-                let blknum = thread_rng().gen_range(0..NUM_KEYS);
+                let blknum = rand::rng().random_range(0..NUM_KEYS);
                test_key.field6 = blknum as u32;
                let mut writer = tline.writer().await;
                writer
@@ -7965,7 +7965,7 @@ mod tests {

            for _ in 0..NUM_KEYS {
                lsn = Lsn(lsn.0 + 0x10);
-                let blknum = thread_rng().gen_range(0..NUM_KEYS);
+                let blknum = rand::rng().random_range(0..NUM_KEYS);
                test_key.field6 = blknum as u32;
                let mut writer = tline.writer().await;
                writer
@@ -8229,7 +8229,7 @@ mod tests {

            for _ in 0..NUM_KEYS {
                lsn = Lsn(lsn.0 + 0x10);
-                let blknum = thread_rng().gen_range(0..NUM_KEYS);
+                let blknum = rand::rng().random_range(0..NUM_KEYS);
                test_key.field6 = (blknum * STEP) as u32;
                let mut writer = tline.writer().await;
                writer
@@ -8502,7 +8502,7 @@ mod tests {
        for iter in 1..=10 {
            for _ in 0..NUM_KEYS {
                lsn = Lsn(lsn.0 + 0x10);
-                let blknum = thread_rng().gen_range(0..NUM_KEYS);
+                let blknum = rand::rng().random_range(0..NUM_KEYS);
                test_key.field6 = (blknum * STEP) as u32;
                let mut writer = tline.writer().await;
                writer
@@ -11291,10 +11291,10 @@ mod tests {
    #[cfg(feature = "testing")]
    #[tokio::test]
    async fn test_read_path() -> anyhow::Result<()> {
-        use rand::seq::SliceRandom;
+        use rand::seq::IndexedRandom;

        let seed = if cfg!(feature = "fuzz-read-path") {
-            let seed: u64 = thread_rng().r#gen();
+            let seed: u64 = rand::rng().random();
            seed
        } else {
            // Use a hard-coded seed when not in fuzzing mode.
@@ -11308,8 +11308,8 @@ mod tests {

        let (queries, will_init_chance, gap_chance) = if cfg!(feature = "fuzz-read-path") {
            const QUERIES: u64 = 5000;
-            let will_init_chance: u8 = random.gen_range(0..=10);
-            let gap_chance: u8 = random.gen_range(0..=50);
+            let will_init_chance: u8 = random.random_range(0..=10);
+            let gap_chance: u8 = random.random_range(0..=50);

            (QUERIES, will_init_chance, gap_chance)
        } else {
@@ -11410,7 +11410,8 @@ mod tests {

                while used_keys.len() < tenant.conf.max_get_vectored_keys.get() {
                    let selected_lsn = interesting_lsns.choose(&mut random).expect("not empty");
-                    let mut selected_key = start_key.add(random.gen_range(0..KEY_DIMENSION_SIZE));
+                    let mut selected_key =
+                        start_key.add(random.random_range(0..KEY_DIMENSION_SIZE));

                    while used_keys.len() < tenant.conf.max_get_vectored_keys.get() {
                        if used_keys.contains(&selected_key)
@@ -11425,7 +11426,7 @@ mod tests {
                            .add_key(selected_key);
                        used_keys.insert(selected_key);

-                        let pick_next = random.gen_range(0..=100) <= PICK_NEXT_CHANCE;
+                        let pick_next = random.random_range(0..=100) <= PICK_NEXT_CHANCE;
                        if pick_next {
                            selected_key = selected_key.next();
                        } else {
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -535,8 +535,8 @@ pub(crate) mod tests {
    }

    pub(crate) fn random_array(len: usize) -> Vec<u8> {
-        let mut rng = rand::thread_rng();
-        (0..len).map(|_| rng.r#gen()).collect::<_>()
+        let mut rng = rand::rng();
+        (0..len).map(|_| rng.random()).collect::<_>()
    }

    #[tokio::test]
@@ -588,9 +588,9 @@ pub(crate) mod tests {
        let mut rng = rand::rngs::StdRng::seed_from_u64(42);
        let blobs = (0..1024)
            .map(|_| {
-                let mut sz: u16 = rng.r#gen();
+                let mut sz: u16 = rng.random();
                // Make 50% of the arrays small
-                if rng.r#gen() {
+                if rng.random() {
                    sz &= 63;
                }
                random_array(sz.into())
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -1090,7 +1090,7 @@ pub(crate) mod tests {
        const NUM_KEYS: usize = 100000;
        let mut all_data: BTreeMap<u128, u64> = BTreeMap::new();
        for idx in 0..NUM_KEYS {
-            let u: f64 = rand::thread_rng().gen_range(0.0..1.0);
+            let u: f64 = rand::rng().random_range(0.0..1.0);
            let t = -(f64::ln(u));
            let key_int = (t * 1000000.0) as u128;

@@ -1116,7 +1116,7 @@ pub(crate) mod tests {

        // Test get() operations on random keys, most of which will not exist
        for _ in 0..100000 {
-            let key_int = rand::thread_rng().r#gen::<u128>();
+            let key_int = rand::rng().random::<u128>();
            let search_key = u128::to_be_bytes(key_int);
            assert!(reader.get(&search_key, &ctx).await? == all_data.get(&key_int).cloned());
        }
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -508,8 +508,8 @@ mod tests {

        let write_nbytes = cap * 2 + cap / 2;

-        let content: Vec<u8> = rand::thread_rng()
-            .sample_iter(rand::distributions::Standard)
+        let content: Vec<u8> = rand::rng()
+            .sample_iter(rand::distr::StandardUniform)
            .take(write_nbytes)
            .collect();

@@ -565,8 +565,8 @@ mod tests {
        let cap = writer.mutable().capacity();
        drop(writer);

-        let content: Vec<u8> = rand::thread_rng()
-            .sample_iter(rand::distributions::Standard)
+        let content: Vec<u8> = rand::rng()
+            .sample_iter(rand::distr::StandardUniform)
            .take(cap * 2 + cap / 2)
            .collect();

@@ -614,8 +614,8 @@ mod tests {
        let cap = mutable.capacity();
        let align = mutable.align();
        drop(writer);
-        let content: Vec<u8> = rand::thread_rng()
-            .sample_iter(rand::distributions::Standard)
+        let content: Vec<u8> = rand::rng()
+            .sample_iter(rand::distr::StandardUniform)
            .take(cap * 2 + cap / 2)
            .collect();

--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -19,7 +19,7 @@ use pageserver_api::shard::{
 };
 use pageserver_api::upcall_api::ReAttachResponseTenant;
 use rand::Rng;
-use rand::distributions::Alphanumeric;
+use rand::distr::Alphanumeric;
 use remote_storage::TimeoutOrCancel;
 use sysinfo::SystemExt;
 use tokio::fs;
@@ -218,7 +218,7 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
            std::io::ErrorKind::InvalidInput,
            "Path must be absolute",
        ))?;
-    let rand_suffix = rand::thread_rng()
+    let rand_suffix = rand::rng()
        .sample_iter(&Alphanumeric)
        .take(8)
        .map(char::from)
@@ -352,7 +352,8 @@ async fn init_load_generations(
        let client = StorageControllerUpcallClient::new(conf, cancel);
        info!("Calling {} API to re-attach tenants", client.base_url());
        // If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
-        match client.re_attach(conf).await {
+        let empty_local_disk = tenant_confs.is_empty();
+        match client.re_attach(conf, empty_local_disk).await {
            Ok(tenants) => tenants
                .into_iter()
                .flat_map(|(id, rart)| {
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -25,7 +25,7 @@ pub(super) fn period_jitter(d: Duration, pct: u32) -> Duration {
    if d == Duration::ZERO {
        d
    } else {
-        rand::thread_rng().gen_range((d * (100 - pct)) / 100..(d * (100 + pct)) / 100)
+        rand::rng().random_range((d * (100 - pct)) / 100..(d * (100 + pct)) / 100)
    }
 }

@@ -35,7 +35,7 @@ pub(super) fn period_warmup(period: Duration) -> Duration {
    if period == Duration::ZERO {
        period
    } else {
-        rand::thread_rng().gen_range(Duration::ZERO..period)
+        rand::rng().random_range(Duration::ZERO..period)
    }
 }

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -1634,7 +1634,8 @@ pub(crate) mod test {
    use bytes::Bytes;
    use itertools::MinMaxResult;
    use postgres_ffi::PgMajorVersion;
-    use rand::prelude::{SeedableRng, SliceRandom, StdRng};
+    use rand::prelude::{SeedableRng, StdRng};
+    use rand::seq::IndexedRandom;
    use rand::{Rng, RngCore};

    /// Construct an index for a fictional delta layer and and then
@@ -1788,14 +1789,14 @@ pub(crate) mod test {

        let mut entries = Vec::new();
        for _ in 0..constants::KEY_COUNT {
-            let count = rng.gen_range(1..constants::MAX_ENTRIES_PER_KEY);
+            let count = rng.random_range(1..constants::MAX_ENTRIES_PER_KEY);
            let mut lsns_iter =
                std::iter::successors(Some(Lsn(constants::LSN_OFFSET.0 + 0x08)), |lsn| {
                    Some(Lsn(lsn.0 + 0x08))
                });
            let mut lsns = Vec::new();
            while lsns.len() < count as usize {
-                let take = rng.gen_bool(0.5);
+                let take = rng.random_bool(0.5);
                let lsn = lsns_iter.next().unwrap();
                if take {
                    lsns.push(lsn);
@@ -1869,12 +1870,13 @@ pub(crate) mod test {
        for _ in 0..constants::RANGES_COUNT {
            let mut range: Option<Range<Key>> = Option::default();
            while range.is_none() || keyspace.overlaps(range.as_ref().unwrap()) {
-                let range_start = rng.gen_range(start..end);
+                let range_start = rng.random_range(start..end);
                let range_end_offset = range_start + constants::MIN_RANGE_SIZE;
                if range_end_offset >= end {
                    range = Some(Key::from_i128(range_start)..Key::from_i128(end));
                } else {
-                    let range_end = rng.gen_range((range_start + constants::MIN_RANGE_SIZE)..end);
+                    let range_end =
+                        rng.random_range((range_start + constants::MIN_RANGE_SIZE)..end);
                    range = Some(Key::from_i128(range_start)..Key::from_i128(range_end));
                }
            }
--- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
@@ -440,8 +440,8 @@ mod tests {
    impl InMemoryFile {
        fn new_random(len: usize) -> Self {
            Self {
-                content: rand::thread_rng()
-                    .sample_iter(rand::distributions::Standard)
+                content: rand::rng()
+                    .sample_iter(rand::distr::StandardUniform)
                    .take(len)
                    .collect(),
            }
@@ -498,7 +498,7 @@ mod tests {
                    len
                }
            };
-            rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[nread..]); // to discover bugs
+            rand::Rng::fill(&mut rand::rng(), &mut dst_slice[nread..]); // to discover bugs
            Ok((dst, nread))
        }
    }
@@ -763,7 +763,7 @@ mod tests {
                    let len = std::cmp::min(dst.bytes_total(), mocked_bytes.len());
                    let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed();
                    dst_slice[..len].copy_from_slice(&mocked_bytes[..len]);
-                    rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[len..]); // to discover bugs
+                    rand::Rng::fill(&mut rand::rng(), &mut dst_slice[len..]); // to discover bugs
                    Ok((dst, len))
                }
                Err(e) => Err(std::io::Error::other(e)),
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -515,7 +515,7 @@ pub(crate) async fn sleep_random_range(
    interval: RangeInclusive<Duration>,
    cancel: &CancellationToken,
 ) -> Result<Duration, Cancelled> {
-    let delay = rand::thread_rng().gen_range(interval);
+    let delay = rand::rng().random_range(interval);
    if delay == Duration::ZERO {
        return Ok(delay);
    }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -287,7 +287,7 @@ pub struct Timeline {
    ancestor_lsn: Lsn,

    // The LSN of gc-compaction that was last applied to this timeline.
-    gc_compaction_state: ArcSwap<Option<GcCompactionState>>,
+    gc_compaction_state: ArcSwapOption<GcCompactionState>,

    pub(crate) metrics: Arc<TimelineMetrics>,

@@ -448,7 +448,11 @@ pub struct Timeline {
    /// A channel to send async requests to prepare a basebackup for the basebackup cache.
    basebackup_cache: Arc<BasebackupCache>,

+    #[expect(dead_code)]
    feature_resolver: Arc<TenantFeatureResolver>,
+
+    /// `(db, rel)` counts stored by basebackup and size calculation. Used for reldirv2 rollout.
+    pub(crate) db_rel_count: ArcSwapOption<(usize, usize)>,
 }

 pub(crate) enum PreviousHeatmap {
@@ -2826,7 +2830,7 @@ impl Timeline {
                if r.numerator == 0 {
                    false
                } else {
-                    rand::thread_rng().gen_range(0..r.denominator) < r.numerator
+                    rand::rng().random_range(0..r.denominator) < r.numerator
                }
            }
            None => false,
@@ -3236,7 +3240,7 @@ impl Timeline {
                }),
                disk_consistent_lsn: AtomicLsn::new(disk_consistent_lsn.0),

-                gc_compaction_state: ArcSwap::new(Arc::new(gc_compaction_state)),
+                gc_compaction_state: ArcSwapOption::from_pointee(gc_compaction_state),

                last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0),
                last_freeze_ts: RwLock::new(Instant::now()),
@@ -3341,6 +3345,8 @@ impl Timeline {
                basebackup_cache: resources.basebackup_cache,

                feature_resolver: resources.feature_resolver.clone(),
+
+                db_rel_count: ArcSwapOption::from_pointee(None),
            };

            result.repartition_threshold =
@@ -3412,7 +3418,7 @@ impl Timeline {
        gc_compaction_state: GcCompactionState,
    ) -> anyhow::Result<()> {
        self.gc_compaction_state
-            .store(Arc::new(Some(gc_compaction_state.clone())));
+            .store(Some(Arc::new(gc_compaction_state.clone())));
        self.remote_client
            .schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state)
    }
@@ -3428,7 +3434,10 @@ impl Timeline {
    }

    pub(crate) fn get_gc_compaction_state(&self) -> Option<GcCompactionState> {
-        self.gc_compaction_state.load_full().as_ref().clone()
+        self.gc_compaction_state
+            .load()
+            .as_ref()
+            .map(|x| x.as_ref().clone())
    }

    /// Creates and starts the wal receiver.
@@ -3908,7 +3917,7 @@ impl Timeline {
                                // 1hour base
                                (60_i64 * 60_i64)
                                    // 10min jitter
-                                    + rand::thread_rng().gen_range(-10 * 60..10 * 60),
+                                    + rand::rng().random_range(-10 * 60..10 * 60),
                            )
                            .expect("10min < 1hour"),
                        );
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1326,13 +1326,7 @@ impl Timeline {
                .max()
        };

-        let (partition_mode, partition_lsn) = if cfg!(test)
-            || cfg!(feature = "testing")
-            || self
-                .feature_resolver
-                .evaluate_boolean("image-compaction-boundary")
-                .is_ok()
-        {
+        let (partition_mode, partition_lsn) = {
            let last_repartition_lsn = self.partitioning.read().1;
            let lsn = match l0_l1_boundary_lsn {
                Some(boundary) => gc_cutoff
@@ -1348,8 +1342,6 @@ impl Timeline {
            } else {
                ("l0_l1_boundary", lsn)
            }
-        } else {
-            ("latest_record", self.get_last_record_lsn())
        };

        // 2. Repartition and create image layers if necessary
--- a/pageserver/src/tenant/timeline/handle.rs
+++ b/pageserver/src/tenant/timeline/handle.rs
@@ -362,7 +362,7 @@ impl<T: Types> Cache<T> {
                        tokio::time::sleep(RETRY_BACKOFF).await;
                        continue;
                    } else {
-                        tracing::warn!(
+                        tracing::info!(
                            "Failed to resolve tenant shard after {} attempts: {:?}",
                            GET_MAX_RETRIES,
                            e
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -1275,8 +1275,8 @@ mod tests {
    use std::sync::Arc;

    use owned_buffers_io::io_buf_ext::IoBufExt;
+    use rand::Rng;
    use rand::seq::SliceRandom;
-    use rand::{Rng, thread_rng};

    use super::*;
    use crate::context::DownloadBehavior;
@@ -1358,7 +1358,7 @@ mod tests {

        // Check that all the other FDs still work too. Use them in random order for
        // good measure.
-        file_b_dupes.as_mut_slice().shuffle(&mut thread_rng());
+        file_b_dupes.as_mut_slice().shuffle(&mut rand::rng());
        for vfile in file_b_dupes.iter_mut() {
            assert_first_512_eq(vfile, b"content_b").await;
        }
@@ -1413,9 +1413,8 @@ mod tests {
            let ctx = ctx.detached_child(TaskKind::UnitTest, DownloadBehavior::Error);
            let hdl = rt.spawn(async move {
                let mut buf = IoBufferMut::with_capacity_zeroed(SIZE);
-                let mut rng = rand::rngs::OsRng;
                for _ in 1..1000 {
-                    let f = &files[rng.gen_range(0..files.len())];
+                    let f = &files[rand::rng().random_range(0..files.len())];
                    buf = f
                        .read_exact_at(buf.slice_full(), 0, &ctx)
                        .await
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -5,6 +5,7 @@ MODULE_big = neon
 OBJS = \
 	$(WIN32RES) \
 	communicator.o \
+	communicator_process.o \
 	extension_server.o \
 	file_cache.o \
 	hll.o \
@@ -29,6 +30,11 @@ PG_CPPFLAGS = -I$(libpq_srcdir)
 SHLIB_LINK_INTERNAL = $(libpq)
 SHLIB_LINK = -lcurl

+UNAME_S := $(shell uname -s)
+ifeq ($(UNAME_S), Darwin)
+    SHLIB_LINK += -framework Security -framework CoreFoundation -framework SystemConfiguration
+endif
+
 EXTENSION = neon
 DATA = \
 	neon--1.0.sql \
@@ -57,7 +63,8 @@ WALPROP_OBJS = \

 # libcommunicator.a is built by cargo from the Rust sources under communicator/
 # subdirectory. `cargo build` also generates communicator_bindings.h.
-neon.o: communicator/communicator_bindings.h
+communicator_process.o: communicator/communicator_bindings.h
+file_cache.o: communicator/communicator_bindings.h

 $(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h &:
 	(cd $(srcdir)/communicator && cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE))
--- a/pgxn/neon/communicator.c
+++ b/pgxn/neon/communicator.c
@@ -1820,12 +1820,12 @@ nm_to_string(NeonMessage *msg)
 			}
 		case T_NeonGetPageResponse:
 			{
-#if 0
 				NeonGetPageResponse *msg_resp = (NeonGetPageResponse *) msg;
-#endif

 				appendStringInfoString(&s, "{\"type\": \"NeonGetPageResponse\"");
-				appendStringInfo(&s, ", \"page\": \"XXX\"}");
+				appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_resp->req.rinfo));
+				appendStringInfo(&s, ", \"forknum\": %d", msg_resp->req.forknum);
+				appendStringInfo(&s, ", \"blkno\": %u", msg_resp->req.blkno);
 				appendStringInfoChar(&s, '}');
 				break;
 			}
--- a/pgxn/neon/communicator/Cargo.toml
+++ b/pgxn/neon/communicator/Cargo.toml
@@ -16,7 +16,14 @@ testing = []
 rest_broker = []

 [dependencies]
-neon-shmem.workspace = true
+axum.workspace = true
+http.workspace = true
+tokio = { workspace = true, features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] }
+tracing.workspace = true
+tracing-subscriber.workspace = true
+
+measured.workspace = true
+utils.workspace = true
 workspace_hack = { version = "0.1", path = "../../../workspace_hack" }

 [build-dependencies]
--- a/pgxn/neon/communicator/README.md
+++ b/pgxn/neon/communicator/README.md
@@ -1,7 +1,22 @@
-This package will evolve into a "compute-pageserver communicator"
-process and machinery. For now, it's just a dummy that doesn't do
-anything interesting, but it allows us to test the compilation and
-linking of Rust code into the Postgres extensions.
+# Communicator
+
+This package provides the so-called "compute-pageserver communicator",
+or just "communicator" for short. The communicator is a separate
+background worker process that runs in the PostgreSQL server. It's
+part of the neon extension. Currently, it only provides an HTTP
+endpoint for metrics, but in the future it will evolve to handle all
+communications with the pageservers.
+
+## Source code view
+
+pgxn/neon/communicator_process.c
+    Contains code needed to start up the communicator process, and
+    the glue that interacts with PostgreSQL code and the Rust
+    code in the communicator process.
+
+
+pgxn/neon/communicator/src/worker_process/
+    Worker process main loop and glue code

 At compilation time, pgxn/neon/communicator/ produces a static
 library, libcommunicator.a. It is linked to the neon.so extension
--- a/pgxn/neon/communicator/src/lib.rs
+++ b/pgxn/neon/communicator/src/lib.rs
@@ -1,6 +1,5 @@
-/// dummy function, just to test linking Rust functions into the C
-/// extension
-#[unsafe(no_mangle)]
-pub extern "C" fn communicator_dummy(arg: u32) -> u32 {
-    arg + 1
-}
+mod worker_process;
+
+/// Name of the Unix Domain Socket that serves the metrics, and other APIs in the
+/// future. This is within the Postgres data directory.
+const NEON_COMMUNICATOR_SOCKET_NAME: &str = "neon-communicator.socket";
--- a/pgxn/neon/communicator/src/worker_process/callbacks.rs
+++ b/pgxn/neon/communicator/src/worker_process/callbacks.rs
@@ -0,0 +1,51 @@
+//! C callbacks to PostgreSQL facilities that the neon extension needs to provide. These
+//! are implemented in `pgxn/neon/communicator_process.c`. The function signatures must
+//! match!
+//!
+//! These are called from the communicator threads! Be careful what you do: most Postgres
+//! functions are not safe to call in that context.
+
+#[cfg(not(test))]
+unsafe extern "C" {
+    pub fn callback_set_my_latch_unsafe();
+    pub fn callback_get_lfc_metrics_unsafe() -> LfcMetrics;
+}
+
+// Compile unit tests with dummy versions of the functions. Unit tests cannot call back
+// into the C code. (As of this writing, no unit tests even exist in the communicator
+// package, but the code coverage build still builds these and tries to link with the
+// external C code.)
+#[cfg(test)]
+unsafe fn callback_set_my_latch_unsafe() {
+    panic!("not usable in unit tests");
+}
+#[cfg(test)]
+unsafe fn callback_get_lfc_metrics_unsafe() -> LfcMetrics {
+    panic!("not usable in unit tests");
+}
+
+// Safe wrappers around the unsafe callbacks declared above.
+
+pub(super) fn callback_set_my_latch() {
+    unsafe { callback_set_my_latch_unsafe() };
+}
+
+pub(super) fn callback_get_lfc_metrics() -> LfcMetrics {
+    unsafe { callback_get_lfc_metrics_unsafe() }
+}
+
+/// Return type of `callback_get_lfc_metrics()`; `#[repr(C)]` as it is returned by the C callback.
+#[repr(C)]
+pub struct LfcMetrics {
+    pub lfc_cache_size_limit: i64,
+    pub lfc_hits: i64,
+    pub lfc_misses: i64,
+    pub lfc_used: i64,
+    pub lfc_writes: i64,
+
+    // Working set size looking back 1..=60 minutes.
+    //
+    // Index 0 is the size of the working set accessed within the last minute,
+    // index 59 is the size of the working set accessed within the last 60 minutes.
+    pub lfc_approximate_working_set_size_windows: [i64; 60],
+}
--- a/pgxn/neon/communicator/src/worker_process/control_socket.rs
+++ b/pgxn/neon/communicator/src/worker_process/control_socket.rs
@@ -0,0 +1,102 @@
+//! Communicator control socket.
+//!
+//! Currently, the control socket is used to provide information about the communicator
+//! process, file cache etc. as prometheus metrics. In the future, it can be used to
+//! expose more things.
+//!
+//! The exporter speaks HTTP, listens on a Unix Domain Socket under the Postgres
+//! data directory. For debugging, you can access it with curl:
+//!
+//! ```sh
+//! curl --unix-socket neon-communicator.socket http://localhost/metrics
+//! ```
+//!
+use axum::Router;
+use axum::body::Body;
+use axum::extract::State;
+use axum::response::Response;
+use http::StatusCode;
+use http::header::CONTENT_TYPE;
+
+use measured::MetricGroup;
+use measured::text::BufferedTextEncoder;
+
+use std::io::ErrorKind;
+
+use tokio::net::UnixListener;
+
+use crate::NEON_COMMUNICATOR_SOCKET_NAME;
+use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
+
+impl CommunicatorWorkerProcessStruct {
+    /// Launch the listener
+    pub(crate) async fn launch_control_socket_listener(
+        &'static self,
+    ) -> Result<(), std::io::Error> {
+        use axum::routing::get;
+        let app = Router::new()
+            .route("/metrics", get(get_metrics))
+            .route("/autoscaling_metrics", get(get_autoscaling_metrics))
+            .route("/debug/panic", get(handle_debug_panic))
+            .with_state(self);
+
+        // If the server is restarted, there might be an old socket still
+        // lying around. Remove it first.
+        match std::fs::remove_file(NEON_COMMUNICATOR_SOCKET_NAME) {
+            Ok(()) => {
+                tracing::warn!("removed stale control socket");
+            }
+            Err(e) if e.kind() == ErrorKind::NotFound => {}
+            Err(e) => {
+                tracing::error!("could not remove stale control socket: {e:#}");
+                // Try to proceed anyway. It will likely fail below though.
+            }
+        };
+
+        // Create the unix domain socket and start listening on it
+        let listener = UnixListener::bind(NEON_COMMUNICATOR_SOCKET_NAME)?;
+
+        tokio::spawn(async {
+            tracing::info!("control socket listener spawned");
+            axum::serve(listener, app)
+                .await
+                .expect("axum::serve never returns")
+        });
+
+        Ok(())
+    }
+}
+
+/// Expose all Prometheus metrics.
+async fn get_metrics(State(state): State<&CommunicatorWorkerProcessStruct>) -> Response {
+    tracing::trace!("/metrics requested");
+    metrics_to_response(&state).await
+}
+
+/// Expose Prometheus metrics, for use by the autoscaling agent.
+///
+/// This is a subset of all the metrics: only the LFC metrics group is encoded.
+async fn get_autoscaling_metrics(
+    State(state): State<&CommunicatorWorkerProcessStruct>,
+) -> Response {
+    tracing::trace!("/autoscaling_metrics requested");
+    metrics_to_response(&state.lfc_metrics).await
+}
+
+async fn handle_debug_panic(State(_state): State<&CommunicatorWorkerProcessStruct>) -> Response {
+    panic!("test HTTP handler task panic");
+}
+
+/// Helper function to convert prometheus metrics to a text response
+async fn metrics_to_response(metrics: &(dyn MetricGroup<BufferedTextEncoder> + Sync)) -> Response {
+    let mut enc = BufferedTextEncoder::new();
+    metrics
+        .collect_group_into(&mut enc)
+        .unwrap_or_else(|never| match never {});
+
+    Response::builder()
+        .status(StatusCode::OK)
+        .header(CONTENT_TYPE, "application/text")
+        .body(Body::from(enc.finish()))
+        .unwrap()
+}
--- a/pgxn/neon/communicator/src/worker_process/lfc_metrics.rs
+++ b/pgxn/neon/communicator/src/worker_process/lfc_metrics.rs
@@ -0,0 +1,83 @@
+use measured::{
+    FixedCardinalityLabel, Gauge, GaugeVec, LabelGroup, MetricGroup,
+    label::{LabelName, LabelValue, StaticLabelSet},
+    metric::{MetricEncoding, gauge::GaugeState, group::Encoding},
+};
+
+use super::callbacks::callback_get_lfc_metrics;
+
+pub(crate) struct LfcMetricsCollector;
+
+#[derive(MetricGroup)]
+#[metric(new())]
+struct LfcMetricsGroup {
+    /// LFC cache size limit in bytes
+    lfc_cache_size_limit: Gauge,
+    /// LFC cache hits
+    lfc_hits: Gauge,
+    /// LFC cache misses
+    lfc_misses: Gauge,
+    /// LFC chunks used (chunk = 1MB)
+    lfc_used: Gauge,
+    /// LFC cache writes
+    lfc_writes: Gauge,
+    /// Approximate working set size in pages of 8192 bytes
+    #[metric(init = GaugeVec::dense())]
+    lfc_approximate_working_set_size_windows: GaugeVec<StaticLabelSet<MinuteAsSeconds>>,
+}
+
+impl<T: Encoding> MetricGroup<T> for LfcMetricsCollector
+where
+    GaugeState: MetricEncoding<T>,
+{
+    fn collect_group_into(&self, enc: &mut T) -> Result<(), <T as Encoding>::Err> {
+        let g = LfcMetricsGroup::new();
+
+        let lfc_metrics = callback_get_lfc_metrics();
+
+        g.lfc_cache_size_limit.set(lfc_metrics.lfc_cache_size_limit);
+        g.lfc_hits.set(lfc_metrics.lfc_hits);
+        g.lfc_misses.set(lfc_metrics.lfc_misses);
+        g.lfc_used.set(lfc_metrics.lfc_used);
+        g.lfc_writes.set(lfc_metrics.lfc_writes);
+
+        for i in 0..60 {
+            let val = lfc_metrics.lfc_approximate_working_set_size_windows[i];
+            g.lfc_approximate_working_set_size_windows
+                .set(MinuteAsSeconds(i), val);
+        }
+
+        g.collect_group_into(enc)
+    }
+}
+
+/// This stores the values in range 0..60,
+/// encodes them as seconds (60, 120, 180, ..., 3600)
+#[derive(Clone, Copy)]
+struct MinuteAsSeconds(usize);
+
+impl FixedCardinalityLabel for MinuteAsSeconds {
+    fn cardinality() -> usize {
+        60
+    }
+
+    fn encode(&self) -> usize {
+        self.0
+    }
+
+    fn decode(value: usize) -> Self {
+        Self(value)
+    }
+}
+
+impl LabelValue for MinuteAsSeconds {
+    fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
+        v.write_int((self.0 + 1) as i64 * 60)
+    }
+}
+
+impl LabelGroup for MinuteAsSeconds {
+    fn visit_values(&self, v: &mut impl measured::label::LabelGroupVisitor) {
+        v.write_value(LabelName::from_str("duration_seconds"), self);
+    }
+}
--- a/pgxn/neon/communicator/src/worker_process/logging.rs
+++ b/pgxn/neon/communicator/src/worker_process/logging.rs
@@ -0,0 +1,250 @@
+//! Glue code to hook up Rust logging with the `tracing` crate to the PostgreSQL log
+//!
+//! In the Rust threads, the log messages are written to a mpsc Channel, and the Postgres
+//! process latch is raised. That wakes up the loop in the main thread, see
+//! `communicator_new_bgworker_main()`. It reads the message from the channel and
+//! ereport()s it. This ensures that only one thread, the main thread, calls the
+//! PostgreSQL logging routines at any time.
+
+use std::ffi::c_char;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::mpsc::sync_channel;
+use std::sync::mpsc::{Receiver, SyncSender};
+use std::sync::mpsc::{TryRecvError, TrySendError};
+
+use tracing::info;
+use tracing::{Event, Level, Metadata, Subscriber};
+use tracing_subscriber::filter::LevelFilter;
+use tracing_subscriber::fmt::format::Writer;
+use tracing_subscriber::fmt::{FmtContext, FormatEvent, FormatFields, FormattedFields, MakeWriter};
+use tracing_subscriber::registry::LookupSpan;
+
+use crate::worker_process::callbacks::callback_set_my_latch;
+
+/// This handle is passed to the C code, and used by [`communicator_worker_poll_logging`]
+pub struct LoggingReceiver {
+    receiver: Receiver<FormattedEventWithMeta>,
+}
+
+/// This is passed to `tracing`
+struct LoggingSender {
+    sender: SyncSender<FormattedEventWithMeta>,
+}
+
+static DROPPED_EVENT_COUNT: AtomicU64 = AtomicU64::new(0);
+
+/// Called once, at worker process startup. The returned LoggingReceiver is passed back
+/// in the subsequent calls to `communicator_worker_poll_logging`. It is opaque to the C code.
+#[unsafe(no_mangle)]
+pub extern "C" fn communicator_worker_configure_logging() -> Box<LoggingReceiver> {
+    let (sender, receiver) = sync_channel(1000);
+
+    let receiver = LoggingReceiver { receiver };
+    let sender = LoggingSender { sender };
+
+    use tracing_subscriber::prelude::*;
+    let r = tracing_subscriber::registry();
+
+    let r = r.with(
+        tracing_subscriber::fmt::layer()
+            .with_ansi(false)
+            .event_format(SimpleFormatter)
+            .with_writer(sender)
+            // TODO: derive this from log_min_messages? Currently the code in
+            // communicator_process.c forces log_min_messages='INFO'.
+            .with_filter(LevelFilter::from_level(Level::INFO)),
+    );
+    r.init();
+
+    info!("communicator process logging started");
+
+    Box::new(receiver)
+}
+
+/// Read one message from the logging queue. This is essentially a wrapper to Receiver,
+/// with a C-friendly signature.
+///
+/// The message is copied into *errbuf, which is a caller-supplied buffer of size
+/// `errbuf_len`. If the message doesn't fit in the buffer, it is truncated. It is always
+/// NUL-terminated, unless `errbuf` is NULL or `errbuf_len` is 0: then nothing is written,
+/// but the message is still consumed from the queue.
+///
+/// The error level is returned in *elevel_p. It's one of the PostgreSQL error levels,
+/// see elog.h.
+///
+/// If there was a message, *dropped_event_count_p is also updated with a counter of how
+/// many log messages in total have been dropped. By comparing that with the value from
+/// the previous call, you can tell how many were dropped since the last call.
+///
+/// Returns:
+///   0 if there were no messages
+///   1 if there was a message. The message and its level are returned in
+///     *errbuf and *elevel_p. *dropped_event_count_p is also updated.
+///  -1 on error, i.e. the other end of the queue was disconnected
+#[unsafe(no_mangle)]
+pub extern "C" fn communicator_worker_poll_logging(
+    state: &mut LoggingReceiver,
+    errbuf: *mut c_char,
+    errbuf_len: u32,
+    elevel_p: &mut i32,
+    dropped_event_count_p: &mut u64,
+) -> i32 {
+    let msg = match state.receiver.try_recv() {
+        Err(TryRecvError::Empty) => return 0,
+        Err(TryRecvError::Disconnected) => return -1,
+        Ok(msg) => msg,
+    };
+
+    let src: &[u8] = &msg.message;
+    let dst: *mut u8 = errbuf.cast();
+    // `errbuf_len - 1` would wrap around for a zero-length buffer, so saturate and guard.
+    let len = std::cmp::min(src.len(), (errbuf_len as usize).saturating_sub(1));
+    if !dst.is_null() && errbuf_len > 0 {
+        // SAFETY: the caller provides `errbuf_len` writable bytes at `dst`; len < errbuf_len.
+        unsafe { std::ptr::copy_nonoverlapping(src.as_ptr(), dst, len) };
+        unsafe { *(dst.add(len)) = b'\0' }; // NUL terminator
+    }
+
+    // Map the tracing Level to PostgreSQL elevel.
+    // XXX: These levels are copied from PostgreSQL's elog.h. Introduce another enum to
+    // hide these?
+    *elevel_p = match msg.level {
+        Level::TRACE => 10, // DEBUG5
+        Level::DEBUG => 14, // DEBUG1
+        Level::INFO => 17,  // INFO
+        Level::WARN => 19,  // WARNING
+        Level::ERROR => 21, // ERROR
+    };
+
+    *dropped_event_count_p = DROPPED_EVENT_COUNT.load(Ordering::Relaxed);
+    1
+}
+
+//---- The following functions can be called from any thread ----
+
+#[derive(Clone)]
+struct FormattedEventWithMeta {
+    message: Vec<u8>,
+    level: tracing::Level,
+}
+
+impl Default for FormattedEventWithMeta {
+    fn default() -> Self {
+        FormattedEventWithMeta {
+            message: Vec::new(),
+            level: tracing::Level::DEBUG,
+        }
+    }
+}
+
+struct EventBuilder<'a> {
+    event: FormattedEventWithMeta,
+
+    sender: &'a LoggingSender,
+}
+
+impl std::io::Write for EventBuilder<'_> {
+    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+        self.event.message.write(buf)
+    }
+    fn flush(&mut self) -> std::io::Result<()> {
+        self.sender.send_event(self.event.clone());
+        Ok(())
+    }
+}
+
+impl Drop for EventBuilder<'_> {
+    fn drop(&mut self) {
+        let sender = self.sender;
+        let event = std::mem::take(&mut self.event);
+
+        sender.send_event(event);
+    }
+}
+
+impl<'a> MakeWriter<'a> for LoggingSender {
+    type Writer = EventBuilder<'a>;
+
+    fn make_writer(&'a self) -> Self::Writer {
+        panic!("not expected to be called when make_writer_for is implemented");
+    }
+
+    fn make_writer_for(&'a self, meta: &Metadata<'_>) -> Self::Writer {
+        EventBuilder {
+            event: FormattedEventWithMeta {
+                message: Vec::new(),
+                level: *meta.level(),
+            },
+            sender: self,
+        }
+    }
+}
+
+impl LoggingSender {
+    fn send_event(&self, e: FormattedEventWithMeta) {
+        match self.sender.try_send(e) {
+            Ok(()) => {
+                // wake up the main thread, so that it polls the queue
+                callback_set_my_latch();
+            }
+            Err(TrySendError::Disconnected(_)) => {}
+            Err(TrySendError::Full(_)) => {
+                // The queue is full, cannot send any more. To avoid blocking the tokio
+                // thread, simply drop the message. Better to lose some logs than get
+                // stuck if there's a problem with the logging.
+                //
+                // Record the fact that a message was dropped by incrementing the
+                // counter.
+                DROPPED_EVENT_COUNT.fetch_add(1, Ordering::Relaxed);
+            }
+        }
+    }
+}
+
+/// Simple formatter implementation for tracing_subscriber, which prints the log spans and
+/// message part like the default formatter, but no timestamp or error level. The error
+/// level is captured separately by `FormattedEventWithMeta`, and when the error is
+/// printed by the main thread, with PostgreSQL ereport(), it gets a timestamp at that
+/// point. (The timestamp printed will therefore lag behind the timestamp on the event
+/// here, if the main thread doesn't process the log message promptly)
+struct SimpleFormatter;
+
+impl<S, N> FormatEvent<S, N> for SimpleFormatter
+where
+    S: Subscriber + for<'a> LookupSpan<'a>,
+    N: for<'a> FormatFields<'a> + 'static,
+{
+    fn format_event(
+        &self,
+        ctx: &FmtContext<'_, S, N>,
+        mut writer: Writer<'_>,
+        event: &Event<'_>,
+    ) -> std::fmt::Result {
+        // Format all the spans in the event's span context.
+        if let Some(scope) = ctx.event_scope() {
+            for span in scope.from_root() {
+                write!(writer, "{}", span.name())?;
+
+                // `FormattedFields` is a formatted representation of the span's fields,
+                // which is stored in its extensions by the `fmt` layer's `new_span`
+                // method. The fields will have been formatted by the same field formatter
+                // that's provided to the event formatter in the `FmtContext`.
+                let ext = span.extensions();
+                let fields = &ext
+                    .get::<FormattedFields<N>>()
+                    .expect("will never be `None`");
+
+                // Skip formatting the fields if the span had no fields.
+                if !fields.is_empty() {
+                    write!(writer, "{{{fields}}}")?;
+                }
+                write!(writer, ": ")?;
+            }
+        }
+
+        // Write fields on the event
+        ctx.field_format().format_fields(writer.by_ref(), event)?;
+
+        Ok(())
+    }
+}
--- a/pgxn/neon/communicator/src/worker_process/main_loop.rs
+++ b/pgxn/neon/communicator/src/worker_process/main_loop.rs
@@ -0,0 +1,66 @@
+use std::str::FromStr as _;
+
+use crate::worker_process::lfc_metrics::LfcMetricsCollector;
+
+use measured::MetricGroup;
+use measured::metric::MetricEncoding;
+use measured::metric::gauge::GaugeState;
+use measured::metric::group::Encoding;
+use utils::id::{TenantId, TimelineId};
+
+pub struct CommunicatorWorkerProcessStruct {
+    runtime: tokio::runtime::Runtime,
+
+    /*** Metrics ***/
+    pub(crate) lfc_metrics: LfcMetricsCollector,
+}
+
+/// Launch the communicator process's Rust subsystems: tokio runtime and control socket.
+pub(super) fn init(
+    tenant_id: Option<&str>,
+    timeline_id: Option<&str>,
+) -> Result<&'static CommunicatorWorkerProcessStruct, String> {
+    // The caller validated these already
+    let _tenant_id = tenant_id
+        .map(TenantId::from_str)
+        .transpose()
+        .map_err(|e| format!("invalid tenant ID: {e}"))?;
+    let _timeline_id = timeline_id
+        .map(TimelineId::from_str)
+        .transpose()
+        .map_err(|e| format!("invalid timeline ID: {e}"))?;
+
+    let runtime = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .thread_name("communicator thread")
+        .build()
+        .map_err(|e| format!("could not create tokio runtime: {e}"))?;
+
+    let worker_struct = CommunicatorWorkerProcessStruct {
+        // Note: it's important to not drop the runtime, or all the tasks are dropped
+        // too. Including it in the returned struct is one way to keep it around.
+        runtime,
+
+        // metrics
+        lfc_metrics: LfcMetricsCollector,
+    };
+    let worker_struct = Box::leak(Box::new(worker_struct));
+
+    // Start the listener on the control socket
+    worker_struct
+        .runtime
+        .block_on(worker_struct.launch_control_socket_listener())
+        .map_err(|e| format!("could not launch control socket listener: {e}"))?;
+
+    Ok(worker_struct)
+}
+
+impl<T> MetricGroup<T> for CommunicatorWorkerProcessStruct
+where
+    T: Encoding,
+    GaugeState: MetricEncoding<T>,
+{
+    fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
+        self.lfc_metrics.collect_group_into(enc)
+    }
+}
--- a/pgxn/neon/communicator/src/worker_process/mod.rs
+++ b/pgxn/neon/communicator/src/worker_process/mod.rs
@@ -0,0 +1,13 @@
+//! This code runs in the communicator worker process. This provides
+//! the glue code to:
+//!
+//! - launch the main loop,
+//! - receive IO requests from backends and process them,
+//! - write results back to backends.
+
+mod callbacks;
+mod control_socket;
+mod lfc_metrics;
+mod logging;
+mod main_loop;
+mod worker_interface;
--- a/pgxn/neon/communicator/src/worker_process/worker_interface.rs
+++ b/pgxn/neon/communicator/src/worker_process/worker_interface.rs
@@ -0,0 +1,60 @@
+//! Functions called from the C code in the worker process
+
+use std::ffi::{CStr, CString, c_char};
+
+use crate::worker_process::main_loop;
+use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
+
+/// Launch the communicator's tokio tasks, which do most of the work.
+///
+/// The caller has initialized the process as a regular PostgreSQL background worker
+/// process.
+///
+/// Inputs:
+///   `tenant_id` and `timeline_id` can be NULL, if we've been launched in "non-Neon" mode,
+///   where we use local storage instead of connecting to remote neon storage. That's
+///   currently only used in some unit tests.
+///
+/// Result:
+///   Returns pointer to CommunicatorWorkerProcessStruct, which is a handle to running
+///   Rust tasks. The C code can use it to interact with the Rust parts. On failure, returns
+///   None/NULL, and an error message is returned in *error_p
+///
+/// This is called only once in the process, so the returned struct, and error message in
+/// case of failure, are simply leaked.
+#[unsafe(no_mangle)]
+pub extern "C" fn communicator_worker_launch(
+    tenant_id: *const c_char,
+    timeline_id: *const c_char,
+    error_p: *mut *const c_char,
+) -> Option<&'static CommunicatorWorkerProcessStruct> {
+    // Convert the arguments into more convenient Rust types
+    let tenant_id = if tenant_id.is_null() {
+        None
+    } else {
+        let cstr = unsafe { CStr::from_ptr(tenant_id) };
+        Some(cstr.to_str().expect("assume UTF-8"))
+    };
+    let timeline_id = if timeline_id.is_null() {
+        None
+    } else {
+        let cstr = unsafe { CStr::from_ptr(timeline_id) };
+        Some(cstr.to_str().expect("assume UTF-8"))
+    };
+
+    // The `init` function does all the work.
+    let result = main_loop::init(tenant_id, timeline_id);
+
+    // On failure, return the error message to the C caller in *error_p.
+    match result {
+        Ok(worker_struct) => Some(worker_struct),
+        Err(errmsg) => {
+            let errmsg = CString::new(errmsg).expect("no nuls within error message");
+            let errmsg = Box::leak(errmsg.into_boxed_c_str());
+            let p: *const c_char = errmsg.as_ptr();
+
+            unsafe { *error_p = p };
+            None
+        }
+    }
+}
--- a/pgxn/neon/communicator_process.c
+++ b/pgxn/neon/communicator_process.c
@@ -0,0 +1,273 @@
+/*-------------------------------------------------------------------------
+ *
+ * communicator_process.c
+ *	  Functions for starting up the communicator background worker process.
+ *
+ * Currently, the communicator process only functions as a metrics
+ * exporter. It provides an HTTP endpoint for polling a limited set of
+ * metrics. TODO: In the future, it will do much more, i.e. handle all
+ * the communications with the pageservers.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "miscadmin.h"
+#include "postmaster/bgworker.h"
+#include "postmaster/interrupt.h"
+#include "postmaster/postmaster.h"
+#include "replication/walsender.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/pmsignal.h"
+#include "storage/procsignal.h"
+#include "tcop/tcopprot.h"
+#include "utils/timestamp.h"
+
+#include "communicator_process.h"
+#include "file_cache.h"
+#include "neon.h"
+#include "neon_perf_counters.h"
+
+/* the rust bindings, generated by cbindgen */
+#include "communicator/communicator_bindings.h"
+
+static void pump_logging(struct LoggingReceiver *logging);
+PGDLLEXPORT void communicator_new_bgworker_main(Datum main_arg);
+
+/**** Initialization functions. These run in postmaster ****/
+
+void
+pg_init_communicator_process(void)
+{
+	BackgroundWorker bgw;
+
+	/* Initialize the background worker process */
+	memset(&bgw, 0, sizeof(bgw));
+	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
+	bgw.bgw_start_time = BgWorkerStart_PostmasterStart;
+	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
+	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "communicator_new_bgworker_main");
+	snprintf(bgw.bgw_name, BGW_MAXLEN, "Storage communicator process");
+	snprintf(bgw.bgw_type, BGW_MAXLEN, "Storage communicator process");
+	bgw.bgw_restart_time = 5;
+	bgw.bgw_notify_pid = 0;
+	bgw.bgw_main_arg = (Datum) 0;
+
+	RegisterBackgroundWorker(&bgw);
+}
+
+/**** Worker process functions. These run in the communicator worker process ****/
+
+/*
+ * Entry point for the communicator bgworker process
+ */
+void
+communicator_new_bgworker_main(Datum main_arg)
+{
+	struct LoggingReceiver *logging;
+	const char *errmsg = NULL;
+	const struct CommunicatorWorkerProcessStruct *proc_handle;
+
+	/*
+	 * Pretend that this process is a WAL sender. That affects the shutdown
+	 * sequence: WAL senders are shut down last, after the final checkpoint
+	 * has been written. That's what we want for the communicator process too.
+	 */
+	am_walsender = true;
+	MarkPostmasterChildWalSender();
+
+	/* Establish signal handlers. */
+	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
+	/*
+	 * Postmaster sends us SIGUSR2 when all regular backends and bgworkers
+	 * have exited, and it's time for us to exit too
+	 */
+	pqsignal(SIGUSR2, die);
+	pqsignal(SIGHUP, SignalHandlerForConfigReload);
+	pqsignal(SIGTERM, die);
+
+	BackgroundWorkerUnblockSignals();
+
+	/*
+	 * By default, INFO messages are not printed to the log. We want
+	 * `tracing::info!` messages emitted from the communicator to be printed,
+	 * however, so increase the log level.
+	 *
+	 * XXX: This overrides any user-set value from the config file. That's not
+	 * great, but on the other hand, there should be little reason for user to
+	 * control the verbosity of the communicator. It's not too verbose by
+	 * default.
+	 */
+	SetConfigOption("log_min_messages", "INFO", PGC_SUSET, PGC_S_OVERRIDE);
+
+	logging = communicator_worker_configure_logging();
+
+	proc_handle = communicator_worker_launch(
+		neon_tenant[0] == '\0' ? NULL : neon_tenant,
+		neon_timeline[0] == '\0' ? NULL : neon_timeline,
+		&errmsg
+		);
+	if (proc_handle == NULL)
+	{
+		/*
+		 * Something went wrong. Before exiting, forward any log messages that
+		 * might've been generated during the failed launch.
+		 */
+		pump_logging(logging);
+
+		elog(PANIC, "%s", errmsg);
+	}
+
+	/*
+	 * The Rust tokio runtime has been launched, and it's running in the
+	 * background now. This loop in the main thread handles any interactions
+	 * we need with the rest of PostgreSQL.
+	 *
+	 * NB: This process is now multi-threaded! The Rust threads do not call
+	 * into any Postgres functions, but it's not entirely clear which Postgres
+	 * functions are safe to call from this main thread either. Be very
+	 * careful about adding anything non-trivial here.
+	 *
+	 * Also note that we try to react quickly to any log messages arriving
+	 * from the Rust thread. Be careful to not do anything too expensive here
+	 * that might cause delays.
+	 */
+	elog(LOG, "communicator threads started");
+	for (;;)
+	{
+		TimestampTz before;
+		long		duration;
+
+		ResetLatch(MyLatch);
+
+		/*
+		 * Forward any log messages from the Rust threads into the normal
+		 * Postgres logging facility.
+		 */
+		pump_logging(logging);
+
+		/*
+		 * Check interrupts like system shutdown or config reload
+		 *
+		 * We mustn't block for too long within this loop, or we risk the log
+		 * queue to fill up and messages to be lost. Also, even if we can keep
+		 * up, if there's a long delay between sending a message and printing
+		 * it to the log, the timestamps on the messages get skewed, which is
+		 * confusing.
+		 *
+		 * We expect processing interrupts to happen fast enough that it's OK,
+		 * but measure it just in case, and print a warning if it takes longer
+		 * than 100 ms.
+		 */
+#define LOG_SKEW_WARNING_MS			100
+		before = GetCurrentTimestamp();
+
+		CHECK_FOR_INTERRUPTS();
+		if (ConfigReloadPending)
+		{
+			ConfigReloadPending = false;
+			ProcessConfigFile(PGC_SIGHUP);
+		}
+
+		duration = TimestampDifferenceMilliseconds(before, GetCurrentTimestamp());
+		if (duration > LOG_SKEW_WARNING_MS)
+			elog(WARNING, "handling interrupts took %ld ms, communicator log timestamps might be skewed", duration);
+
+		/*
+		 * Wait until we are woken up. The rust threads will set the latch
+		 * when there's a log message to forward.
+		 */
+		(void) WaitLatch(MyLatch,
+						 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
+						 0,
+						 PG_WAIT_EXTENSION);
+	}
+}
+
+static void
+pump_logging(struct LoggingReceiver *logging)
+{
+	char		errbuf[1000];
+	int			elevel;
+	int32		rc;
+	static uint64_t last_dropped_event_count = 0;
+	uint64_t		dropped_event_count = last_dropped_event_count;
+	uint64_t		dropped_now;
+
+	for (;;)
+	{
+		rc = communicator_worker_poll_logging(logging,
+											  errbuf,
+											  sizeof(errbuf),
+											  &elevel,
+											  &dropped_event_count);
+		if (rc == 0)
+		{
+			/* nothing to do */
+			break;
+		}
+		else if (rc == 1)
+		{
+			/* Because we don't want to exit on error */
+
+			if (message_level_is_interesting(elevel))
+			{
+				/*
+				 * Prevent interrupts while cleaning up.
+				 *
+				 * (Not sure if this is required, but all the error handlers
+				 * in Postgres that are installed as sigsetjmp() targets do
+				 * this, so let's follow the example)
+				 */
+				HOLD_INTERRUPTS();
+
+				errstart(elevel, TEXTDOMAIN);
+				errmsg_internal("[COMMUNICATOR] %s", errbuf);
+				EmitErrorReport();
+				FlushErrorState();
+
+				/* Now we can allow interrupts again */
+				RESUME_INTERRUPTS();
+			}
+		}
+		else if (rc == -1)
+		{
+			elog(ERROR, "logging channel was closed unexpectedly");
+		}
+	}
+
+	/*
+	 * If messages were dropped since the last report, report how many. This
+	 * is done outside the loop, so that a clogged logging system isn't
+	 * exacerbated by lots of warnings. 'dropped_event_count' keeps its initial
+	 * value if no message was received above, making 'dropped_now' zero.
+	 */
+	dropped_now = dropped_event_count - last_dropped_event_count;
+	if (dropped_now != 0)
+	{
+		elog(WARNING, "%lu communicator log messages were dropped because the log buffer was full",
+			 (unsigned long) dropped_now);
+		last_dropped_event_count = dropped_event_count;
+	}
+}
+
+/****
+ * Callbacks from the rust code, in the communicator process.
+ *
+ * NOTE: These must be thread-safe! It's very limited which PostgreSQL
+ * functions you can use!!!
+ *
+ * The signatures of these need to match those in the Rust code.
+ */
+
+void
+callback_set_my_latch_unsafe(void)
+{
+	SetLatch(MyLatch);
+}
--- a/pgxn/neon/communicator_process.h
+++ b/pgxn/neon/communicator_process.h
@@ -0,0 +1,17 @@
+/*-------------------------------------------------------------------------
+ *
+ * communicator_process.h
+ *		Communicator process
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef COMMUNICATOR_PROCESS_H
+#define COMMUNICATOR_PROCESS_H
+
+extern void pg_init_communicator_process(void);
+
+#endif			/* COMMUNICATOR_PROCESS_H */
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -52,6 +52,8 @@
 #include "pagestore_client.h"
 #include "communicator.h"

+#include "communicator/communicator_bindings.h"
+
 #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "LFC: assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)

 /*
@@ -2156,6 +2158,38 @@ lfc_approximate_working_set_size_seconds(time_t duration, bool reset)
 	return dc;
 }

+/*
+ * Get metrics, for the built-in metrics exporter that's part of the communicator
+ * process.
+ *
+ * NB: This is called from a Rust tokio task inside the communicator process.
+ * Acquiring lwlocks, elog(), allocating memory or anything else non-trivial
+ * is strictly prohibited here!
+ */
+struct LfcMetrics
+callback_get_lfc_metrics_unsafe(void)
+{
+	struct LfcMetrics result = {
+		.lfc_cache_size_limit = (int64) lfc_size_limit * 1024 * 1024,
+		.lfc_hits = lfc_ctl ? lfc_ctl->hits : 0,
+		.lfc_misses = lfc_ctl ? lfc_ctl->misses : 0,
+		.lfc_used = lfc_ctl ? lfc_ctl->used : 0,
+		.lfc_writes = lfc_ctl ? lfc_ctl->writes : 0,
+	};
+
+	if (lfc_ctl)
+	{
+		for (int minutes = 1; minutes <= 60; minutes++)
+		{
+			result.lfc_approximate_working_set_size_windows[minutes - 1] =
+				lfc_approximate_working_set_size_seconds(minutes * 60, false);
+		}
+	}
+
+	return result;
+}
+
+
 PG_FUNCTION_INFO_V1(get_local_cache_state);

 Datum
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -178,6 +178,8 @@ static PageServer page_servers[MAX_SHARDS];
 static bool pageserver_flush(shardno_t shard_no);
 static void pageserver_disconnect(shardno_t shard_no);
 static void pageserver_disconnect_shard(shardno_t shard_no);
+// HADRON
+shardno_t get_num_shards(void);

 static bool
 PagestoreShmemIsValid(void)
@@ -286,6 +288,22 @@ AssignPageserverConnstring(const char *newval, void *extra)
 	}
 }

+/* BEGIN_HADRON */
+/**
+ * Return the total number of shards seen in the shard map.
+ */
+shardno_t get_num_shards(void)
+{
+	const ShardMap *shard_map;
+
+	Assert(pagestore_shared);
+	shard_map = &pagestore_shared->shard_map;
+
+	Assert(shard_map != NULL);
+	return shard_map->num_shards;
+}
+/* END_HADRON */
+
 /*
 * Get the current number of shards, and/or the connection string for a
 * particular shard from the shard map in shared memory.
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -31,6 +31,7 @@
 #include "utils/guc_tables.h"

 #include "communicator.h"
+#include "communicator_process.h"
 #include "extension_server.h"
 #include "file_cache.h"
 #include "neon.h"
@@ -44,9 +45,6 @@
 #include "storage/ipc.h"
 #endif

-/* the rust bindings, generated by cbindgen */
-#include "communicator/communicator_bindings.h"
-
 PG_MODULE_MAGIC;
 void		_PG_init(void);

@@ -457,9 +455,6 @@ _PG_init(void)
 	load_file("$libdir/neon_rmgr", false);
 #endif

-	/* dummy call to a Rust function in the communicator library, to check that it works */
-	(void) communicator_dummy(123);
-
 	/*
 	 * Initializing a pre-loaded Postgres extension happens in three stages:
 	 *
@@ -497,6 +492,8 @@ _PG_init(void)
 	pg_init_walproposer();
 	init_lwlsncache();

+	pg_init_communicator_process();
+
 	pg_init_communicator();
 	Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;

--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -72,22 +72,21 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
 		(tag).rnode = (rinfo); \
 	} while (false)

-#define BufTagGetNRelFileInfo(tag) tag.rnode
+#define BufTagGetNRelFileInfo(tag) (tag).rnode

 #define BufTagGetRelNumber(tagp) ((tagp)->rnode.relNode)

-#define BufTagInit(tag, relNumber, forknum, blkno, spcOid, dbOid) \
+#define BufTagInit(tag, rel_number, fork_number, block_number, spc_oid, db_oid) \
 	do { \
-		RelFileNode rnode = { .spcNode = spcOid, .dbNode = dbOid, .relNode = relNumber}; \
-		(tag).forkNum = forknum; \
-		(tag).blockNum = blkno; \
-		(tag).rnode = rnode; \
+		RelFileNode rnode = { .spcNode = (spc_oid), .dbNode = (db_oid), .relNode = (rel_number)}; \
+		(tag).forkNum = (fork_number);								\
+		(tag).blockNum = (block_number);							\
+		(tag).rnode = rnode;										\
 	} while (false)

 #define InvalidRelFileNumber InvalidOid

-#define SMgrRelGetRelInfo(reln)				\
-	(reln->smgr_rnode.node)
+#define SMgrRelGetRelInfo(reln)	   	((reln)->smgr_rnode.node)

 #define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers

@@ -133,17 +132,16 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
 		.relNumber = (tag).relNumber, \
 	})

-#define BufTagInit(tag, relNumber, forknum, blkno, spcOid, dbOid) \
+#define BufTagInit(tag, rel_number, fork_number, block_number, spc_oid, db_oid) \
 	do { \
-		(tag).forkNum = forknum; \
-		(tag).blockNum = blkno; \
-		(tag).spcOid = spcOid; \
-		(tag).dbOid = dbOid; \
-		(tag).relNumber = relNumber; \
+		(tag).forkNum = (fork_number);					\
+		(tag).blockNum = (block_number);				\
+		(tag).spcOid = (spc_oid);						\
+		(tag).dbOid = (db_oid);							\
+		(tag).relNumber = (rel_number);					\
 	} while (false)

-#define SMgrRelGetRelInfo(reln) \
-	((reln)->smgr_rlocator)
+#define SMgrRelGetRelInfo(reln)	   	((reln)->smgr_rlocator)

 #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
 #endif
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -110,6 +110,9 @@ static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk);

 static void CheckGracefulShutdown(WalProposer *wp);

+// HADRON
+shardno_t get_num_shards(void);
+
 static void
 init_walprop_config(bool syncSafekeepers)
 {
@@ -646,18 +649,19 @@ walprop_pg_get_shmem_state(WalProposer *wp)
 * Record new ps_feedback in the array with shards and update min_feedback.
 */
 static PageserverFeedback
-record_pageserver_feedback(PageserverFeedback *ps_feedback)
+record_pageserver_feedback(PageserverFeedback *ps_feedback, shardno_t num_shards)
 {
 	PageserverFeedback min_feedback;

 	Assert(ps_feedback->present);
 	Assert(ps_feedback->shard_number < MAX_SHARDS);
+	Assert(ps_feedback->shard_number < num_shards);

 	SpinLockAcquire(&walprop_shared->mutex);

-	/* Update the number of shards */
-	if (ps_feedback->shard_number + 1 > walprop_shared->num_shards)
-		walprop_shared->num_shards = ps_feedback->shard_number + 1;
+	// Hadron: Update num_shards lazily from the source of truth (the shard map)
+	// whenever we receive new pageserver feedback.
+	walprop_shared->num_shards = Max(walprop_shared->num_shards, num_shards);

 	/* Update the feedback */
 	memcpy(&walprop_shared->shard_ps_feedback[ps_feedback->shard_number], ps_feedback, sizeof(PageserverFeedback));
@@ -2023,19 +2027,43 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, Safekeeper *sk)
 	if (wp->config->syncSafekeepers)
 		return;

+
 	/* handle fresh ps_feedback */
 	if (sk->appendResponse.ps_feedback.present)
 	{
-		PageserverFeedback min_feedback = record_pageserver_feedback(&sk->appendResponse.ps_feedback);
+		shardno_t num_shards = get_num_shards();

-		/* Only one main shard sends non-zero currentClusterSize */
-		if (sk->appendResponse.ps_feedback.currentClusterSize > 0)
-			SetNeonCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize);
-
-		if (min_feedback.disk_consistent_lsn != standby_apply_lsn)
+		// During shard split, we receive ps_feedback from child shards before
+		// the split commits and our shard map GUC has been updated. We must
+		// filter out such feedback here because record_pageserver_feedback()
+		// doesn't do it.
+		//
+		// NB: what we would actually want to happen is that we only receive
+		// ps_feedback from the parent shards when the split is committed, then
+		// apply the split to our set of tracked feedback and from here on only
+		// receive ps_feedback from child shards. This filter condition doesn't
+		// do that: if we split from N parent to 2N child shards, the first N
+		// child shards' feedback messages will pass this condition, even before
+		// the split is committed. That's a bit sloppy, but OK for now.
+		if (sk->appendResponse.ps_feedback.shard_number < num_shards)
 		{
-			standby_apply_lsn = min_feedback.disk_consistent_lsn;
-			needToAdvanceSlot = true;
+			PageserverFeedback min_feedback = record_pageserver_feedback(&sk->appendResponse.ps_feedback, num_shards);
+
+			/* Only one main shard sends non-zero currentClusterSize */
+			if (sk->appendResponse.ps_feedback.currentClusterSize > 0)
+				SetNeonCurrentClusterSize(sk->appendResponse.ps_feedback.currentClusterSize);
+	
+			if (min_feedback.disk_consistent_lsn != standby_apply_lsn)
+			{
+				standby_apply_lsn = min_feedback.disk_consistent_lsn;
+				needToAdvanceSlot = true;
+			}
+		}
+		else
+		{
+			// HADRON
+			elog(DEBUG2, "Ignoring pageserver feedback for unknown shard %d (current shard number %d)",
+				sk->appendResponse.ps_feedback.shard_number, num_shards);
 		}
 	}

--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand.

 [[package]]
 name = "aiohappyeyeballs"
@@ -3068,6 +3068,21 @@ urllib3 = ">=1.21.1,<3"
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
 use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]

+[[package]]
+name = "requests-unixsocket"
+version = "0.4.1"
+description = "Use requests to talk HTTP via a UNIX domain socket"
+optional = false
+python-versions = ">=3.9"
+groups = ["main"]
+files = [
+    {file = "requests_unixsocket-0.4.1-py3-none-any.whl", hash = "sha256:60c4942e9dbecc2f64d611039fb1dfc25da382083c6434ac0316dca3ff908f4d"},
+    {file = "requests_unixsocket-0.4.1.tar.gz", hash = "sha256:b2596158c356ecee68d27ba469a52211230ac6fb0cde8b66afb19f0ed47a1995"},
+]
+
+[package.dependencies]
+requests = ">=1.1"
+
 [[package]]
 name = "responses"
 version = "0.25.3"
@@ -3844,4 +3859,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.11"
-content-hash = "6a1e8ba06b8194bf28d87fd5e184e2ddc2b4a19dffcbe3953b26da3d55c9212f"
+content-hash = "b08aba407631b0341d2ef8bf9acffd733bfc7d32b12d344717ab4c7fef697625"
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -66,6 +66,7 @@ postgres-client = { package = "tokio-postgres2", path = "../libs/proxy/tokio-pos
 postgres-protocol = { package = "postgres-protocol2", path = "../libs/proxy/postgres-protocol2" }
 pq_proto.workspace = true
 rand.workspace = true
+rand_core.workspace = true
 regex.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
 reqwest = { workspace = true, features = ["rustls-tls-native-roots"] }
@@ -133,6 +134,6 @@ pbkdf2 = { workspace = true, features = ["simple", "std"] }
 rcgen.workspace = true
 rstest.workspace = true
 walkdir.workspace = true
-rand_distr = "0.4"
+rand_distr = "0.5"
 tokio-postgres.workspace = true
 tracing-test = "0.2"
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -803,7 +803,7 @@ mod tests {
    use http_body_util::Full;
    use hyper::service::service_fn;
    use hyper_util::rt::TokioIo;
-    use rand::rngs::OsRng;
+    use rand_core::OsRng;
    use rsa::pkcs8::DecodePrivateKey;
    use serde::Serialize;
    use serde_json::json;
--- a/proxy/src/binary/pg_sni_router.rs
+++ b/proxy/src/binary/pg_sni_router.rs
@@ -76,7 +76,7 @@ fn cli() -> clap::Command {
 }

 pub async fn run() -> anyhow::Result<()> {
-    let _logging_guard = crate::logging::init().await?;
+    let _logging_guard = crate::logging::init()?;
    let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);

--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -13,7 +13,7 @@ use arc_swap::ArcSwapOption;
 use camino::Utf8PathBuf;
 use futures::future::Either;
 use itertools::{Itertools, Position};
-use rand::{Rng, thread_rng};
+use rand::Rng;
 use remote_storage::RemoteStorageConfig;
 use tokio::net::TcpListener;
 #[cfg(any(test, feature = "testing"))]
@@ -334,7 +334,7 @@ struct PgSniRouterArgs {
 }

 pub async fn run() -> anyhow::Result<()> {
-    let _logging_guard = crate::logging::init().await?;
+    let _logging_guard = crate::logging::init()?;
    let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);

@@ -573,7 +573,7 @@ pub async fn run() -> anyhow::Result<()> {
                            attempt.into_inner()
                        );
                    }
-                    let jitter = thread_rng().gen_range(0..100);
+                    let jitter = rand::rng().random_range(0..100);
                    tokio::time::sleep(Duration::from_millis(1000 + jitter)).await;
                }
            }
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -5,7 +5,7 @@ use std::time::Duration;
 use async_trait::async_trait;
 use clashmap::ClashMap;
 use clashmap::mapref::one::Ref;
-use rand::{Rng, thread_rng};
+use rand::Rng;
 use tokio::time::Instant;
 use tracing::{debug, info};

@@ -343,7 +343,7 @@ impl ProjectInfoCacheImpl {
    }

    fn gc(&self) {
-        let shard = thread_rng().gen_range(0..self.project2ep.shards().len());
+        let shard = rand::rng().random_range(0..self.project2ep.shards().len());
        debug!(shard, "project_info_cache: performing epoch reclamation");

        // acquire a random shard lock
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -523,29 +523,29 @@ mod tests {

    fn generate_request_data(rng: &mut impl Rng) -> RequestData {
        RequestData {
-            session_id: uuid::Builder::from_random_bytes(rng.r#gen()).into_uuid(),
-            peer_addr: Ipv4Addr::from(rng.r#gen::<[u8; 4]>()).to_string(),
+            session_id: uuid::Builder::from_random_bytes(rng.random()).into_uuid(),
+            peer_addr: Ipv4Addr::from(rng.random::<[u8; 4]>()).to_string(),
            timestamp: chrono::DateTime::from_timestamp_millis(
-                rng.gen_range(1703862754..1803862754),
+                rng.random_range(1703862754..1803862754),
            )
            .unwrap()
            .naive_utc(),
            application_name: Some("test".to_owned()),
            user_agent: Some("test-user-agent".to_owned()),
-            username: Some(hex::encode(rng.r#gen::<[u8; 4]>())),
-            endpoint_id: Some(hex::encode(rng.r#gen::<[u8; 16]>())),
-            database: Some(hex::encode(rng.r#gen::<[u8; 16]>())),
-            project: Some(hex::encode(rng.r#gen::<[u8; 16]>())),
-            branch: Some(hex::encode(rng.r#gen::<[u8; 16]>())),
+            username: Some(hex::encode(rng.random::<[u8; 4]>())),
+            endpoint_id: Some(hex::encode(rng.random::<[u8; 16]>())),
+            database: Some(hex::encode(rng.random::<[u8; 16]>())),
+            project: Some(hex::encode(rng.random::<[u8; 16]>())),
+            branch: Some(hex::encode(rng.random::<[u8; 16]>())),
            pg_options: None,
            auth_method: None,
            jwt_issuer: None,
-            protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)],
+            protocol: ["tcp", "ws", "http"][rng.random_range(0..3)],
            region: String::new(),
            error: None,
-            success: rng.r#gen(),
+            success: rng.random(),
            cold_start_info: "no",
-            duration_us: rng.gen_range(0..30_000_000),
+            duration_us: rng.random_range(0..30_000_000),
            disconnect_timestamp: None,
        }
    }
@@ -622,15 +622,15 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1313953, 3, 6000),
-                (1313942, 3, 6000),
-                (1314001, 3, 6000),
-                (1313958, 3, 6000),
-                (1314094, 3, 6000),
-                (1313931, 3, 6000),
-                (1313725, 3, 6000),
-                (1313960, 3, 6000),
-                (438318, 1, 2000)
+                (1313878, 3, 6000),
+                (1313891, 3, 6000),
+                (1314058, 3, 6000),
+                (1313914, 3, 6000),
+                (1313760, 3, 6000),
+                (1314084, 3, 6000),
+                (1313965, 3, 6000),
+                (1313911, 3, 6000),
+                (438290, 1, 2000)
            ]
        );

@@ -662,11 +662,11 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1205810, 5, 10000),
-                (1205534, 5, 10000),
-                (1205835, 5, 10000),
-                (1205820, 5, 10000),
-                (1206074, 5, 10000)
+                (1206039, 5, 10000),
+                (1205798, 5, 10000),
+                (1205776, 5, 10000),
+                (1206051, 5, 10000),
+                (1205746, 5, 10000)
            ]
        );

@@ -691,15 +691,15 @@ mod tests {
        assert_eq!(
            file_stats,
            [
-                (1313953, 3, 6000),
-                (1313942, 3, 6000),
-                (1314001, 3, 6000),
-                (1313958, 3, 6000),
-                (1314094, 3, 6000),
-                (1313931, 3, 6000),
-                (1313725, 3, 6000),
-                (1313960, 3, 6000),
-                (438318, 1, 2000)
+                (1313878, 3, 6000),
+                (1313891, 3, 6000),
+                (1314058, 3, 6000),
+                (1313914, 3, 6000),
+                (1313760, 3, 6000),
+                (1314084, 3, 6000),
+                (1313965, 3, 6000),
+                (1313911, 3, 6000),
+                (438290, 1, 2000)
            ]
        );

@@ -736,7 +736,7 @@ mod tests {
        // files are smaller than the size threshold, but they took too long to fill so were flushed early
        assert_eq!(
            file_stats,
-            [(658584, 2, 3001), (658298, 2, 3000), (658094, 2, 2999)]
+            [(658552, 2, 3001), (658265, 2, 3000), (658061, 2, 2999)]
        );

        tmpdir.close().unwrap();
--- a/proxy/src/intern.rs
+++ b/proxy/src/intern.rs
@@ -247,7 +247,7 @@ mod tests {
        use rand::{Rng, SeedableRng};
        use rand_distr::Zipf;

-        let endpoint_dist = Zipf::new(500000, 0.8).unwrap();
+        let endpoint_dist = Zipf::new(500000.0, 0.8).unwrap();
        let endpoints = StdRng::seed_from_u64(272488357).sample_iter(endpoint_dist);

        let interner = MyId::get_interner();
--- a/proxy/src/logging.rs
+++ b/proxy/src/logging.rs
@@ -26,7 +26,7 @@ use crate::metrics::Metrics;
 /// configuration from environment variables. For example, to change the
 /// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`.
 /// See <https://opentelemetry.io/docs/reference/specification/sdk-environment-variables>
-pub async fn init() -> anyhow::Result<LoggingGuard> {
+pub fn init() -> anyhow::Result<LoggingGuard> {
    let logfmt = LogFormat::from_env()?;

    let env_filter = EnvFilter::builder()
@@ -43,8 +43,8 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
                .expect("this should be a valid filter directive"),
        );

-    let otlp_layer =
-        tracing_utils::init_tracing("proxy", tracing_utils::ExportConfig::default()).await;
+    let provider = tracing_utils::init_tracing("proxy", tracing_utils::ExportConfig::default());
+    let otlp_layer = provider.as_ref().map(tracing_utils::layer);

    let json_log_layer = if logfmt == LogFormat::Json {
        Some(JsonLoggingLayer::new(
@@ -76,7 +76,7 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
        .with(text_log_layer)
        .try_init()?;

-    Ok(LoggingGuard)
+    Ok(LoggingGuard(provider))
 }

 /// Initialize logging for local_proxy with log prefix and no opentelemetry.
@@ -97,7 +97,7 @@ pub fn init_local_proxy() -> anyhow::Result<LoggingGuard> {
        .with(fmt_layer)
        .try_init()?;

-    Ok(LoggingGuard)
+    Ok(LoggingGuard(None))
 }

 pub struct LocalProxyFormatter(Format<Full, SystemTime>);
@@ -118,14 +118,16 @@ where
    }
 }

-pub struct LoggingGuard;
+pub struct LoggingGuard(Option<tracing_utils::Provider>);

 impl Drop for LoggingGuard {
    fn drop(&mut self) {
-        // Shutdown trace pipeline gracefully, so that it has a chance to send any
-        // pending traces before we exit.
-        tracing::info!("shutting down the tracing machinery");
-        tracing_utils::shutdown_tracing();
+        if let Some(p) = &self.0 {
+            // Shutdown trace pipeline gracefully, so that it has a chance to send any
+            // pending traces before we exit.
+            tracing::info!("shutting down the tracing machinery");
+            drop(p.shutdown());
+        }
    }
 }

--- a/proxy/src/pqproto.rs
+++ b/proxy/src/pqproto.rs
@@ -7,7 +7,7 @@ use std::io::{self, Cursor};

 use bytes::{Buf, BufMut};
 use itertools::Itertools;
-use rand::distributions::{Distribution, Standard};
+use rand::distr::{Distribution, StandardUniform};
 use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
 use zerocopy::{FromBytes, Immutable, IntoBytes, big_endian};

@@ -458,9 +458,9 @@ impl fmt::Display for CancelKeyData {
            .finish()
    }
 }
-impl Distribution<CancelKeyData> for Standard {
+impl Distribution<CancelKeyData> for StandardUniform {
    fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> CancelKeyData {
-        id_to_cancel_key(rng.r#gen())
+        id_to_cancel_key(rng.random())
    }
 }

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Conrad Ludgate	e965bd96bb	optimise some future sizes	2025-07-23 08:58:09 +01:00
Conrad Ludgate	14daaec98e	more compact code and more compact futures	2025-07-23 08:58:09 +01:00
Conrad Ludgate	286ac97a9c	remove typesafe transaction type as we already ensure rollback is performed	2025-07-23 08:58:09 +01:00
Conrad Ludgate	20355cb5f0	fix rest.rs	2025-07-23 07:04:36 +01:00
Conrad Ludgate	634dbd29b6	python lints	2025-07-23 07:04:36 +01:00
Conrad Ludgate	a235b241d5	ruff format	2025-07-23 07:04:36 +01:00
Conrad Ludgate	539652fa4e	rollback safety	2025-07-23 07:04:36 +01:00
Conrad Ludgate	11294ca322	rename Send to LocalProxyClient, etc	2025-07-23 07:04:36 +01:00
Conrad Ludgate	84020c1328	fix python lints	2025-07-23 07:04:36 +01:00
Conrad Ludgate	0cc7415691	remove explicit discard_all for local_proxy	2025-07-23 07:04:36 +01:00
Conrad Ludgate	38df46b381	fix session state by resetting it	2025-07-23 07:04:36 +01:00
Conrad Ludgate	cdc73ad051	add regression test	2025-07-23 07:04:36 +01:00
Tristan Partin	fc242afcc2	PG ignore PageserverFeedback from unknown shards (#12671 ) ## Problem When testing tenant splits, I found that PG can get backpressure throttled indefinitely if the split is aborted afterwards. It turns out that each PageServer activates new shard separately even before the split is committed and they may start sending PageserverFeedback to PG directly. As a result, if the split is aborted, no one resets the pageserver feedback in PG, and thus PG will be backpressure throttled forever unless it's restarted manually. ## Summary of changes This PR fixes this problem by having `walprop_pg_process_safekeeper_feedback` simply ignore all pageserver feedback from unknown shards. The source of truth here is defined by the shard map, which is guaranteed to be reloaded only after the split is committed. Co-authored-by: Chen Luo <chen.luo@databricks.com>	2025-07-22 21:41:56 +00:00
Suhas Thalanki	e275221aef	add hadron-specific metrics (#12686 )	2025-07-22 21:17:45 +00:00
Alex Chi Z.	f859354466	feat(pageserver): add db rel count as feature flag property (#12632 ) ## Problem As part of the reldirv2 rollout: LKB-197. We will use the number of db/rels as a criterion for whether to roll out reldirv2 directly on the write path (simplest and easiest way of rollout). If the number of rel/db is small then it shouldn't take too long on the write path. ## Summary of changes * Compute db/rel count during basebackup. * Also compute it during logical size computation. * Collect maximum number of db/rel across all timelines in the feature flag properties. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2025-07-22 17:55:07 +00:00
Konstantin Knizhnik	b00a0096bf	Reinitialize page in allocNewBuffer only when buffer is returned (#12399 ) ## Problem See https://github.com/neondatabase/neon/issues/12387 `allocNewBuffer` initialises the page with zeros but does not always return it because of parity checks. In case of wrong parity the page is rejected and as a result we have a dirty page with zero LSN, which causes an assertion failure in neon_write when the page is evicted from shared buffers. ## Summary of changes Perform page initialisation in `allocNewBuffer` only when the buffer is returned (parity check is passed). Postgres PRs: https://github.com/neondatabase/postgres/pull/661 https://github.com/neondatabase/postgres/pull/662 https://github.com/neondatabase/postgres/pull/663 https://github.com/neondatabase/postgres/pull/664 --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech> Co-authored-by: Kosntantin Knizhnik <konstantin.knizhnik@databricks.com>	2025-07-22 17:50:26 +00:00
a-masterov	b3844903e5	Add new operations to Random operations test (#12213 ) ## Problem We did not test some Public API calls, such as using a timestamp to create a branch, reset_to_parent. ## Summary of changes Tests now include some other operations: reset_to_parent, a branch creation from any time in the past, etc. Currently, the API calls are only exposed; the semantics are not verified. --------- Co-authored-by: Alexey Masterov <alexey.masterov@databricks.com>	2025-07-22 17:43:01 +00:00
Vlad Lazar	5b0972151c	pageserver: silence shard resolution warning (#12685 ) ## Problem We drive the get page requests that have started processing to completion. So in the case when the compute received a reconfiguration request and the old connection has a request processing on the pageserver, we are going to issue the warning. I spot checked a few instances of the warning and in all cases the compute was already connected to the correct pageserver. ## Summary of Changes Downgrade to INFO. It would be nice to somehow figure out if the connection has been terminated in the meantime, but the terminate libpq message is still in the pipe while we're doing the shard resolution. Closes LKB-2381	2025-07-22 17:34:23 +00:00
Heikki Linnakangas	51ffeef93f	Fix postgres version compatibility macros (#12658 ) The argument to BufTagInit was called 'spcOid', and it was also setting a field called 'spcOid'. The field name would erroneously also be expanded with the macro arg. It happened to work so far, because all the users of the macro pass a variable called 'spcOid' for the 'spcOid' argument, but as soon as you try to pass anything else, it fails. And same story for 'dbOid' and 'relNumber'. Rename the arguments to avoid the name collision. Also while we're at it, add parens around the arguments in a few macros, to make them safer if you pass something non-trivial as the argument.	2025-07-22 16:52:57 +00:00
Erik Grinaker	0fe07dec32	test_runner: allow stuck reconciliation errors (#12682 ) This log message was added in #12589. During chaos tests, reconciles may not succeed for some time, triggering the log message. Resolves [LKB-2467](https://databricks.atlassian.net/browse/LKB-2467).	2025-07-22 16:43:35 +00:00
HaoyuHuang	8de320ab9b	Add a few compute_tool changes (#12677 ) ## Summary of changes All changes are no-op.	2025-07-22 16:22:18 +00:00
Folke Behrens	108f7ec544	Bump opentelemetry crates to 0.30 (#12680 ) This rebuilds #11552 on top the current Cargo.lock. --------- Co-authored-by: Conrad Ludgate <conradludgate@gmail.com>	2025-07-22 16:05:35 +00:00
Tristan Partin	63d2b1844d	Fix final pyright issues with neon_api.py (#8476 ) Fix final pyright issues with neon_api.py Signed-off-by: Tristan Partin <tristan.partin@databricks.com>	2025-07-22 16:04:52 +00:00
Dmitrii Kovalkov	133f16e9b5	storcon: finish safekeeper migration gracefully (#12528 ) ## Problem We don't detect if safekeeper migration fails after committing the membership configuration to the database. As a result, we might leave stale timelines on excluded safekeepers and do not notify cplane/safekeepers about new configuration. - Implements solution proposed in https://github.com/neondatabase/neon/pull/12432 - Closes: https://github.com/neondatabase/neon/issues/12192 - Closes: [LKB-944](https://databricks.atlassian.net/browse/LKB-944) ## Summary of changes - Add `sk_set_notified_generation` column to `timelines` database - Update `_notified_generation` in database during the finish state. - Commit reconciliation requests to database atomically with membership configuration. - Reload pending ops and retry "finish" step if we detect `_notified_generation` mismatch. - Add failpoints and test that we handle failures well	2025-07-22 14:58:20 +00:00
Alex Chi Z.	88391ce069	feat(pageserver): create image layers at L0-L1 boundary by default (#12669 ) ## Problem Post LKB-198 rollout. We added a new strategy to generate image layers at the L0-L1 boundary instead of the latest LSN to ensure too many L0 layers do not trigger image layer creation. ## Summary of changes We already rolled it out to all users so we can remove the feature flag now. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2025-07-22 14:29:26 +00:00
Heikki Linnakangas	8bb45fd5da	Introduce built-in Prometheus exporter to the Postgres extension (#12591 ) Currently, the exporter exposes the same LFC metrics that are exposed by the "autoscaling" sql_exporter in the docker image. With this, we can remove the dedicated sql_exporter instance. (Actually doing the removal is left as a TODO until this is rolled out to production and we have changed autoscaling-agent to fetch the metrics from this new endpoint.) The exporter runs as a Postgres background worker process. This is extracted from the Rust communicator rewrite project, which will use the same worker process for much more, to handle the communications with the pageservers. For now, though, it merely handles the metrics requests. In the future, we will add more metrics, and perhaps even APIs to control the running Postgres instance. The exporter listens on a Unix Domain socket within the Postgres data directory. A Unix Domain socket is a bit unconventional, but it has some advantages: - Permissions are taken care of. Only processes that can access the data directory, and therefore already have full access to the running Postgres instance, can connect to it. - No need to allocate and manage a new port number for the listener It has some downsides too: it's not immediately accessible from the outside world, and the functions to work with Unix Domain sockets are more low-level than TCP sockets (see the symlink hack in `postgres_metrics_client.rs`, for example). To expose the metrics from the local Unix Domain Socket to the autoscaling agent, introduce a new '/autoscaling_metrics' endpoint in the compute_ctl's HTTP server. Currently it merely forwards the request to the Postgres instance, but we could add rate limiting and access control there in the future. --------- Co-authored-by: Conrad Ludgate <conrad@neon.tech>	2025-07-22 12:00:20 +00:00
Vlad Lazar	88bc06f148	communicator: debug log more fields of the get page response (#12644 ) It's helpful to correlate requests and responses in local investigations where the issue is reproducible. Hence, log the rel, fork and block of the get page response.	2025-07-22 11:25:11 +00:00
Vlad Lazar	d91d018afa	storcon: handle pageserver disk loss (#12667 ) NB: effectively a no-op in the neon env since the handling is config gated in storcon ## Problem When a pageserver suffers from a local disk/node failure and restarts, the storage controller will receive a re-attach call and return all the tenants the pageserver is suppose to attach, but the pageserver will not act on any tenants that it doesn't know about locally. As a result, the pageserver will not rehydrate any tenants from remote storage if it restarted following a local disk loss, while the storage controller still thinks that the pageserver have all the tenants attached. This leaves the system in a bad state, and the symptom is that PG's pageserver connections will fail with "tenant not found" errors. ## Summary of changes Made a slight change to the storage controller's `re_attach` API: * The pageserver will set an additional bit `empty_local_disk` in the reattach request, indicating whether it has started with an empty disk or does not know about any tenants. * Upon receiving the reattach request, if this `empty_local_disk` bit is set, the storage controller will go ahead and clear all observed locations referencing the pageserver. The reconciler will then discover the discrepancy between the intended state and observed state of the tenant and take care of the situation. To facilitate rollouts this extra behavior in the `re_attach` API is guarded by the `handle_ps_local_disk_loss` command line flag of the storage controller. --------- Co-authored-by: William Huang <william.huang@databricks.com>	2025-07-22 11:04:03 +00:00
Folke Behrens	9c0efba91e	Bump rand crate to 0.9 (#12674 )	2025-07-22 09:31:39 +00:00