[DO NOT MERGE] more debug logging to prove hypothesis that it's the fsyncs

force CI run after adding run-benchmarks label
re-enable the n_tenants=100
2026-05-23 08:00:37 +00:00 · 2024-01-26 10:54:35 +00:00 · 2024-01-25 20:13:13 +00:00 · 2024-01-25 20:11:17 +00:00 · 2024-01-25 20:10:54 +00:00 · 2024-01-25 19:25:29 +00:00
111 changed files with 4337 additions and 1639 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -21,6 +21,9 @@ env:
  COPT: '-Werror'
  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+  NEXTEST_RETRIES: 3
+  # A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
+  E2E_CONCURRENCY_GROUP: ${{ github.repository }}-${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}

 jobs:
  check-permissions:
@@ -44,6 +47,22 @@ jobs:

        exit 1

+  cancel-previous-e2e-tests:
+    needs: [ check-permissions ]
+    if: github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Cancel previous e2e-tests runs for this PR
+        env:
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
+        # E2E_CONCURRENCY_GROUP is a workflow-level env var, so the shell already has it.
+        # Expand it in the shell rather than templating ref-derived text into the script.
+        run: |
+          gh workflow --repo neondatabase/cloud \
+            run cancel-previous-in-concurrency-group.yml \
+              --field concurrency_group="${E2E_CONCURRENCY_GROUP}"
+
  tag:
    needs: [ check-permissions ]
    runs-on: [ self-hosted, gen3, small ]
@@ -695,7 +712,8 @@ jobs:
                \"commit_hash\": \"$COMMIT_SHA\",
                \"remote_repo\": \"${{ github.repository }}\",
                \"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
-                \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
+                \"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
+                \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
              }
            }"

--- a/Cargo.lock
+++ b/Cargo.lock
@@ -10,9 +10,9 @@ checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5"

 [[package]]
 name = "addr2line"
-version = "0.19.0"
+version = "0.21.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a76fd60b23679b7d19bd066031410fb7e458ccc5e958eb5c325888ce4baedc97"
+checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb"
 dependencies = [
 "gimli",
 ]
@@ -286,6 +286,7 @@ dependencies = [
 "pageserver_client",
 "postgres_backend",
 "postgres_connection",
+ "scopeguard",
 "serde",
 "serde_json",
 "thiserror",
@@ -840,15 +841,15 @@ dependencies = [

 [[package]]
 name = "backtrace"
-version = "0.3.67"
+version = "0.3.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "233d376d6d185f2a3093e58f283f60f880315b6c60075b01f36b3b85154564ca"
+checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837"
 dependencies = [
 "addr2line",
 "cc",
 "cfg-if",
 "libc",
- "miniz_oxide 0.6.2",
+ "miniz_oxide",
 "object",
 "rustc-demangle",
 ]
@@ -1215,7 +1216,7 @@ dependencies = [
 "flate2",
 "futures",
 "hyper",
- "nix 0.26.2",
+ "nix 0.27.1",
 "notify",
 "num_cpus",
 "opentelemetry",
@@ -1331,7 +1332,7 @@ dependencies = [
 "git-version",
 "hex",
 "hyper",
- "nix 0.26.2",
+ "nix 0.27.1",
 "once_cell",
 "pageserver_api",
 "pageserver_client",
@@ -1341,6 +1342,7 @@ dependencies = [
 "regex",
 "reqwest",
 "safekeeper_api",
+ "scopeguard",
 "serde",
 "serde_json",
 "serde_with",
@@ -1872,13 +1874,13 @@ dependencies = [

 [[package]]
 name = "filetime"
-version = "0.2.21"
+version = "0.2.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153"
+checksum = "d4029edd3e734da6fe05b6cd7bd2960760a616bd2ddd0d59a0124746d6272af0"
 dependencies = [
 "cfg-if",
 "libc",
- "redox_syscall 0.2.16",
+ "redox_syscall 0.3.5",
 "windows-sys 0.48.0",
 ]

@@ -1895,7 +1897,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
 dependencies = [
 "crc32fast",
- "miniz_oxide 0.7.1",
+ "miniz_oxide",
 ]

 [[package]]
@@ -2093,9 +2095,9 @@ dependencies = [

 [[package]]
 name = "gimli"
-version = "0.27.2"
+version = "0.28.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad0a93d233ebf96623465aad4046a8d3aa4da22d4f4beba5388838c8a434bbb4"
+checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253"

 [[package]]
 name = "git-version"
@@ -2748,18 +2750,18 @@ checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"

 [[package]]
 name = "memoffset"
-version = "0.7.1"
+version = "0.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4"
+checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
 dependencies = [
 "autocfg",
 ]

 [[package]]
 name = "memoffset"
-version = "0.8.0"
+version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
+checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c"
 dependencies = [
 "autocfg",
 ]
@@ -2797,15 +2799,6 @@ version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"

-[[package]]
-name = "miniz_oxide"
-version = "0.6.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa"
-dependencies = [
- "adler",
-]
-
 [[package]]
 name = "miniz_oxide"
 version = "0.7.1"
@@ -2865,16 +2858,14 @@ dependencies = [

 [[package]]
 name = "nix"
-version = "0.26.2"
+version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a"
+checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.4.1",
 "cfg-if",
 "libc",
- "memoffset 0.7.1",
- "pin-utils",
- "static_assertions",
+ "memoffset 0.9.0",
 ]

 [[package]]
@@ -2889,20 +2880,21 @@ dependencies = [

 [[package]]
 name = "notify"
-version = "5.2.0"
+version = "6.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "729f63e1ca555a43fe3efa4f3efdf4801c479da85b432242a7b726f353c88486"
+checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d"
 dependencies = [
- "bitflags 1.3.2",
+ "bitflags 2.4.1",
 "crossbeam-channel",
 "filetime",
 "fsevent-sys",
 "inotify 0.9.6",
 "kqueue",
 "libc",
+ "log",
 "mio",
 "walkdir",
- "windows-sys 0.45.0",
+ "windows-sys 0.48.0",
 ]

 [[package]]
@@ -3028,9 +3020,9 @@ dependencies = [

 [[package]]
 name = "object"
-version = "0.30.3"
+version = "0.32.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ea86265d3d3dcb6a27fc51bd29a4bf387fae9d2986b823079d4986af253eb439"
+checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441"
 dependencies = [
 "memchr",
 ]
@@ -3102,9 +3094,9 @@ dependencies = [

 [[package]]
 name = "opentelemetry"
-version = "0.19.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5f4b8347cc26099d3aeee044065ecc3ae11469796b4d65d065a23a584ed92a6f"
+checksum = "9591d937bc0e6d2feb6f71a559540ab300ea49955229c347a517a28d27784c54"
 dependencies = [
 "opentelemetry_api",
 "opentelemetry_sdk",
@@ -3112,9 +3104,9 @@ dependencies = [

 [[package]]
 name = "opentelemetry-http"
-version = "0.8.0"
+version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a819b71d6530c4297b49b3cae2939ab3a8cc1b9f382826a1bc29dd0ca3864906"
+checksum = "c7594ec0e11d8e33faf03530a4c49af7064ebba81c1480e01be67d90b356508b"
 dependencies = [
 "async-trait",
 "bytes",
@@ -3125,54 +3117,56 @@ dependencies = [

 [[package]]
 name = "opentelemetry-otlp"
-version = "0.12.0"
+version = "0.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8af72d59a4484654ea8eb183fea5ae4eb6a41d7ac3e3bae5f4d2a282a3a7d3ca"
+checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275"
 dependencies = [
 "async-trait",
- "futures",
- "futures-util",
+ "futures-core",
 "http",
- "opentelemetry",
 "opentelemetry-http",
 "opentelemetry-proto",
+ "opentelemetry-semantic-conventions",
+ "opentelemetry_api",
+ "opentelemetry_sdk",
 "prost",
 "reqwest",
 "thiserror",
+ "tokio",
+ "tonic",
 ]

 [[package]]
 name = "opentelemetry-proto"
-version = "0.2.0"
+version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "045f8eea8c0fa19f7d48e7bc3128a39c2e5c533d5c61298c548dfefc1064474c"
+checksum = "b1e3f814aa9f8c905d0ee4bde026afd3b2577a97c10e1699912e3e44f0c4cbeb"
 dependencies = [
- "futures",
- "futures-util",
- "opentelemetry",
+ "opentelemetry_api",
+ "opentelemetry_sdk",
 "prost",
- "tonic 0.8.3",
+ "tonic",
 ]

 [[package]]
 name = "opentelemetry-semantic-conventions"
-version = "0.11.0"
+version = "0.12.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "24e33428e6bf08c6f7fcea4ddb8e358fab0fe48ab877a87c70c6ebe20f673ce5"
+checksum = "73c9f9340ad135068800e7f1b24e9e09ed9e7143f5bf8518ded3d3ec69789269"
 dependencies = [
 "opentelemetry",
 ]

 [[package]]
 name = "opentelemetry_api"
-version = "0.19.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ed41783a5bf567688eb38372f2b7a8530f5a607a4b49d38dd7573236c23ca7e2"
+checksum = "8a81f725323db1b1206ca3da8bb19874bbd3f57c3bcd59471bfb04525b265b9b"
 dependencies = [
- "fnv",
 "futures-channel",
 "futures-util",
 "indexmap 1.9.3",
+ "js-sys",
 "once_cell",
 "pin-project-lite",
 "thiserror",
@@ -3181,21 +3175,22 @@ dependencies = [

 [[package]]
 name = "opentelemetry_sdk"
-version = "0.19.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b3a2a91fdbfdd4d212c0dcc2ab540de2c2bcbbd90be17de7a7daf8822d010c1"
+checksum = "fa8e705a0612d48139799fcbaba0d4a90f06277153e43dd2bdc16c6f0edd8026"
 dependencies = [
 "async-trait",
 "crossbeam-channel",
- "dashmap",
- "fnv",
 "futures-channel",
 "futures-executor",
 "futures-util",
 "once_cell",
 "opentelemetry_api",
+ "ordered-float 3.9.2",
 "percent-encoding",
 "rand 0.8.5",
+ "regex",
+ "serde_json",
 "thiserror",
 "tokio",
 "tokio-stream",
@@ -3210,6 +3205,15 @@ dependencies = [
 "num-traits",
 ]

+[[package]]
+name = "ordered-float"
+version = "3.9.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc"
+dependencies = [
+ "num-traits",
+]
+
 [[package]]
 name = "ordered-multimap"
 version = "0.7.1"
@@ -3325,7 +3329,7 @@ dependencies = [
 "itertools",
 "md5",
 "metrics",
- "nix 0.26.2",
+ "nix 0.27.1",
 "num-traits",
 "num_cpus",
 "once_cell",
@@ -4339,9 +4343,9 @@ dependencies = [

 [[package]]
 name = "reqwest-tracing"
-version = "0.4.5"
+version = "0.4.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b97ad83c2fc18113346b7158d79732242002427c30f620fa817c1f32901e0a8"
+checksum = "5a0152176687dd5cfe7f507ac1cb1a491c679cfe483afd133a7db7aaea818bb3"
 dependencies = [
 "anyhow",
 "async-trait",
@@ -5110,9 +5114,9 @@ checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"

 [[package]]
 name = "smol_str"
-version = "0.2.0"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "74212e6bbe9a4352329b2f68ba3130c15a3f26fe88ff22dbdc6cdd58fa85e99c"
+checksum = "e6845563ada680337a52d43bb0b29f396f2d911616f6573012645b9e3d048a49"
 dependencies = [
 "serde",
 ]
@@ -5195,7 +5199,7 @@ dependencies = [
 "prost",
 "tokio",
 "tokio-stream",
- "tonic 0.9.2",
+ "tonic",
 "tonic-build",
 "tracing",
 "utils",
@@ -5415,7 +5419,7 @@ checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09"
 dependencies = [
 "byteorder",
 "integer-encoding",
- "ordered-float",
+ "ordered-float 2.10.1",
 ]

 [[package]]
@@ -5681,38 +5685,6 @@ dependencies = [
 "winnow",
 ]

-[[package]]
-name = "tonic"
-version = "0.8.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f219fad3b929bef19b1f86fbc0358d35daed8f2cac972037ac0dc10bbb8d5fb"
-dependencies = [
- "async-stream",
- "async-trait",
- "axum",
- "base64 0.13.1",
- "bytes",
- "futures-core",
- "futures-util",
- "h2",
- "http",
- "http-body",
- "hyper",
- "hyper-timeout",
- "percent-encoding",
- "pin-project",
- "prost",
- "prost-derive",
- "tokio",
- "tokio-stream",
- "tokio-util",
- "tower",
- "tower-layer",
- "tower-service",
- "tracing",
- "tracing-futures",
-]
-
 [[package]]
 name = "tonic"
 version = "0.9.2"
@@ -5856,16 +5828,6 @@ dependencies = [
 "tracing-subscriber",
 ]

-[[package]]
-name = "tracing-futures"
-version = "0.2.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2"
-dependencies = [
- "pin-project",
- "tracing",
-]
-
 [[package]]
 name = "tracing-log"
 version = "0.1.3"
@@ -5879,9 +5841,9 @@ dependencies = [

 [[package]]
 name = "tracing-opentelemetry"
-version = "0.19.0"
+version = "0.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00a39dcf9bfc1742fa4d6215253b33a6e474be78275884c216fc2a06267b3600"
+checksum = "fc09e402904a5261e42cf27aea09ccb7d5318c6717a9eec3d8e2e65c56b18f19"
 dependencies = [
 "once_cell",
 "opentelemetry",
@@ -6118,7 +6080,7 @@ dependencies = [
 "hyper",
 "jsonwebtoken",
 "metrics",
- "nix 0.26.2",
+ "nix 0.27.1",
 "once_cell",
 "pin-project-lite",
 "postgres_connection",
@@ -6626,10 +6588,8 @@ dependencies = [
 "clap",
 "clap_builder",
 "crossbeam-utils",
- "dashmap",
 "either",
 "fail",
- "futures",
 "futures-channel",
 "futures-core",
 "futures-executor",
@@ -6674,6 +6634,7 @@ dependencies = [
 "tokio-util",
 "toml_datetime",
 "toml_edit",
+ "tonic",
 "tower",
 "tracing",
 "tracing-core",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -99,14 +99,14 @@ libc = "0.2"
 md5 = "0.7.0"
 memoffset = "0.8"
 native-tls = "0.2"
-nix = "0.26"
-notify = "5.0.0"
+nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
+notify = "6.0.0"
 num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
-opentelemetry = "0.19.0"
-opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
-opentelemetry-semantic-conventions = "0.11.0"
+opentelemetry = "0.20.0"
+opentelemetry-otlp = { version = "0.13.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-semantic-conventions = "0.12.0"
 parking_lot = "0.12"
 parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
 parquet_derive = "49.0.0"
@@ -118,7 +118,7 @@ rand = "0.8"
 redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
-reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
+reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
 reqwest-middleware = "0.2.0"
 reqwest-retry = "0.2.2"
 routerify = "3"
@@ -162,7 +162,7 @@ toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
 tracing = "0.1"
 tracing-error = "0.2.0"
-tracing-opentelemetry = "0.19.0"
+tracing-opentelemetry = "0.20.0"
 tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 url = "2.2"
 uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -700,13 +700,14 @@ impl ComputeNode {
        // In this case we need to connect with old `zenith_admin` name
        // and create new user. We cannot simply rename connected user,
        // but we can create a new one and grant it all privileges.
-        let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
+        let connstr = self.connstr.clone();
+        let mut client = match Client::connect(connstr.as_str(), NoTls) {
            Err(e) => {
                info!(
                    "cannot connect to postgres: {}, retrying with `zenith_admin` username",
                    e
                );
-                let mut zenith_admin_connstr = self.connstr.clone();
+                let mut zenith_admin_connstr = connstr.clone();

                zenith_admin_connstr
                    .set_username("zenith_admin")
@@ -719,8 +720,8 @@ impl ComputeNode {
                client.simple_query("GRANT zenith_admin TO cloud_admin")?;
                drop(client);

-                // reconnect with connsting with expected name
-                Client::connect(self.connstr.as_str(), NoTls)?
+                // reconnect with connstring with expected name
+                Client::connect(connstr.as_str(), NoTls)?
            }
            Ok(client) => client,
        };
@@ -734,8 +735,8 @@ impl ComputeNode {
        cleanup_instance(&mut client)?;
        handle_roles(spec, &mut client)?;
        handle_databases(spec, &mut client)?;
-        handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
-        handle_grants(spec, &mut client, self.connstr.as_str())?;
+        handle_role_deletions(spec, connstr.as_str(), &mut client)?;
+        handle_grants(spec, &mut client, connstr.as_str())?;
        handle_extensions(spec, &mut client)?;
        handle_extension_neon(&mut client)?;
        create_availability_check_data(&mut client)?;
@@ -743,6 +744,19 @@ impl ComputeNode {
        // 'Close' connection
        drop(client);

+        if self.has_feature(ComputeFeature::Migrations) {
+            // Migrations run on a detached thread: nobody joins it, so a failure
+            // has to be logged here or it would be dropped silently.
+            thread::spawn(move || {
+                let run = || {
+                    let mut client = Client::connect(connstr.as_str(), NoTls)?;
+                    handle_migrations(&mut client)
+                };
+                if let Err(e) = run() {
+                    tracing::error!("failed to run migrations: {:#}", e);
+                }
+            });
+        }
        Ok(())
    }

@@ -807,6 +814,10 @@ impl ComputeNode {
            handle_grants(&spec, &mut client, self.connstr.as_str())?;
            handle_extensions(&spec, &mut client)?;
            handle_extension_neon(&mut client)?;
+            // We can skip handle_migrations here because a new migration can only appear
+            // if we have a new version of the compute_ctl binary, which can only happen
+            // if compute got restarted, in which case we'll end up inside of apply_config
+            // instead of reconfigure.
        }

        // 'Close' connection
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -727,3 +727,88 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {

    Ok(())
 }
+
+/// Apply the not-yet-applied entries of the built-in migration list.
+///
+/// Bookkeeping lives in `neon_migration.migration_id`, whose `id` column is
+/// used as the index of the first migration still to run. The schema, table
+/// and initial row are created if missing. Pending migrations and the `id`
+/// update are sent between one `BEGIN` and one `COMMIT`; any query error is
+/// returned to the caller via `?` without an explicit `ROLLBACK`.
+#[instrument(skip_all)]
+pub fn handle_migrations(client: &mut Client) -> Result<()> {
+    info!("handle migrations");
+
+    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+    // !BE SURE TO ONLY ADD MIGRATIONS TO THE END OF THIS ARRAY. IF YOU DO NOT, VERY VERY BAD THINGS MAY HAPPEN!
+    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+    let migrations = [
+        "ALTER ROLE neon_superuser BYPASSRLS",
+        r#"
+DO $$
+DECLARE
+    role_name text;
+BEGIN
+    FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
+    LOOP
+        RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
+        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
+    END LOOP;
+
+    FOR role_name IN SELECT rolname FROM pg_roles
+        WHERE
+            NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
+    LOOP
+        RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
+        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
+    END LOOP;
+END $$;
+"#,
+    ];
+
+    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
+    client.simple_query(query)?;
+
+    query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
+    client.simple_query(query)?;
+
+    query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
+    client.simple_query(query)?;
+
+    query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
+    client.simple_query(query)?;
+
+    query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
+    client.simple_query(query)?;
+
+    query = "SELECT id FROM neon_migration.migration_id";
+    let row = client.query_one(query, &[])?;
+    let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
+    let starting_migration_id = current_migration;
+
+    query = "BEGIN";
+    client.simple_query(query)?;
+
+    while current_migration < migrations.len() {
+        info!("Running migration:\n{}\n", migrations[current_migration]);
+        client.simple_query(migrations[current_migration])?;
+        current_migration += 1;
+    }
+    let setval = format!(
+        "UPDATE neon_migration.migration_id SET id={}",
+        migrations.len()
+    );
+    client.simple_query(&setval)?;
+
+    query = "COMMIT";
+    client.simple_query(query)?;
+
+    // The stored id is read from the database and may be larger than the
+    // compiled-in list; a plain `-` would underflow in that case.
+    info!(
+        "Ran {} migrations",
+        migrations.len().saturating_sub(starting_migration_id)
+    );
+    Ok(())
+}
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -19,6 +19,7 @@ hex.workspace = true
 hyper.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["blocking", "json"] }
+scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 serde_with.workspace = true
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -14,6 +14,7 @@ hyper.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 postgres_connection.workspace = true
+scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 thiserror.workspace = true
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -17,6 +17,8 @@ enum PlacementPolicy {
    /// Production-ready way to attach a tenant: one attached pageserver and
    /// some number of secondaries.
    Double(usize),
+    /// Do not attach to any pageservers
+    Detached,
 }

 #[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -66,7 +66,7 @@ async fn main() -> anyhow::Result<()> {
        jwt_token: args.jwt_token,
    };

-    let persistence = Arc::new(Persistence::new(&args.path).await);
+    let persistence = Arc::new(Persistence::spawn(&args.path).await);

    let service = Service::spawn(config, persistence).await?;

--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -11,6 +11,7 @@ use pageserver_api::{
 };
 use postgres_connection::parse_host_port;
 use serde::{Deserialize, Serialize};
+use tracing::info;
 use utils::{
    generation::Generation,
    id::{NodeId, TenantId},
@@ -20,46 +21,28 @@ use crate::{node::Node, PlacementPolicy};

 /// Placeholder for storage.  This will be replaced with a database client.
 pub struct Persistence {
-    state: std::sync::Mutex<PersistentState>,
+    inner: std::sync::Mutex<Inner>,
+}
+
+struct Inner {
+    state: PersistentState,
+    write_queue_tx: tokio::sync::mpsc::UnboundedSender<PendingWrite>,
 }

-// Top level state available to all HTTP handlers
 #[derive(Serialize, Deserialize)]
 struct PersistentState {
    tenants: HashMap<TenantShardId, TenantShardPersistence>,
-
-    #[serde(skip)]
-    path: Utf8PathBuf,
 }

-/// A convenience for serializing the state inside a sync lock, and then
-/// writing it to disk outside of the lock.  This will go away when switching
-/// to a database backend.
 struct PendingWrite {
    bytes: Vec<u8>,
-    path: Utf8PathBuf,
-}
-
-impl PendingWrite {
-    async fn commit(&self) -> anyhow::Result<()> {
-        tokio::fs::write(&self.path, &self.bytes).await?;
-
-        Ok(())
-    }
+    done_tx: tokio::sync::oneshot::Sender<()>,
 }

 impl PersistentState {
-    fn save(&self) -> PendingWrite {
-        PendingWrite {
-            bytes: serde_json::to_vec(self).expect("Serialization error"),
-            path: self.path.clone(),
-        }
-    }
-
    async fn load(path: &Utf8Path) -> anyhow::Result<Self> {
        let bytes = tokio::fs::read(path).await?;
        let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
-        decoded.path = path.to_owned();

        for (tenant_id, tenant) in &mut decoded.tenants {
            // Backward compat: an old attachments.json from before PR #6251, replace
@@ -88,7 +71,6 @@ impl PersistentState {
                tracing::info!("Will create state file at {}", path);
                Self {
                    tenants: HashMap::new(),
-                    path: path.to_owned(),
                }
            }
            Err(e) => {
@@ -99,13 +81,74 @@ impl PersistentState {
 }

 impl Persistence {
-    pub async fn new(path: &Utf8Path) -> Self {
+    pub async fn spawn(path: &Utf8Path) -> Self {
+        let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
        let state = PersistentState::load_or_new(path).await;
+        tokio::spawn(Self::writer_task(rx, path.to_owned()));
        Self {
-            state: std::sync::Mutex::new(state),
+            inner: std::sync::Mutex::new(Inner {
+                state,
+                write_queue_tx: tx,
+            }),
        }
    }

+    async fn writer_task(
+        mut rx: tokio::sync::mpsc::UnboundedReceiver<PendingWrite>,
+        path: Utf8PathBuf,
+    ) {
+        scopeguard::defer! {
+            info!("persistence writer task exiting");
+        };
+        loop {
+            match rx.recv().await {
+                Some(write) => {
+                    tokio::task::spawn_blocking({
+                        let path = path.clone();
+                        move || {
+                            let tmp_path =
+                                utils::crashsafe::path_with_suffix_extension(&path, "___new");
+                            utils::crashsafe::overwrite(&path, &tmp_path, &write.bytes)
+                        }
+                    })
+                    .await
+                    .expect("spawn_blocking")
+                    .expect("write file");
+                    let _ = write.done_tx.send(()); // receiver may lose interest any time
+                }
+                None => {
+                    return;
+                }
+            }
+        }
+    }
+
+    /// Perform a modification on our [`PersistentState`].
+    /// Return a future that completes once our modification has been persisted.
+    /// The output of the future is the return value of the `txn` closure.
+    async fn mutating_transaction<F, R>(&self, txn: F) -> R
+    where
+        F: FnOnce(&mut PersistentState) -> R,
+    {
+        let (ret, done_rx) = {
+            let mut inner = self.inner.lock().unwrap();
+            let ret = txn(&mut inner.state);
+            let (done_tx, done_rx) = tokio::sync::oneshot::channel();
+            let write = PendingWrite {
+                bytes: serde_json::to_vec(&inner.state).expect("Serialization error"),
+                done_tx,
+            };
+            inner
+                .write_queue_tx
+                .send(write)
+                .expect("writer task always outlives self");
+            (ret, done_rx)
+        };
+        // the write task can go away once we start .await'ing
+        let _: () = done_rx.await.expect("writer task dead, check logs");
+        ret
+    }
+
    /// When registering a node, persist it so that on next start we will be able to
    /// iterate over known nodes to synchronize their tenant shard states with our observed state.
    pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> {
@@ -149,8 +192,8 @@ impl Persistence {

    /// At startup, we populate our map of tenant shards from persistent storage.
    pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result<Vec<TenantShardPersistence>> {
-        let locked = self.state.lock().unwrap();
-        Ok(locked.tenants.values().cloned().collect())
+        let inner = self.inner.lock().unwrap();
+        Ok(inner.state.tenants.values().cloned().collect())
    }

    /// Tenants must be persisted before we schedule them for the first time.  This enables us
@@ -159,8 +202,7 @@ impl Persistence {
        &self,
        shards: Vec<TenantShardPersistence>,
    ) -> anyhow::Result<()> {
-        let write = {
-            let mut locked = self.state.lock().unwrap();
+        self.mutating_transaction(|locked| {
            for shard in shards {
                let tenant_shard_id = TenantShardId {
                    tenant_id: TenantId::from_str(shard.tenant_id.as_str())?,
@@ -170,12 +212,9 @@ impl Persistence {

                locked.tenants.insert(tenant_shard_id, shard);
            }
-            locked.save()
-        };
-
-        write.commit().await?;
-
-        Ok(())
+            Ok(())
+        })
+        .await
    }

    /// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically
@@ -184,49 +223,52 @@ impl Persistence {
    pub(crate) async fn increment_generation(
        &self,
        tenant_shard_id: TenantShardId,
-        node_id: Option<NodeId>,
+        node_id: NodeId,
    ) -> anyhow::Result<Generation> {
-        let (write, gen) = {
-            let mut locked = self.state.lock().unwrap();
+        self.mutating_transaction(|locked| {
            let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
                anyhow::bail!("Tried to increment generation of unknown shard");
            };

-            // If we're called with a None pageserver, we need only update the generation
-            // record to disassociate it with this pageserver, not actually increment the number, as
-            // the increment is guaranteed to happen the next time this tenant is attached.
-            if node_id.is_some() {
-                shard.generation += 1;
-            }
+            shard.generation += 1;
+            shard.generation_pageserver = Some(node_id);

-            shard.generation_pageserver = node_id;
            let gen = Generation::new(shard.generation);
-            (locked.save(), gen)
-        };
+            Ok(gen)
+        })
+        .await
+    }

-        write.commit().await?;
-        Ok(gen)
+    /// Persist that a tenant shard is detached: clears `generation_pageserver`
+    /// and stores the serialized `PlacementPolicy::Detached` as its placement
+    /// policy. The generation number is left untouched. Fails for unknown shards.
+    pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
+        self.mutating_transaction(|locked| {
+            let Some(shard) = locked.tenants.get_mut(&tenant_shard_id) else {
+                anyhow::bail!("Tried to detach unknown shard");
+            };
+            shard.generation_pageserver = None;
+            shard.placement_policy = serde_json::to_string(&PlacementPolicy::Detached).unwrap();
+            Ok(())
+        })
+        .await
    }

    pub(crate) async fn re_attach(
        &self,
        node_id: NodeId,
    ) -> anyhow::Result<HashMap<TenantShardId, Generation>> {
-        let (write, result) = {
+        self.mutating_transaction(|locked| {
            let mut result = HashMap::new();
-            let mut locked = self.state.lock().unwrap();
            for (tenant_shard_id, shard) in locked.tenants.iter_mut() {
                if shard.generation_pageserver == Some(node_id) {
                    shard.generation += 1;
                    result.insert(*tenant_shard_id, Generation::new(shard.generation));
                }
            }
-
-            (locked.save(), result)
-        };
-
-        write.commit().await?;
-        Ok(result)
+            Ok(result)
+        })
+        .await
    }

    // TODO: when we start shard splitting, we must durably mark the tenant so that
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -296,7 +296,7 @@ impl Reconciler {
        // Increment generation before attaching to new pageserver
        self.generation = self
            .persistence
-            .increment_generation(self.tenant_shard_id, Some(dest_ps_id))
+            .increment_generation(self.tenant_shard_id, dest_ps_id)
            .await?;

        let dest_conf = build_location_config(
@@ -395,7 +395,7 @@ impl Reconciler {
                    // as locations with unknown (None) observed state.
                    self.generation = self
                        .persistence
-                        .increment_generation(self.tenant_shard_id, Some(node_id))
+                        .increment_generation(self.tenant_shard_id, node_id)
                        .await?;
                    wanted_conf.generation = self.generation.into();
                    tracing::info!("Observed configuration requires update.");
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -362,13 +362,14 @@ impl Service {
            );
        }

-        let new_generation = if attach_req.node_id.is_some() {
+        let new_generation = if let Some(req_node_id) = attach_req.node_id {
            Some(
                self.persistence
-                    .increment_generation(attach_req.tenant_shard_id, attach_req.node_id)
+                    .increment_generation(attach_req.tenant_shard_id, req_node_id)
                    .await?,
            )
        } else {
+            self.persistence.detach(attach_req.tenant_shard_id).await?;
            None
        };

@@ -380,6 +381,11 @@ impl Service {

        if let Some(new_generation) = new_generation {
            tenant_state.generation = new_generation;
+        } else {
+            // This is a detach notification.  We must update placement policy to avoid re-attaching
+            // during background scheduling/reconciliation, or during attachment service restart.
+            assert!(attach_req.node_id.is_none());
+            tenant_state.policy = PlacementPolicy::Detached;
        }

        if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
@@ -407,6 +413,7 @@ impl Service {
            "attach_hook: tenant {} set generation {:?}, pageserver {}",
            attach_req.tenant_shard_id,
            tenant_state.generation,
+            // TODO: this is an odd number of 0xf's
            attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
        );

@@ -868,7 +875,6 @@ impl Service {
            } else {
                let old_attached = shard.intent.attached;

-                shard.intent.attached = Some(migrate_req.node_id);
                match shard.policy {
                    PlacementPolicy::Single => {
                        shard.intent.secondary.clear();
@@ -882,7 +888,13 @@ impl Service {
                            shard.intent.secondary.push(old_attached);
                        }
                    }
+                    PlacementPolicy::Detached => {
+                        return Err(ApiError::BadRequest(anyhow::anyhow!(
+                            "Cannot migrate a tenant that is PlacementPolicy::Detached: configure it to an attached policy first"
+                        )))
+                    }
                }
+                shard.intent.attached = Some(migrate_req.node_id);

                tracing::info!("Migrating: new intent {:?}", shard.intent);
                shard.sequence = shard.sequence.next();
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -312,6 +312,18 @@ impl TenantState {
                    modified = true;
                }
            }
+            Detached => {
+                // Should have no attached or secondary pageservers
+                if self.intent.attached.is_some() {
+                    self.intent.attached = None;
+                    modified = true;
+                }
+
+                if !self.intent.secondary.is_empty() {
+                    self.intent.secondary.clear();
+                    modified = true;
+                }
+            }
        }

        if modified {
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -9,7 +9,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
 use postgres_connection::parse_host_port;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
-use std::{path::PathBuf, process::Child, str::FromStr};
+use std::{path::PathBuf, str::FromStr};
 use tracing::instrument;
 use utils::{
    auth::{Claims, Scope},
@@ -220,7 +220,7 @@ impl AttachmentService {
            .expect("non-Unicode path")
    }

-    pub async fn start(&self) -> anyhow::Result<Child> {
+    pub async fn start(&self) -> anyhow::Result<()> {
        let path_str = self.path.to_string_lossy();

        let mut args = vec!["-l", &self.listen, "-p", &path_str]
@@ -254,6 +254,7 @@ impl AttachmentService {
        )
        .await;

+        // TODO: shouldn't we bail if we fail to spawn the process?
        for ps_conf in &self.env.pageservers {
            let (pg_host, pg_port) =
                parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -17,7 +17,7 @@ use std::io::Write;
 use std::os::unix::prelude::AsRawFd;
 use std::os::unix::process::CommandExt;
 use std::path::Path;
-use std::process::{Child, Command};
+use std::process::Command;
 use std::time::Duration;
 use std::{fs, io, thread};

@@ -60,7 +60,7 @@ pub async fn start_process<F, Fut, AI, A, EI>(
    envs: EI,
    initial_pid_file: InitialPidFile,
    process_status_check: F,
-) -> anyhow::Result<Child>
+) -> anyhow::Result<()>
 where
    F: Fn() -> Fut,
    Fut: std::future::Future<Output = anyhow::Result<bool>>,
@@ -98,7 +98,7 @@ where
        InitialPidFile::Expect(path) => path,
    };

-    let mut spawned_process = filled_cmd.spawn().with_context(|| {
+    let spawned_process = filled_cmd.spawn().with_context(|| {
        format!("Could not spawn {process_name}, see console output and log files for details.")
    })?;
    let pid = spawned_process.id();
@@ -106,12 +106,26 @@ where
        i32::try_from(pid)
            .with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
    );
+    // set up a scopeguard to kill & wait for the child in case we panic or bail below
+    let spawned_process = scopeguard::guard(spawned_process, |mut spawned_process| {
+        println!("SIGKILL & wait the started process");
+        (|| {
+            // TODO: use another signal that can be caught by the child so it can clean up any children it spawned (e.g., walredo).
+            spawned_process.kill().context("SIGKILL child")?;
+            spawned_process.wait().context("wait() for child process")?;
+            anyhow::Ok(())
+        })()
+        .with_context(|| format!("scopeguard kill&wait child {process_name:?}"))
+        .unwrap();
+    });

    for retries in 0..RETRIES {
        match process_started(pid, pid_file_to_check, &process_status_check).await {
            Ok(true) => {
-                println!("\n{process_name} started, pid: {pid}");
-                return Ok(spawned_process);
+                println!("\n{process_name} started and passed status check, pid: {pid}");
+                // leak the child process, it'll outlive this neon_local invocation
+                drop(scopeguard::ScopeGuard::into_inner(spawned_process));
+                return Ok(());
            }
            Ok(false) => {
                if retries == NOTICE_AFTER_RETRIES {
@@ -126,16 +140,15 @@ where
                thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
            }
            Err(e) => {
-                println!("{process_name} failed to start: {e:#}");
-                if let Err(e) = spawned_process.kill() {
-                    println!("Could not stop {process_name} subprocess: {e:#}")
-                };
+                println!("error starting process {process_name:?}: {e:#}");
                return Err(e);
            }
        }
    }
    println!();
-    anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
+    anyhow::bail!(
+        "{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
+    );
 }

 /// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -57,7 +57,7 @@ use crate::local_env::LocalEnv;
 use crate::postgresql_conf::PostgresConf;

 use compute_api::responses::{ComputeState, ComputeStatus};
-use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
+use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};

 // contents of a endpoint.json file
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
@@ -70,6 +70,7 @@ pub struct EndpointConf {
    http_port: u16,
    pg_version: u32,
    skip_pg_catalog_updates: bool,
+    features: Vec<ComputeFeature>,
 }

 //
@@ -140,6 +141,7 @@ impl ComputeControlPlane {
            // with this we basically test a case of waking up an idle compute, where
            // we also skip catalog updates in the cloud.
            skip_pg_catalog_updates: true,
+            features: vec![],
        });

        ep.create_endpoint_dir()?;
@@ -154,6 +156,7 @@ impl ComputeControlPlane {
                pg_port,
                pg_version,
                skip_pg_catalog_updates: true,
+                features: vec![],
            })?,
        )?;
        std::fs::write(
@@ -215,6 +218,9 @@ pub struct Endpoint {

    // Optimizations
    skip_pg_catalog_updates: bool,
+
+    // Feature flags
+    features: Vec<ComputeFeature>,
 }

 impl Endpoint {
@@ -244,6 +250,7 @@ impl Endpoint {
            tenant_id: conf.tenant_id,
            pg_version: conf.pg_version,
            skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
+            features: conf.features,
        })
    }

@@ -519,7 +526,7 @@ impl Endpoint {
            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
            format_version: 1.0,
            operation_uuid: None,
-            features: vec![],
+            features: self.features.clone(),
            cluster: Cluster {
                cluster_id: None, // project ID: not used
                name: None,       // project name: not used
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -11,7 +11,7 @@ use std::io;
 use std::io::Write;
 use std::num::NonZeroU64;
 use std::path::PathBuf;
-use std::process::{Child, Command};
+use std::process::Command;
 use std::time::Duration;

 use anyhow::{bail, Context};
@@ -161,7 +161,7 @@ impl PageServerNode {
            .expect("non-Unicode path")
    }

-    pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
+    pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
        self.start_node(config_overrides, false).await
    }

@@ -207,7 +207,7 @@ impl PageServerNode {
        &self,
        config_overrides: &[&str],
        update_config: bool,
-    ) -> anyhow::Result<Child> {
+    ) -> anyhow::Result<()> {
        // TODO: using a thread here because start_process() is not async but we need to call check_status()
        let datadir = self.repo_path();
        print!(
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -7,7 +7,6 @@
 //! ```
 use std::io::Write;
 use std::path::PathBuf;
-use std::process::Child;
 use std::{io, result};

 use anyhow::Context;
@@ -104,7 +103,7 @@ impl SafekeeperNode {
            .expect("non-Unicode path")
    }

-    pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
+    pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
        print!(
            "Starting safekeeper at '{}' in '{}'",
            self.pg_connection_config.raw_address(),
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -90,6 +90,9 @@ pub enum ComputeFeature {
    /// track short-lived connections as user activity.
    ActivityMonitorExperimental,

+    /// Enable running migrations
+    Migrations,
+
    /// This is a special feature flag that is used to represent unknown feature flags.
    /// Basically all unknown to enum flags are represented as this one. See unit test
    /// `parse_unknown_features()` for more details.
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -1,9 +1,11 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
+use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
+use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
-use std::fmt;
+use std::{fmt, ops::Range};

-use crate::reltag::{BlockNumber, RelTag};
+use crate::reltag::{BlockNumber, RelTag, SlruKind};

 /// Key used in the Repository kv-store.
 ///
@@ -143,12 +145,390 @@ impl Key {
    }
 }

+// Layout of the Key address space
+//
+// The Key struct, used to address the underlying key-value store, consists of
+// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map
+// all the data and metadata keys into those 18 bytes.
+//
+// Principles for the mapping:
+//
+// - Things that are often accessed or modified together, should be close to
+//   each other in the key space. For example, if a relation is extended by one
+//   block, we create a new key-value pair for the block data, and update the
+//   relation size entry. Because of that, the RelSize key comes after all the
+//   RelBlocks of a relation: the RelSize and the last RelBlock are always next
+//   to each other.
+//
+// The key space is divided into four major sections, identified by the first
+// byte, and they form a hierarchy:
+//
+// 00 Relation data and metadata
+//
+//   DbDir    () -> (dbnode, spcnode)
+//   Filenodemap
+//   RelDir   -> relnode forknum
+//       RelBlocks
+//       RelSize
+//
+// 01 SLRUs
+//
+//   SlruDir  kind
+//   SlruSegBlocks segno
+//   SlruSegSize
+//
+// 02 pg_twophase
+//
+// 03 misc
+//    Controlfile
+//    checkpoint
+//    pg_version
+//    aux files
+//
+// Below is a full list of the keyspace allocation:
+//
+// DbDir:
+// 00 00000000 00000000 00000000 00   00000000
+//
+// Filenodemap:
+// 00 SPCNODE  DBNODE   00000000 00   00000000
+//
+// RelDir:
+// 00 SPCNODE  DBNODE   00000000 00   00000001 (Postgres never uses relfilenode 0)
+//
+// RelBlock:
+// 00 SPCNODE  DBNODE   RELNODE  FORK BLKNUM
+//
+// RelSize:
+// 00 SPCNODE  DBNODE   RELNODE  FORK FFFFFFFF
+//
+// SlruDir:
+// 01 kind     00000000 00000000 00   00000000
+//
+// SlruSegBlock:
+// 01 kind     00000001 SEGNO    00   BLKNUM
+//
+// SlruSegSize:
+// 01 kind     00000001 SEGNO    00   FFFFFFFF
+//
+// TwoPhaseDir:
+// 02 00000000 00000000 00000000 00   00000000
+//
+// TwoPhaseFile:
+// 02 00000000 00000000 00000000 00   XID
+//
+// ControlFile:
+// 03 00000000 00000000 00000000 00   00000000
+//
+// Checkpoint:
+// 03 00000000 00000000 00000000 00   00000001
+//
+// AuxFiles:
+// 03 00000000 00000000 00000000 00   00000002
+//
+
+//-- Section 00: relation data and metadata
+
+/// The all-zero key in section `0x00`; the keyspace layout lists it as the `DbDir` entry.
+pub const DBDIR_KEY: Key = Key {
+    field1: 0x00,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 0,
+};
+
+/// Half-open key range for one `(spcnode, dbnode)` pair in section `0x00`.
+///
+/// The range starts at the key whose `field4`..`field6` are all zero and ends (exclusively)
+/// at the key whose `field4`..`field6` are all ones.
+#[inline(always)]
+pub fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
+    let lower = Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0,
+        field5: 0,
+        field6: 0,
+    };
+    // Same section/tablespace/database prefix, with every trailing field at its maximum.
+    let upper = Key {
+        field4: 0xffffffff,
+        field5: 0xff,
+        field6: 0xffffffff,
+        ..lower
+    };
+
+    lower..upper
+}
+
+/// Key in section `0x00` for `(spcnode, dbnode)` with `field4`..`field6` all zero; the
+/// keyspace layout lists this form as `Filenodemap`.
+#[inline(always)]
+pub fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
+    Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0,
+        field5: 0,
+        field6: 0,
+    }
+}
+
+/// Key in section `0x00` for `(spcnode, dbnode)` with `field4 == 0` and `field6 == 1`; the
+/// keyspace layout lists this form as `RelDir`.
+#[inline(always)]
+pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
+    Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0,
+        field5: 0,
+        field6: 1,
+    }
+}
+
+/// Key of block `blknum` of relation fork `rel`: the tag's `spcnode`, `dbnode`, `relnode` and
+/// `forknum` fill `field2`..`field5`, and the block number goes into `field6`.
+#[inline(always)]
+pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+    Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: blknum,
+    }
+}
+
+/// Key of the `RelSize` entry of relation fork `rel`: same prefix as the fork's block keys,
+/// with `field6` set to `0xffffffff`.
+#[inline(always)]
+pub fn rel_size_to_key(rel: RelTag) -> Key {
+    Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: 0xffffffff,
+    }
+}
+
+/// Half-open key range covering one relation fork.
+///
+/// Starts at block 0 of `rel` and ends (exclusively) at block 0 of the next fork number, so
+/// it contains every block key of the fork as well as its `0xffffffff` size key.
+///
+/// NOTE(review): `rel.forknum + 1` overflows if `forknum` is already the maximum of its
+/// integer type; confirm callers never pass such a fork number.
+#[inline(always)]
+pub fn rel_key_range(rel: RelTag) -> Range<Key> {
+    let fork_start = Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: 0,
+    };
+    let next_fork_start = Key {
+        field5: rel.forknum + 1,
+        ..fork_start
+    };
+
+    fork_start..next_fork_start
+}
+
+//-- Section 01: SLRUs
+
+/// Encode an [`SlruKind`] as the `field2` value shared by every SLRU key.
+///
+/// Keeping the mapping in one place guarantees that the directory, block, size and range
+/// keys of a given kind always agree on their `field2`.
+#[inline(always)]
+fn slru_kind_to_field2(kind: SlruKind) -> u32 {
+    match kind {
+        SlruKind::Clog => 0x00,
+        SlruKind::MultiXactMembers => 0x01,
+        SlruKind::MultiXactOffsets => 0x02,
+    }
+}
+
+/// Key of the `SlruDir` entry for `kind`: section `0x01`, with `field3`..`field6` all zero.
+#[inline(always)]
+pub fn slru_dir_to_key(kind: SlruKind) -> Key {
+    Key {
+        field1: 0x01,
+        field2: slru_kind_to_field2(kind),
+        field3: 0,
+        field4: 0,
+        field5: 0,
+        field6: 0,
+    }
+}
+
+/// Key of block `blknum` in segment `segno` of the SLRU `kind`.
+///
+/// `field3` is 1 (which distinguishes segment keys from the `SlruDir` key), `field4` holds
+/// the segment number and `field6` the block number.
+#[inline(always)]
+pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
+    Key {
+        field1: 0x01,
+        field2: slru_kind_to_field2(kind),
+        field3: 1,
+        field4: segno,
+        field5: 0,
+        field6: blknum,
+    }
+}
+
+/// Key of the `SlruSegSize` entry for segment `segno` of the SLRU `kind`: same prefix as the
+/// segment's block keys, with `field6` set to `0xffffffff`.
+#[inline(always)]
+pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
+    Key {
+        field1: 0x01,
+        field2: slru_kind_to_field2(kind),
+        field3: 1,
+        field4: segno,
+        field5: 0,
+        field6: 0xffffffff,
+    }
+}
+
+/// Half-open key range covering segment `segno` of the SLRU `kind`.
+///
+/// Starts at block 0 (`field5 == 0`) and ends (exclusively) at `field5 == 1`, so it contains
+/// every block key of the segment as well as its `0xffffffff` size key.
+#[inline(always)]
+pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
+    let segment_start = Key {
+        field1: 0x01,
+        field2: slru_kind_to_field2(kind),
+        field3: 1,
+        field4: segno,
+        field5: 0,
+        field6: 0,
+    };
+    let segment_end = Key {
+        field5: 1,
+        ..segment_start
+    };
+
+    segment_start..segment_end
+}
+
+//-- Section 02: pg_twophase
+
+/// The all-zero key in section `0x02`; the keyspace layout lists it as `TwoPhaseDir`.
+pub const TWOPHASEDIR_KEY: Key = Key {
+    field1: 0x02,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 0,
+};
+
+/// Key of the `TwoPhaseFile` entry for transaction `xid`: section `0x02` with the
+/// transaction id stored in `field6` and every other field zero.
+#[inline(always)]
+pub fn twophase_file_key(xid: TransactionId) -> Key {
+    Key {
+        field1: 0x02,
+        field2: 0,
+        field3: 0,
+        field4: 0,
+        field5: 0,
+        field6: xid,
+    }
+}
+
+/// Half-open key range containing exactly the two-phase file key of `xid`.
+///
+/// The exclusive end is the key of `xid + 1`. When `xid` is the largest transaction id the
+/// increment wraps to 0, and the carry is placed in `field5` so the end still sorts after
+/// the start.
+#[inline(always)]
+pub fn twophase_key_range(xid: TransactionId) -> Range<Key> {
+    let start = Key {
+        field1: 0x02,
+        field2: 0,
+        field3: 0,
+        field4: 0,
+        field5: 0,
+        field6: xid,
+    };
+
+    let end = match xid.checked_add(1) {
+        // Common case: the next transaction id fits, no carry needed.
+        Some(next_xid) => Key {
+            field6: next_xid,
+            ..start
+        },
+        // Wrapped around: carry into field5 and restart field6 at zero.
+        None => Key {
+            field5: 1,
+            field6: 0,
+            ..start
+        },
+    };
+
+    start..end
+}
+
+//-- Section 03: Control file
+/// The all-zero key in section `0x03`; the keyspace layout lists it as `ControlFile`.
+pub const CONTROLFILE_KEY: Key = Key {
+    field1: 0x03,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 0,
+};
+
+/// Key in section `0x03` with `field6 == 1`; the keyspace layout lists it as `Checkpoint`.
+pub const CHECKPOINT_KEY: Key = Key {
+    field1: 0x03,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 1,
+};
+
+/// Key in section `0x03` with `field6 == 2`; the keyspace layout lists it as `AuxFiles`.
+pub const AUX_FILES_KEY: Key = Key {
+    field1: 0x03,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 2,
+};
+
+// Reverse mappings for a few Keys.
+// These are needed by WAL redo manager.
+
+// AUX_FILES currently stores only data for logical replication (slots etc), and
+// we don't preserve these on a branch because safekeepers can't follow timeline
+// switch (and generally it likely should be optional), so ignore these.
+/// Returns `true` for every key except [`AUX_FILES_KEY`].
+#[inline(always)]
+pub fn is_inherited_key(key: Key) -> bool {
+    key != AUX_FILES_KEY
+}
+
+/// Returns `true` if `key` is a block key (not the `0xffffffff` size key) in section `0x00`
+/// with a non-zero `field4` and `field5 == FSM_FORKNUM`.
+#[inline(always)]
+pub fn is_rel_fsm_block_key(key: Key) -> bool {
+    // Must be in the relation section and name an actual relation.
+    if key.field1 != 0x00 || key.field4 == 0 {
+        return false;
+    }
+
+    key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
+}
+
+/// Returns `true` if `key` is a block key (not the `0xffffffff` size key) in section `0x00`
+/// with a non-zero `field4` and `field5 == VISIBILITYMAP_FORKNUM`.
+#[inline(always)]
+pub fn is_rel_vm_block_key(key: Key) -> bool {
+    // Must be in the relation section and name an actual relation.
+    if key.field1 != 0x00 || key.field4 == 0 {
+        return false;
+    }
+
+    key.field5 == VISIBILITYMAP_FORKNUM && key.field6 != 0xffffffff
+}
+
+/// Decode an SLRU key into `(kind, segment number, block number)`.
+///
+/// The kind is read from `field2`, the segment number from `field4` and the block number
+/// from `field6`.
+///
+/// Returns an error if `field1` is not `0x01` or if `field2` is not a known SLRU kind.
+#[inline(always)]
+pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
+    if key.field1 != 0x01 {
+        anyhow::bail!("unexpected value kind 0x{:02x}", key.field1);
+    }
+
+    let kind = match key.field2 {
+        0x00 => SlruKind::Clog,
+        0x01 => SlruKind::MultiXactMembers,
+        0x02 => SlruKind::MultiXactOffsets,
+        _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
+    };
+
+    Ok((kind, key.field4, key.field6))
+}
+
+/// Returns `true` if `key` is an SLRU segment block key: section `0x01`, `field3 == 1`
+/// (so not the `SlruDir` key) and `field6 != 0xffffffff` (so not an `SlruSegSize` key).
+#[inline(always)]
+pub fn is_slru_block_key(key: Key) -> bool {
+    // Outside the SLRU section nothing can match.
+    if key.field1 != 0x01 {
+        return false;
+    }
+
+    let is_segment_key = key.field3 == 0x00000001;
+    let is_size_key = key.field6 == 0xffffffff;
+    is_segment_key && !is_size_key
+}
+
 #[inline(always)]
 pub fn is_rel_block_key(key: &Key) -> bool {
    key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
 }

 /// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
+#[inline(always)]
 pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
    Ok(match key.field1 {
        0x00 => (
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -104,6 +104,7 @@ pub struct KeySpaceAccum {
    accum: Option<Range<Key>>,

    ranges: Vec<Range<Key>>,
+    size: u64,
 }

 impl KeySpaceAccum {
@@ -111,6 +112,7 @@ impl KeySpaceAccum {
        Self {
            accum: None,
            ranges: Vec::new(),
+            size: 0,
        }
    }

@@ -121,6 +123,8 @@ impl KeySpaceAccum {

    #[inline(always)]
    pub fn add_range(&mut self, range: Range<Key>) {
+        self.size += key_range_size(&range) as u64;
+
        match self.accum.as_mut() {
            Some(accum) => {
                if range.start == accum.end {
@@ -146,6 +150,23 @@ impl KeySpaceAccum {
            ranges: self.ranges,
        }
    }
+
+    /// Take everything accumulated so far as a [`KeySpace`] and leave `self` reset to the
+    /// state produced by [`KeySpaceAccum::new`], ready to accumulate again.
+    pub fn consume_keyspace(&mut self) -> KeySpace {
+        // Replace ourselves with a fresh accumulator and work on the old contents.
+        let mut drained = std::mem::replace(self, KeySpaceAccum::new());
+
+        // Flush the range that was still being extended, if any.
+        if let Some(pending) = drained.accum.take() {
+            drained.ranges.push(pending);
+        }
+
+        KeySpace {
+            ranges: drained.ranges,
+        }
+    }
+
+    /// Returns the running total maintained by `add_range`, which adds `key_range_size` of
+    /// every range passed to it.
+    pub fn size(&self) -> u64 {
+        self.size
+    }
 }

 ///
@@ -254,6 +275,30 @@ mod tests {
        }
    }

+    /// Checks that `consume_keyspace` returns the accumulated ranges, resets `size()` to
+    /// zero, and leaves the accumulator usable afterwards.
+    #[test]
+    fn keyspace_consume() {
+        let ranges = vec![kr(0..10), kr(20..35), kr(40..45)];
+
+        let mut accum = KeySpaceAccum::new();
+        for range in &ranges {
+            accum.add_range(range.clone());
+        }
+
+        // The tracked size is the sum of the individual range sizes.
+        let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum();
+        assert_eq!(accum.size(), expected_size);
+
+        // Consuming hands back all ranges and resets the size.
+        assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
+        assert_eq!(accum.size(), 0);
+
+        // Consuming an empty accumulator yields an empty keyspace.
+        assert_ks_eq(&accum.consume_keyspace(), vec![]);
+        assert_eq!(accum.size(), 0);
+
+        // The accumulator can be refilled after having been consumed.
+        for range in &ranges {
+            accum.add_range(range.clone());
+        }
+        assert_ks_eq(&accum.to_keyspace(), ranges);
+    }
+
    #[test]
    fn keyspace_add_range() {
        // two separate ranges
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -111,7 +111,19 @@ impl RelTag {
 /// These files are divided into segments, which are divided into
 /// pages of the same BLCKSZ as used for relation files.
 ///
-#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(
+    Debug,
+    Clone,
+    Copy,
+    Hash,
+    Serialize,
+    Deserialize,
+    PartialEq,
+    Eq,
+    PartialOrd,
+    Ord,
+    strum_macros::EnumIter,
+)]
 pub enum SlruKind {
    Clog,
    MultiXactMembers,
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -8,6 +8,7 @@ use std::pin::Pin;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
+use std::time::SystemTime;

 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
@@ -23,6 +24,7 @@ use futures::stream::Stream;
 use futures_util::StreamExt;
 use http_types::{StatusCode, Url};
 use tokio::time::Instant;
+use tokio_util::sync::CancellationToken;
 use tracing::debug;

 use crate::s3_bucket::RequestKind;
@@ -183,7 +185,6 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
    }
 }

-#[async_trait::async_trait]
 impl RemoteStorage for AzureBlobStorage {
    async fn list(
        &self,
@@ -371,6 +372,20 @@ impl RemoteStorage for AzureBlobStorage {
            copy_status = status;
        }
    }
+
+    /// Time travel recovery is not available for this backend yet: every call returns an
+    /// error and none of the arguments are used.
+    async fn time_travel_recover(
+        &self,
+        _prefix: Option<&RemotePath>,
+        _timestamp: SystemTime,
+        _done_if_after: SystemTime,
+        _cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        // TODO use Azure point in time recovery feature for this
+        // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
+        anyhow::bail!("time travel recovery for azure blob storage is not implemented")
+    }
 }

 pin_project_lite::pin_project! {
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -25,6 +25,7 @@ use bytes::Bytes;
 use futures::stream::Stream;
 use serde::{Deserialize, Serialize};
 use tokio::sync::Semaphore;
+use tokio_util::sync::CancellationToken;
 use toml_edit::Item;
 use tracing::info;

@@ -142,7 +143,7 @@ pub struct Listing {
 /// Storage (potentially remote) API to manage its state.
 /// This storage tries to be unaware of any layered repository context,
 /// providing basic CRUD operations for storage files.
-#[async_trait::async_trait]
+#[allow(async_fn_in_trait)]
 pub trait RemoteStorage: Send + Sync + 'static {
    /// Lists all top level subdirectories for a given prefix
    /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
@@ -210,6 +211,15 @@ pub trait RemoteStorage: Send + Sync + 'static {

    /// Copy a remote object inside a bucket from one path to another.
    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
+
+    /// Resets the content of everything with the given prefix to the given state.
+    ///
+    /// NOTE(review): presumably `timestamp` is the point in time to restore to, and
+    /// `done_if_after` marks objects that are already considered recovered; confirm against
+    /// the implementations. `cancel` is presumably used to abort a long-running recovery.
+    async fn time_travel_recover(
+        &self,
+        prefix: Option<&RemotePath>,
+        timestamp: SystemTime,
+        done_if_after: SystemTime,
+        cancel: CancellationToken,
+    ) -> anyhow::Result<()>;
 }

 pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
@@ -262,14 +272,15 @@ impl std::error::Error for DownloadError {}
 /// Every storage, currently supported.
 /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
 #[derive(Clone)]
-pub enum GenericRemoteStorage {
+// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
+pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
    LocalFs(LocalFs),
    AwsS3(Arc<S3Bucket>),
    AzureBlob(Arc<AzureBlobStorage>),
-    Unreliable(Arc<UnreliableWrapper>),
+    Unreliable(Other),
 }

-impl GenericRemoteStorage {
+impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
    pub async fn list(
        &self,
        prefix: Option<&RemotePath>,
@@ -386,6 +397,33 @@ impl GenericRemoteStorage {
            Self::Unreliable(s) => s.copy(from, to).await,
        }
    }
+
+    /// Forwards `time_travel_recover` with unchanged arguments to whichever storage
+    /// implementation this value wraps, and returns that implementation's result.
+    pub async fn time_travel_recover(
+        &self,
+        prefix: Option<&RemotePath>,
+        timestamp: SystemTime,
+        done_if_after: SystemTime,
+        cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        // Each arm is spelled out separately because every variant holds a different type.
+        match self {
+            Self::LocalFs(s) => {
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                    .await
+            }
+            Self::AwsS3(s) => {
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                    .await
+            }
+            Self::AzureBlob(s) => {
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                    .await
+            }
+            Self::Unreliable(s) => {
+                s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
+                    .await
+            }
+        }
+    }
 }

 impl GenericRemoteStorage {
@@ -673,6 +711,7 @@ impl ConcurrencyLimiter {
            RequestKind::List => &self.read,
            RequestKind::Delete => &self.write,
            RequestKind::Copy => &self.write,
+            RequestKind::TimeTravel => &self.write,
        }
    }

--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -4,7 +4,7 @@
 //! This storage used in tests, but can also be used in cases when a certain persistent
 //! volume is mounted to the local FS.

-use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};
+use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin, time::SystemTime};

 use anyhow::{bail, ensure, Context};
 use bytes::Bytes;
@@ -14,7 +14,7 @@ use tokio::{
    fs,
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
-use tokio_util::io::ReaderStream;
+use tokio_util::{io::ReaderStream, sync::CancellationToken};
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

@@ -157,7 +157,6 @@ impl LocalFs {
    }
 }

-#[async_trait::async_trait]
 impl RemoteStorage for LocalFs {
    async fn list(
        &self,
@@ -423,6 +422,17 @@ impl RemoteStorage for LocalFs {
        })?;
        Ok(())
    }
+
+    /// Time travel recovery is not supported by the local filesystem backend.
+    ///
+    /// Always returns an error instead of panicking, so that a caller which reaches this
+    /// backend can handle the failure through the `anyhow::Result` it already expects.
+    /// None of the arguments are used.
+    async fn time_travel_recover(
+        &self,
+        _prefix: Option<&RemotePath>,
+        _timestamp: SystemTime,
+        _done_if_after: SystemTime,
+        _cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        bail!("time travel recovery for local fs storage is not implemented")
+    }
 }

 fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -6,12 +6,14 @@

 use std::{
    borrow::Cow,
+    collections::HashMap,
    pin::Pin,
    sync::Arc,
    task::{Context, Poll},
+    time::SystemTime,
 };

-use anyhow::Context as _;
+use anyhow::{anyhow, Context as _};
 use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
    imds::credentials::ImdsCredentialsProvider,
@@ -27,17 +29,19 @@ use aws_sdk_s3::{
    config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
    error::SdkError,
    operation::get_object::GetObjectError,
-    types::{Delete, ObjectIdentifier},
+    types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
    Client,
 };
 use aws_smithy_async::rt::sleep::TokioSleep;

-use aws_smithy_types::body::SdkBody;
 use aws_smithy_types::byte_stream::ByteStream;
+use aws_smithy_types::{body::SdkBody, DateTime};
 use bytes::Bytes;
 use futures::stream::Stream;
 use hyper::Body;
 use scopeguard::ScopeGuard;
+use tokio_util::sync::CancellationToken;
+use utils::backoff;

 use super::StorageMetadata;
 use crate::{
@@ -270,6 +274,59 @@ impl S3Bucket {
            }
        }
    }
+
+    /// Issues `DeleteObjects` requests for the given identifiers, at most
+    /// `MAX_KEYS_PER_DELETE` per request, observing request metrics under `kind`.
+    /// Returns an error as soon as one response reports per-key failures.
+    async fn delete_oids(
+        &self,
+        kind: RequestKind,
+        delete_objects: &[ObjectIdentifier],
+    ) -> anyhow::Result<()> {
+        for oid_chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
+            let timer = start_measuring_requests(kind);
+
+            let request = Delete::builder()
+                .set_objects(Some(oid_chunk.to_vec()))
+                .build()?;
+            let outcome = self
+                .client
+                .delete_objects()
+                .bucket(self.bucket_name.clone())
+                .delete(request)
+                .send()
+                .await;
+
+            let timer = ScopeGuard::into_inner(timer);
+            metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &outcome, timer);
+
+            let output = outcome?;
+            metrics::BUCKET_METRICS
+                .deleted_objects_total
+                .inc_by(oid_chunk.len() as u64);
+            let Some(failures) = output.errors else {
+                continue;
+            };
+
+            // Only a bounded number of the per-key errors are logged: a request can carry
+            // 1000 keys, so logging each one would be too verbose, especially as errors
+            // may lead us to retry repeatedly.
+            const LOG_UP_TO_N_ERRORS: usize = 10;
+            for err in failures.iter().take(LOG_UP_TO_N_ERRORS) {
+                tracing::warn!(
+                    "DeleteObjects key {} failed: {}: {}",
+                    err.key.as_ref().map(Cow::from).unwrap_or("".into()),
+                    err.code.as_ref().map(Cow::from).unwrap_or("".into()),
+                    err.message.as_ref().map(Cow::from).unwrap_or("".into())
+                );
+            }
+
+            anyhow::bail!("Failed to delete {} objects", failures.len());
+        }
+        Ok(())
+    }
 }

 pin_project_lite::pin_project! {
@@ -373,7 +430,6 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
    }
 }

-#[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
    async fn list(
        &self,
@@ -569,64 +625,168 @@ impl RemoteStorage for S3Bucket {
            delete_objects.push(obj_id);
        }

-        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
-            let started_at = start_measuring_requests(kind);
-
-            let resp = self
-                .client
-                .delete_objects()
-                .bucket(self.bucket_name.clone())
-                .delete(
-                    Delete::builder()
-                        .set_objects(Some(chunk.to_vec()))
-                        .build()?,
-                )
-                .send()
-                .await;
-
-            let started_at = ScopeGuard::into_inner(started_at);
-            metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, &resp, started_at);
-
-            match resp {
-                Ok(resp) => {
-                    metrics::BUCKET_METRICS
-                        .deleted_objects_total
-                        .inc_by(chunk.len() as u64);
-                    if let Some(errors) = resp.errors {
-                        // Log a bounded number of the errors within the response:
-                        // these requests can carry 1000 keys so logging each one
-                        // would be too verbose, especially as errors may lead us
-                        // to retry repeatedly.
-                        const LOG_UP_TO_N_ERRORS: usize = 10;
-                        for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
-                            tracing::warn!(
-                                "DeleteObjects key {} failed: {}: {}",
-                                e.key.as_ref().map(Cow::from).unwrap_or("".into()),
-                                e.code.as_ref().map(Cow::from).unwrap_or("".into()),
-                                e.message.as_ref().map(Cow::from).unwrap_or("".into())
-                            );
-                        }
-
-                        return Err(anyhow::format_err!(
-                            "Failed to delete {} objects",
-                            errors.len()
-                        ));
-                    }
-                }
-                Err(e) => {
-                    return Err(e.into());
-                }
-            }
-        }
-        Ok(())
+        self.delete_oids(kind, &delete_objects).await
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
        let paths = std::array::from_ref(path);
        self.delete_objects(paths).await
    }
+
+    /// Rolls keys under `prefix` back to their state at `timestamp` using S3 object versions;
+    /// keys that have a version newer than `done_if_after` are left untouched.
+    async fn time_travel_recover(
+        &self,
+        prefix: Option<&RemotePath>,
+        timestamp: SystemTime,
+        done_if_after: SystemTime,
+        cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        let kind = RequestKind::TimeTravel;
+        let _guard = self.permit(kind).await;
+
+        let timestamp = DateTime::from(timestamp);
+        let done_if_after = DateTime::from(done_if_after);
+
+        tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
+
+        // get the passed prefix or if it is not set use prefix_in_bucket value
+        let prefix = prefix
+            .map(|p| self.relative_path_to_s3_object(p))
+            .or_else(|| self.prefix_in_bucket.clone());
+
+        let warn_threshold = 3;
+        let max_retries = 10;
+        let is_permanent = |_e: &_| false;
+
+        let list = backoff::retry(
+            || async {
+                Ok(self
+                    .client
+                    .list_object_versions()
+                    .bucket(self.bucket_name.clone())
+                    .set_prefix(prefix.clone())
+                    .send()
+                    .await?)
+            },
+            is_permanent,
+            warn_threshold,
+            max_retries,
+            "listing object versions for time_travel_recover",
+            backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
+        )
+        .await?;
+
+        if list.is_truncated().unwrap_or_default() {
+            anyhow::bail!("Received truncated ListObjectVersions response for prefix={prefix:?}");
+        }
+
+        let mut versions_deletes = list
+            .versions()
+            .iter()
+            .map(VerOrDelete::Version)
+            .chain(list.delete_markers().iter().map(VerOrDelete::DeleteMarker))
+            .collect::<Vec<_>>();
+
+        versions_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified()));
+
+        let mut vds_for_key = HashMap::<_, Vec<_>>::new();
+
+        for vd in versions_deletes {
+            let last_modified = vd.last_modified();
+            let version_id = vd.version_id();
+            let key = vd.key();
+            let (Some(last_modified), Some(version_id), Some(key)) =
+                (last_modified, version_id, key)
+            else {
+                anyhow::bail!(
+                    "One (or more) of last_modified, key, and id is None. \
+                    Is versioning enabled in the bucket? last_modified={:?} key={:?} version_id={:?}",
+                    last_modified, key, version_id,
+                );
+            };
+            if version_id == "null" {
+                anyhow::bail!("Received ListVersions response for key={key} with version_id='null', \
+                    indicating either disabled versioning, or legacy objects with null version id values");
+            }
+            tracing::trace!(
+                "Parsing version key={key} version_id={version_id} is_delete={}",
+                matches!(vd, VerOrDelete::DeleteMarker(_))
+            );
+
+            vds_for_key
+                .entry(key)
+                .or_default()
+                .push((vd, last_modified, version_id));
+        }
+        for (key, versions) in vds_for_key {
+            let (last_vd, last_last_modified, _version_id) = versions.last().unwrap();
+            if last_last_modified > &&done_if_after {
+                tracing::trace!("Key {key} has version later than done_if_after, skipping");
+                continue;
+            }
+            // First version at or after `timestamp`; the one before it is the restore target.
+            let version_to_restore_to = versions
+                .binary_search_by_key(&timestamp, |tpl| *tpl.1)
+                .unwrap_or_else(|insertion_point| insertion_point);
+            if version_to_restore_to == versions.len() {
+                tracing::trace!("Key {key} has no changes since timestamp, skipping");
+                continue;
+            }
+            let mut do_delete = false;
+            if version_to_restore_to == 0 {
+                // All versions more recent, so the key didn't exist at the specified time point.
+                tracing::trace!(
+                    "All {} versions more recent for {key}, deleting",
+                    versions.len()
+                );
+                do_delete = true;
+            } else {
+                match &versions[version_to_restore_to - 1] {
+                    (VerOrDelete::Version(_), _last_modified, version_id) => {
+                        tracing::trace!("Copying old version {version_id} for {key}...");
+                        // Restore the state to the last version by copying
+                        let source_id =
+                            format!("{}/{key}?versionId={version_id}", self.bucket_name);
+
+                        backoff::retry(
+                            || async {
+                                Ok(self
+                                    .client
+                                    .copy_object()
+                                    .bucket(self.bucket_name.clone())
+                                    .key(key)
+                                    .copy_source(&source_id)
+                                    .send()
+                                    .await?)
+                            },
+                            is_permanent,
+                            warn_threshold,
+                            max_retries,
+                            "copying object version for time_travel_recover",
+                            backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
+                        )
+                        .await?;
+                    }
+                    (VerOrDelete::DeleteMarker(_), _last_modified, _version_id) => {
+                        do_delete = true;
+                    }
+                }
+            };
+            if do_delete {
+                if matches!(last_vd, VerOrDelete::DeleteMarker(_)) {
+                    // Key has since been deleted (but there was some history), no need to do anything
+                    tracing::trace!("Key {key} already deleted, skipping.");
+                } else {
+                    tracing::trace!("Deleting {key}...");
+
+                    let oid = ObjectIdentifier::builder().key(key.to_owned()).build()?;
+                    self.delete_oids(kind, &[oid]).await?;
+                }
+            }
+        }
+        Ok(())
+    }
 }

 /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
@@ -651,6 +811,32 @@ fn start_measuring_requests(
    })
 }

+/// One entry of a `ListObjectVersions` response: an object version or a delete marker.
+enum VerOrDelete<'a> {
+    Version(&'a ObjectVersion),
+    DeleteMarker(&'a DeleteMarkerEntry),
+}
+impl<'a> VerOrDelete<'a> {
+    fn last_modified(&self) -> Option<&'a DateTime> {
+        match *self {
+            Self::Version(version) => version.last_modified(),
+            Self::DeleteMarker(marker) => marker.last_modified(),
+        }
+    }
+    fn version_id(&self) -> Option<&'a str> {
+        match *self {
+            Self::Version(version) => version.version_id(),
+            Self::DeleteMarker(marker) => marker.version_id(),
+        }
+    }
+    fn key(&self) -> Option<&'a str> {
+        match *self {
+            Self::Version(version) => version.key(),
+            Self::DeleteMarker(marker) => marker.key(),
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use camino::Utf8Path;
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -12,6 +12,7 @@ pub(crate) enum RequestKind {
    Delete = 2,
    List = 3,
    Copy = 4,
+    TimeTravel = 5,
 }

 use RequestKind::*;
@@ -24,6 +25,7 @@ impl RequestKind {
            Delete => "delete_object",
            List => "list_objects",
            Copy => "copy_object",
+            TimeTravel => "time_travel_recover",
        }
    }
    const fn as_index(&self) -> usize {
@@ -31,7 +33,7 @@ impl RequestKind {
    }
 }

-pub(super) struct RequestTyped<C>([C; 5]);
+pub(super) struct RequestTyped<C>([C; 6]);

 impl<C> RequestTyped<C> {
    pub(super) fn get(&self, kind: RequestKind) -> &C {
@@ -40,8 +42,8 @@ impl<C> RequestTyped<C> {

    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
        use RequestKind::*;
-        let mut it = [Get, Put, Delete, List, Copy].into_iter();
-        let arr = std::array::from_fn::<C, 5, _>(|index| {
+        let mut it = [Get, Put, Delete, List, Copy, TimeTravel].into_iter();
+        let arr = std::array::from_fn::<C, 6, _>(|index| {
            let next = it.next().unwrap();
            assert_eq!(index, next.as_index());
            f(next)
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -3,16 +3,19 @@
 //! testing purposes.
 use bytes::Bytes;
 use futures::stream::Stream;
-use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::sync::Mutex;
+use std::time::SystemTime;
+use std::{collections::hash_map::Entry, sync::Arc};
+use tokio_util::sync::CancellationToken;

 use crate::{
-    Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
+    Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
+    StorageMetadata,
 };

 pub struct UnreliableWrapper {
-    inner: crate::GenericRemoteStorage,
+    inner: GenericRemoteStorage<Arc<VoidStorage>>,

    // This many attempts of each operation will fail, then we let it succeed.
    attempts_to_fail: u64,
@@ -29,11 +32,21 @@ enum RemoteOp {
    Download(RemotePath),
    Delete(RemotePath),
    DeleteObjects(Vec<RemotePath>),
+    TimeTravelRecover(Option<RemotePath>),
 }

 impl UnreliableWrapper {
    pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
        assert!(attempts_to_fail > 0);
+        let inner = match inner {
+            GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
+            GenericRemoteStorage::AzureBlob(s) => GenericRemoteStorage::AzureBlob(s),
+            GenericRemoteStorage::LocalFs(s) => GenericRemoteStorage::LocalFs(s),
+            // We could also make this a no-op, as in, extract the inner of the passed generic remote storage
+            GenericRemoteStorage::Unreliable(_s) => {
+                panic!("Can't wrap unreliable wrapper unreliably")
+            }
+        };
        UnreliableWrapper {
            inner,
            attempts_to_fail,
@@ -84,7 +97,9 @@ impl UnreliableWrapper {
    }
 }

-#[async_trait::async_trait]
+// We never construct this, so the type is not important, just has to not be UnreliableWrapper and impl RemoteStorage.
+type VoidStorage = crate::LocalFs;
+
 impl RemoteStorage for UnreliableWrapper {
    async fn list_prefixes(
        &self,
@@ -169,4 +184,17 @@ impl RemoteStorage for UnreliableWrapper {
        self.attempt(RemoteOp::Upload(to.clone()))?;
        self.inner.copy_object(from, to).await
    }
+
+    async fn time_travel_recover(
+        &self,
+        prefix: Option<&RemotePath>,
+        timestamp: SystemTime,
+        done_if_after: SystemTime,
+        cancel: CancellationToken,
+    ) -> anyhow::Result<()> {
+        self.attempt(RemoteOp::TimeTravelRecover(prefix.cloned()))?;
+        let inner = &self.inner;
+        let recovery = inner.time_travel_recover(prefix, timestamp, done_if_after, cancel);
+        recovery.await
+    }
 }
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -1,15 +1,19 @@
-use std::collections::HashSet;
 use std::env;
 use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::sync::Arc;
-use std::time::UNIX_EPOCH;
+use std::time::{Duration, UNIX_EPOCH};
+use std::{collections::HashSet, time::SystemTime};

+use crate::common::{download_to_vec, upload_stream};
 use anyhow::Context;
+use camino::Utf8Path;
 use remote_storage::{
    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
 };
+use test_context::test_context;
 use test_context::AsyncTestContext;
+use tokio_util::sync::CancellationToken;
 use tracing::info;

 mod common;
@@ -23,6 +27,121 @@ const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_

 const BASE_PREFIX: &str = "test";

+#[test_context(MaybeEnabledStorage)]
+#[tokio::test]
+async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledStorage::Enabled(ctx) => ctx,
+        MaybeEnabledStorage::Disabled => return Ok(()),
+    };
+    // Our test is sensitive to clock discrepancies between S3 and the environment the tests
+    // run in. Therefore, wait a little bit before and after. The alternative would be
+    // to take the time from S3 response headers.
+    const WAIT_TIME: Duration = Duration::from_millis(3_000);
+
+    async fn time_point() -> SystemTime {
+        tokio::time::sleep(WAIT_TIME).await;
+        let ret = SystemTime::now();
+        tokio::time::sleep(WAIT_TIME).await;
+        ret
+    }
+
+    async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
+        Ok(client
+            .list_files(None)
+            .await
+            .context("list root files failure")?
+            .into_iter()
+            .collect::<HashSet<_>>())
+    }
+
+    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;
+
+    let t0_files = list_files(&ctx.client).await?;
+    let t0 = time_point().await;
+    println!("at t0: {t0_files:?}");
+
+    let old_data = "remote blob data2";
+    let (data, len) = upload_stream(old_data.as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;
+
+    let t1_files = list_files(&ctx.client).await?;
+    let t1 = time_point().await;
+    println!("at t1: {t1_files:?}");
+
+    // A little check to ensure that our clock is not too far off from the S3 clock
+    {
+        let dl = ctx.client.download(&path2).await?;
+        let last_modified = dl.last_modified.unwrap();
+        let half_wt = WAIT_TIME.mul_f32(0.5);
+        let t0_hwt = t0 + half_wt;
+        let t1_hwt = t1 - half_wt;
+        if !(t0_hwt..=t1_hwt).contains(&last_modified) {
+            panic!("last_modified={last_modified:?} is not between t0_hwt={t0_hwt:?} and t1_hwt={t1_hwt:?}. \
+                This likely means a large clock discrepancy between S3 and the local clock.");
+        }
+    }
+
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;
+
+    let new_data = "new remote blob data2";
+    let (data, len) = upload_stream(new_data.as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;
+
+    ctx.client.delete(&path1).await?;
+
+    let t2_files = list_files(&ctx.client).await?;
+    let t2 = time_point().await;
+    println!("at t2: {t2_files:?}");
+
+    // No changes after recovery to t2 (no-op)
+    let t_final = time_point().await;
+    ctx.client
+        .time_travel_recover(None, t2, t_final, CancellationToken::new())
+        .await?;
+    let t2_files_recovered = list_files(&ctx.client).await?;
+    println!("after recovery to t2: {t2_files_recovered:?}");
+    assert_eq!(t2_files, t2_files_recovered);
+    let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2).await?).await?;
+    assert_eq!(path2_recovered_t2, new_data.as_bytes());
+
+    // after recovery to t1: path1 is back, path2 has the old content
+    let t_final = time_point().await;
+    ctx.client
+        .time_travel_recover(None, t1, t_final, CancellationToken::new())
+        .await?;
+    let t1_files_recovered = list_files(&ctx.client).await?;
+    println!("after recovery to t1: {t1_files_recovered:?}");
+    assert_eq!(t1_files, t1_files_recovered);
+    let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?;
+    assert_eq!(path2_recovered_t1, old_data.as_bytes());
+
+    // after recovery to t0: everything is gone except for path1
+    let t_final = time_point().await;
+    ctx.client
+        .time_travel_recover(None, t0, t_final, CancellationToken::new())
+        .await?;
+    let t0_files_recovered = list_files(&ctx.client).await?;
+    println!("after recovery to t0: {t0_files_recovered:?}");
+    assert_eq!(t0_files, t0_files_recovered);
+
+    // cleanup
+    ctx.client.delete_objects(&[path1, path2, path3]).await?;
+
+    Ok(())
+}
+
 struct EnabledS3 {
    client: Arc<GenericRemoteStorage>,
    base_prefix: &'static str,
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,10 +1,11 @@
 use std::{
    borrow::Cow,
    fs::{self, File},
-    io,
+    io::{self, Write},
 };

 use camino::{Utf8Path, Utf8PathBuf};
+use tracing::info;

 /// Similar to [`std::fs::create_dir`], except we fsync the
 /// created directory and its parent.
@@ -81,6 +82,22 @@ pub fn path_with_suffix_extension(
    original_path.as_ref().with_extension(new_extension)
 }

+/// Same as [`fsync_file_and_parent`], but emits an info-level log line before each
+/// fsync and once both have completed.
+#[tracing::instrument(skip_all)]
+pub fn fsync_file_and_parent_log(file_path: &Utf8Path) -> io::Result<()> {
+    let Some(parent) = file_path.parent() else {
+        let msg = format!("File {file_path:?} has no parent");
+        return Err(io::Error::new(io::ErrorKind::Other, msg));
+    };
+    info!("fsync file");
+    fsync(file_path)?;
+    info!("fsync parent");
+    fsync(parent)?;
+    info!("done");
+    Ok(())
+}
+
 pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> {
    let parent = file_path.parent().ok_or_else(|| {
        io::Error::new(
@@ -88,7 +105,6 @@ pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> {
            format!("File {file_path:?} has no parent"),
        )
    })?;
-
    fsync(file_path)?;
    fsync(parent)?;
    Ok(())
@@ -112,6 +128,48 @@ pub async fn fsync_async(path: impl AsRef<Utf8Path>) -> Result<(), std::io::Erro
    tokio::fs::File::open(path.as_ref()).await?.sync_all().await
 }

+/// Writes a file to the specified `final_path` in a crash safe fashion.
+///
+/// Any stale file at `tmp_path` is removed, `content` is written and fsynced there, and the
+/// tmp path is then renamed to the final path, followed by an fsync of the parent directory.
+/// As renames are atomic, a crash during the write operation will never leave behind a
+/// partially written file. Fails with `EINVAL` if `final_path` has no parent.
+///
+/// NB: an async variant of this code exists in Pageserver's VirtualFile.
+pub fn overwrite(
+    final_path: &Utf8Path,
+    tmp_path: &Utf8Path,
+    content: &[u8],
+) -> std::io::Result<()> {
+    let Some(final_path_parent) = final_path.parent() else {
+        return Err(std::io::Error::from_raw_os_error(
+            nix::errno::Errno::EINVAL as i32,
+        ));
+    };
+    std::fs::remove_file(tmp_path).or_else(crate::fs_ext::ignore_not_found)?;
+    let mut file = std::fs::OpenOptions::new()
+        .write(true)
+        // Use `create_new` so that, if we race with ourselves or something else,
+        // we bail out instead of causing damage.
+        .create_new(true)
+        .open(tmp_path)?;
+    file.write_all(content)?;
+    file.sync_all()?;
+    drop(file); // before the rename, that's important!
+                // renames are atomic
+    std::fs::rename(tmp_path, final_path)?;
+    // Only open the final path's parent directory now, after `file` has been dropped,
+    // and fsync it.
+    // NOTE(review): the original wording here referred to VirtualFile slots
+    // (`find_victim_slot`), which this std::fs-based variant does not use; it was
+    // presumably carried over from the async variant — confirm.
+    let final_parent_dirfd = std::fs::OpenOptions::new()
+        .read(true)
+        .open(final_path_parent)?;
+    final_parent_dirfd.sync_all()?;
+    Ok(())
+}
+
 #[cfg(test)]
 mod tests {

--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -131,7 +131,9 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
        ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
-        _ => error!("Error processing HTTP request: {api_error:#}"),
+        ApiError::ShuttingDown => info!("Shut down while processing HTTP request"),
+        ApiError::Timeout(_) => info!("Timeout while processing HTTP request: {api_error:#}"),
+        _ => info!("Error processing HTTP request: {api_error:#}"),
    }

    api_error.into_response()
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -16,6 +16,7 @@ use std::{
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use nix::{errno::Errno::EAGAIN, fcntl};
+use tracing::info;

 use crate::crashsafe;

@@ -41,14 +42,19 @@ impl Deref for LockFileGuard {

 impl UnwrittenLockFile {
    /// Replace the content of this lock file with the byte representation of `contents`.
+    #[tracing::instrument(skip_all)]
    pub fn write_content(mut self, contents: String) -> anyhow::Result<LockFileGuard> {
+        info!("truncate");
        self.file
            .set_len(0)
            .context("Failed to truncate lockfile")?;
+        info!("write_all");
        self.file
            .write_all(contents.as_bytes())
            .with_context(|| format!("Failed to write '{contents}' contents into lockfile"))?;
-        crashsafe::fsync_file_and_parent(&self.path).context("fsync lockfile")?;
+        info!("fsync file and parent");
+        crashsafe::fsync_file_and_parent_log(&self.path).context("fsync lockfile")?;
+        info!("done");
        Ok(LockFileGuard(self.file))
    }
 }
--- a/libs/utils/src/nonblock.rs
+++ b/libs/utils/src/nonblock.rs
@@ -5,10 +5,10 @@ use std::os::unix::io::RawFd;
 pub fn set_nonblock(fd: RawFd) -> Result<(), std::io::Error> {
    let bits = fcntl(fd, F_GETFL)?;

-    // Safety: If F_GETFL returns some unknown bits, they should be valid
+    // If F_GETFL returns some unknown bits, they should be valid
    // for passing back to F_SETFL, too. If we left them out, the F_SETFL
    // would effectively clear them, which is not what we want.
-    let mut flags = unsafe { OFlag::from_bits_unchecked(bits) };
+    let mut flags = OFlag::from_bits_retain(bits);
    flags |= OFlag::O_NONBLOCK;

    fcntl(fd, F_SETFL(flags))?;
--- a/libs/utils/src/pid_file.rs
+++ b/libs/utils/src/pid_file.rs
@@ -54,6 +54,7 @@ use std::ops::Deref;
 use anyhow::Context;
 use camino::Utf8Path;
 use nix::unistd::Pid;
+use tracing::info;

 use crate::lock_file::{self, LockFileRead};

@@ -85,12 +86,16 @@ impl Deref for PidFileGuard {
 /// The claim ends as soon as the returned guard object is dropped.
 /// To maintain the claim for the remaining lifetime of the current process,
 /// use [`std::mem::forget`] or similar.
+#[tracing::instrument(skip_all)]
 pub fn claim_for_current_process(path: &Utf8Path) -> anyhow::Result<PidFileGuard> {
+    info!("create_exclusive");
    let unwritten_lock_file = lock_file::create_exclusive(path).context("lock file")?;
    // if any of the next steps fail, we drop the file descriptor and thereby release the lock
+    info!("write_content");
    let guard = unwritten_lock_file
        .write_content(Pid::this().to_string())
        .context("write pid to lock file")?;
+    info!("done");
    Ok(PidFileGuard(guard))
 }

--- a/libs/utils/src/tcp_listener.rs
+++ b/libs/utils/src/tcp_listener.rs
@@ -1,7 +1,6 @@
 use std::{
    io,
    net::{TcpListener, ToSocketAddrs},
-    os::unix::prelude::AsRawFd,
 };

 use nix::sys::socket::{setsockopt, sockopt::ReuseAddr};
@@ -10,7 +9,7 @@ use nix::sys::socket::{setsockopt, sockopt::ReuseAddr};
 pub fn bind<A: ToSocketAddrs>(addr: A) -> io::Result<TcpListener> {
    let listener = TcpListener::bind(addr)?;

-    setsockopt(listener.as_raw_fd(), ReuseAddr, &true)?;
+    setsockopt(&listener, ReuseAddr, &true)?;

    Ok(listener)
 }
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -423,8 +423,8 @@ async fn client(
    tokio::select! {
        res = do_requests => { res },
        _ = cancel.cancelled() => {
-            client.shutdown().await;
-            return;
+            // fallthrough to shutdown
        }
    }
+    client.shutdown().await;
 }
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -11,8 +11,9 @@
 //! from data stored in object storage.
 //!
 use anyhow::{anyhow, bail, ensure, Context};
-use bytes::{BufMut, BytesMut};
+use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
+use pageserver_api::key::{key_to_slru_block, Key};
 use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::SystemTime;
@@ -133,6 +134,87 @@ where
    ctx: &'a RequestContext,
 }

+/// Accumulates SLRU blocks (which must arrive ordered by key) and writes
+/// each completed segment to the tar archive as a single entry.
+struct SlruSegmentsBuilder<'a, 'b, W>
+where
+    W: AsyncWrite + Send + Sync + Unpin,
+{
+    /// Archive that finished segments are appended to.
+    ar: &'a mut Builder<&'b mut W>,
+    /// Concatenated block images of the segment currently being assembled.
+    buf: Vec<u8>,
+    /// `(kind, segno)` of the segment held in `buf`, if any.
+    current_segment: Option<(SlruKind, u32)>,
+}
+
+impl<'a, 'b, W> SlruSegmentsBuilder<'a, 'b, W>
+where
+    W: AsyncWrite + Send + Sync + Unpin,
+{
+    /// Creates a builder with an empty buffer and no segment in progress.
+    fn new(ar: &'a mut Builder<&'b mut W>) -> Self {
+        Self {
+            ar,
+            buf: Vec::new(),
+            current_segment: None,
+        }
+    }
+
+    /// Appends the first `BLCKSZ` bytes of `block` to the segment named by
+    /// `key`, flushing the previously buffered segment first if `key`
+    /// belongs to a different `(kind, segno)`.
+    async fn add_block(&mut self, key: &Key, block: Bytes) -> anyhow::Result<()> {
+        let (kind, segno, _) = key_to_slru_block(*key)?;
+
+        // Clog blocks may be BLCKSZ or BLCKSZ + 8 bytes long; other kinds exactly BLCKSZ.
+        if kind == SlruKind::Clog {
+            ensure!(block.len() == BLCKSZ as usize || block.len() == BLCKSZ as usize + 8);
+        } else {
+            ensure!(block.len() == BLCKSZ as usize);
+        }
+
+        let segment = (kind, segno);
+        if let Some(current) = self.current_segment {
+            if current != segment {
+                // Blocks arrive in key order, so a different (kind, segno)
+                // means the buffered segment is complete.
+                self.flush().await?;
+            }
+        }
+        self.current_segment = Some(segment);
+        self.buf.extend_from_slice(&block[..BLCKSZ as usize]);
+
+        Ok(())
+    }
+
+    /// Appends the buffered bytes to the archive as one entry named from the
+    /// current `(kind, segno)`, then clears the buffer. Panics if no segment is set.
+    async fn flush(&mut self) -> anyhow::Result<()> {
+        let (kind, segno) = self.current_segment.take().unwrap();
+        let nblocks = self.buf.len() / BLCKSZ as usize;
+        let segname = format!("{}/{:>04X}", kind.to_str(), segno);
+        let header = new_tar_header(&segname, self.buf.len() as u64)?;
+        self.ar.append(&header, self.buf.as_slice()).await?;
+
+        trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
+
+        self.buf.clear();
+
+        Ok(())
+    }
+
+    /// Flushes the last buffered segment, if there is one with data in it.
+    async fn finish(mut self) -> anyhow::Result<()> {
+        let has_pending = self.current_segment.is_some() && !self.buf.is_empty();
+        if has_pending {
+            self.flush().await
+        } else {
+            Ok(())
+        }
+    }
+}
+
 impl<'a, W> Basebackup<'a, W>
 where
    W: AsyncWrite + Send + Sync + Unpin,
@@ -168,20 +250,27 @@ where
        }

        // Gather non-relational files from object storage pages.
-        for kind in [
-            SlruKind::Clog,
-            SlruKind::MultiXactOffsets,
-            SlruKind::MultiXactMembers,
-        ] {
-            for segno in self
+        let slru_partitions = self
+            .timeline
+            .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
+            .await?
+            .partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);
+
+        let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
+
+        for part in slru_partitions.parts {
+            let blocks = self
                .timeline
-                .list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
-                .await?
-            {
-                self.add_slru_segment(kind, segno).await?;
+                .get_vectored(&part.ranges, self.lsn, self.ctx)
+                .await?;
+
+            for (key, block) in blocks {
+                slru_builder.add_block(&key, block?).await?;
            }
        }

+        slru_builder.finish().await?;
+
        let mut min_restart_lsn: Lsn = Lsn::MAX;
        // Create tablespace directories
        for ((spcnode, dbnode), has_relmap_file) in
@@ -305,39 +394,6 @@ where
        Ok(())
    }

-    //
-    // Generate SLRU segment files from repository.
-    //
-    async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
-        let nblocks = self
-            .timeline
-            .get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
-            .await?;
-
-        let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
-        for blknum in 0..nblocks {
-            let img = self
-                .timeline
-                .get_slru_page_at_lsn(slru, segno, blknum, self.lsn, self.ctx)
-                .await?;
-
-            if slru == SlruKind::Clog {
-                ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
-            } else {
-                ensure!(img.len() == BLCKSZ as usize);
-            }
-
-            slru_buf.extend_from_slice(&img[..BLCKSZ as usize]);
-        }
-
-        let segname = format!("{}/{:>04X}", slru.to_str(), segno);
-        let header = new_tar_header(&segname, slru_buf.len() as u64)?;
-        self.ar.append(&header, slru_buf.as_slice()).await?;
-
-        trace!("Added to basebackup slru {} relsize {}", segname, nblocks);
-        Ok(())
-    }
-
    //
    // Include database/tablespace directories.
    //
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -293,6 +293,7 @@ fn start_pageserver(
    // Create and lock PID file. This ensures that there cannot be more than one
    // pageserver process running at the same time.
    let lock_file_path = conf.workdir.join(PID_FILE_NAME);
+    info!("Claiming pid file at {lock_file_path:?}");
    let lock_file =
        utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
    info!("Claimed pid file at {lock_file_path:?}");
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -877,6 +877,56 @@ paths:
              schema:
                $ref: "#/components/schemas/ServiceUnavailableError"

+  /v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+    post:
+      description: |
+        Marks the initdb archive for preservation upon deletion of the timeline or tenant.
+        This is meant to be part of the disaster recovery process.
+      responses:
+        "200":
+          description: Initdb archive marked for preservation
+        "404":
+          description: No tenant or timeline found for the specified ids
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/synthetic_size:
    parameters:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -187,6 +187,7 @@ impl From<TenantSlotUpsertError> for ApiError {
        match e {
            InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
            MapState(e) => e.into(),
+            ShuttingDown(_) => ApiError::ShuttingDown,
        }
    }
 }
@@ -495,6 +496,10 @@ async fn timeline_create_handler(
                    .map_err(ApiError::InternalServerError)?;
                json_response(StatusCode::CREATED, timeline_info)
            }
+            Err(_) if tenant.cancel.is_cancelled() => {
+                // In case we get some ugly error type during shutdown, cast it into a clean 503.
+                json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("Tenant shutting down".to_string()))
+            }
            Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => {
                json_response(StatusCode::CONFLICT, ())
            }
@@ -561,6 +566,43 @@ async fn timeline_list_handler(
    json_response(StatusCode::OK, response_data)
 }

+async fn timeline_preserve_initdb_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    // Part of the process for disaster recovery from safekeeper-stored WAL:
+    // If we don't recover into a new timeline but want to keep the timeline ID,
+    // then the initdb archive is deleted. This endpoint copies it to a different
+    // location where timeline recreation can find it.
+
+    async {
+        let tenant = mgr::get_tenant(tenant_shard_id, true)?;
+
+        let timeline = tenant
+            .get_timeline(timeline_id, false)
+            .map_err(|e| ApiError::NotFound(e.into()))?;
+
+        timeline
+            .preserve_initdb_archive()
+            .await
+            .context("preserving initdb archive")
+            .map_err(ApiError::InternalServerError)?;
+
+        Ok::<_, ApiError>(())
+    }
+    .instrument(info_span!("timeline_preserve_initdb_archive",
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug(),
+                %timeline_id))
+    .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn timeline_detail_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -1220,19 +1262,9 @@ async fn tenant_create_handler(
    };
    // We created the tenant. Existing API semantics are that the tenant
    // is Active when this function returns.
-    if let res @ Err(_) = new_tenant
+    new_tenant
        .wait_to_become_active(ACTIVE_TENANT_TIMEOUT)
-        .await
-    {
-        // This shouldn't happen because we just created the tenant directory
-        // in upsert_location, and there aren't any remote timelines
-        // to load, so, nothing can really fail during load.
-        // Don't do cleanup because we don't know how we got here.
-        // The tenant will likely be in `Broken` state and subsequent
-        // calls will fail.
-        res.context("created tenant failed to become active")
-            .map_err(ApiError::InternalServerError)?;
-    }
+        .await?;

    json_response(
        StatusCode::CREATED,
@@ -1943,6 +1975,10 @@ pub fn make_router(
        .post("/v1/tenant/:tenant_id/ignore", |r| {
            api_handler(r, tenant_ignore_handler)
        })
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
+            |r| api_handler(r, timeline_preserve_initdb_handler),
+        )
        .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
            api_handler(r, timeline_detail_handler)
        })
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -61,7 +61,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
-use crate::pgdatadir_mapping::{rel_block_to_key, Version};
+use crate::pgdatadir_mapping::Version;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -75,6 +75,7 @@ use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;

+use pageserver_api::key::rel_block_to_key;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

@@ -321,8 +322,8 @@ enum PageStreamError {
    Shutdown,

    /// Something went wrong reading a page: this likely indicates a pageserver bug
-    #[error("Read error: {0}")]
-    Read(PageReconstructError),
+    #[error("Read error")]
+    Read(#[source] PageReconstructError),

    /// Ran out of time waiting for an LSN
    #[error("LSN timeout: {0}")]
@@ -331,11 +332,11 @@ enum PageStreamError {
    /// The entity required to serve the request (tenant or timeline) is not found,
    /// or is not found in a suitable state to serve a request.
    #[error("Not found: {0}")]
-    NotFound(std::borrow::Cow<'static, str>),
+    NotFound(Cow<'static, str>),

    /// Request asked for something that doesn't make sense, like an invalid LSN
    #[error("Bad request: {0}")]
-    BadRequest(std::borrow::Cow<'static, str>),
+    BadRequest(Cow<'static, str>),
 }

 impl From<PageReconstructError> for PageStreamError {
@@ -386,12 +387,18 @@ impl PageServerHandler {

    /// Future that completes when we need to shut down the connection.
    ///
-    /// Reasons for need to shut down are:
-    /// - any of the timelines we hold GateGuards for in `shard_timelines` is cancelled
-    /// - task_mgr requests shutdown of the connection
+    /// We currently need to shut down when any of the following happens:
+    /// 1. any of the timelines we hold GateGuards for in `shard_timelines` is cancelled
+    /// 2. task_mgr requests shutdown of the connection
    ///
-    /// The need to check for `task_mgr` cancellation arises mainly from `handle_pagerequests`
-    /// where, at first, `shard_timelines` is empty, see <https://github.com/neondatabase/neon/pull/6388>
+    /// NB on (1): the connection's lifecycle is not actually tied to any of the
+    /// `shard_timelines`s' lifecycles. But it's _necessary_ in the current
+    /// implementation to be responsive to timeline cancellation because
+    /// the connection holds their `GateGuards` open (stored in `shard_timelines`).
+    /// We currently do the easy thing and terminate the connection if any of the
+    /// shard_timelines gets cancelled. But really, we could spend more effort
+    /// and simply remove the cancelled timeline from the `shard_timelines`, thereby
+    /// dropping the guard.
    ///
    /// NB: keep in sync with [`Self::is_connection_cancelled`]
    async fn await_connection_cancelled(&self) {
@@ -404,16 +411,17 @@ impl PageServerHandler {
        // immutable &self).  So it's fine to evaluate shard_timelines after the sleep, we don't risk
        // missing any inserts to the map.

-        let mut futs = self
-            .shard_timelines
-            .values()
-            .map(|ht| ht.timeline.cancel.cancelled())
-            .collect::<FuturesUnordered<_>>();
-
-        tokio::select! {
-            _ = task_mgr::shutdown_watcher() => { }
-            _ = futs.next() => {}
-        }
+        let mut cancellation_sources = Vec::with_capacity(1 + self.shard_timelines.len());
+        use futures::future::Either;
+        cancellation_sources.push(Either::Left(task_mgr::shutdown_watcher()));
+        cancellation_sources.extend(
+            self.shard_timelines
+                .values()
+                .map(|ht| Either::Right(ht.timeline.cancel.cancelled())),
+        );
+        FuturesUnordered::from_iter(cancellation_sources)
+            .next()
+            .await;
    }

    /// Checking variant of [`Self::await_connection_cancelled`].
@@ -659,7 +667,10 @@ impl PageServerHandler {
                        // print the all details to the log with {:#}, but for the client the
                        // error message is enough.  Do not log if shutting down, as the anyhow::Error
                        // here includes cancellation which is not an error.
-                        span.in_scope(|| error!("error reading relation or page version: {:#}", e));
+                        let full = utils::error::report_compact_sources(&e);
+                        span.in_scope(|| {
+                            error!("error reading relation or page version: {full:#}")
+                        });
                        PagestreamBeMessage::Error(PagestreamErrorResponse {
                            message: e.to_string(),
                        })
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -13,7 +13,12 @@ use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes};
-use pageserver_api::key::is_rel_block_key;
+use pageserver_api::key::{
+    dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
+    rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
+    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
+    AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
+};
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
@@ -22,6 +27,7 @@ use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::ControlFlow;
 use std::ops::Range;
+use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
@@ -528,6 +534,33 @@ impl Timeline {
        Ok(Default::default())
    }

+    /// Builds the key space covering every block of every SLRU segment
+    /// visible at `version`, visiting kinds in `SlruKind::iter()` order and
+    /// segments in ascending segment number within each kind.
+    pub(crate) async fn get_slru_keyspace(
+        &self,
+        version: Version<'_>,
+        ctx: &RequestContext,
+    ) -> Result<KeySpace, PageReconstructError> {
+        let mut keyspace = KeySpaceAccum::new();
+
+        for kind in SlruKind::iter() {
+            let listed = self.list_slru_segments(kind, version, ctx).await?;
+            let mut segnos = Vec::<u32>::from_iter(listed);
+            segnos.sort_unstable();
+
+            for segno in segnos {
+                // Each segment contributes the half-open block range [0, nblocks).
+                let nblocks = self.get_slru_segment_size(kind, segno, version, ctx).await?;
+                let first = slru_block_to_key(kind, segno, 0);
+                let end = slru_block_to_key(kind, segno, nblocks);
+                keyspace.add_range(first..end);
+            }
+        }
+
+        Ok(keyspace.to_keyspace())
+    }
+
    /// Get a list of SLRU segments
    pub(crate) async fn list_slru_segments(
        &self,
@@ -1535,366 +1568,6 @@ struct SlruSegmentDirectory {

 static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);

-// Layout of the Key address space
-//
-// The Key struct, used to address the underlying key-value store, consists of
-// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map
-// all the data and metadata keys into those 18 bytes.
-//
-// Principles for the mapping:
-//
-// - Things that are often accessed or modified together, should be close to
-//   each other in the key space. For example, if a relation is extended by one
-//   block, we create a new key-value pair for the block data, and update the
-//   relation size entry. Because of that, the RelSize key comes after all the
-//   RelBlocks of a relation: the RelSize and the last RelBlock are always next
-//   to each other.
-//
-// The key space is divided into four major sections, identified by the first
-// byte, and the form a hierarchy:
-//
-// 00 Relation data and metadata
-//
-//   DbDir    () -> (dbnode, spcnode)
-//   Filenodemap
-//   RelDir   -> relnode forknum
-//       RelBlocks
-//       RelSize
-//
-// 01 SLRUs
-//
-//   SlruDir  kind
-//   SlruSegBlocks segno
-//   SlruSegSize
-//
-// 02 pg_twophase
-//
-// 03 misc
-//    Controlfile
-//    checkpoint
-//    pg_version
-//
-// 04 aux files
-//
-// Below is a full list of the keyspace allocation:
-//
-// DbDir:
-// 00 00000000 00000000 00000000 00   00000000
-//
-// Filenodemap:
-// 00 SPCNODE  DBNODE   00000000 00   00000000
-//
-// RelDir:
-// 00 SPCNODE  DBNODE   00000000 00   00000001 (Postgres never uses relfilenode 0)
-//
-// RelBlock:
-// 00 SPCNODE  DBNODE   RELNODE  FORK BLKNUM
-//
-// RelSize:
-// 00 SPCNODE  DBNODE   RELNODE  FORK FFFFFFFF
-//
-// SlruDir:
-// 01 kind     00000000 00000000 00   00000000
-//
-// SlruSegBlock:
-// 01 kind     00000001 SEGNO    00   BLKNUM
-//
-// SlruSegSize:
-// 01 kind     00000001 SEGNO    00   FFFFFFFF
-//
-// TwoPhaseDir:
-// 02 00000000 00000000 00000000 00   00000000
-//
-// TwoPhaseFile:
-// 02 00000000 00000000 00000000 00   XID
-//
-// ControlFile:
-// 03 00000000 00000000 00000000 00   00000000
-//
-// Checkpoint:
-// 03 00000000 00000000 00000000 00   00000001
-//
-// AuxFiles:
-// 03 00000000 00000000 00000000 00   00000002
-//
-
-//-- Section 01: relation data and metadata
-
-const DBDIR_KEY: Key = Key {
-    field1: 0x00,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 0,
-};
-
-fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
-    Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }..Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0xffffffff,
-        field5: 0xff,
-        field6: 0xffffffff,
-    }
-}
-
-fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
-    Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }
-}
-
-fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
-    Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0,
-        field5: 0,
-        field6: 1,
-    }
-}
-
-pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
-    Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum,
-        field6: blknum,
-    }
-}
-
-fn rel_size_to_key(rel: RelTag) -> Key {
-    Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum,
-        field6: 0xffffffff,
-    }
-}
-
-fn rel_key_range(rel: RelTag) -> Range<Key> {
-    Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum,
-        field6: 0,
-    }..Key {
-        field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum + 1,
-        field6: 0,
-    }
-}
-
-//-- Section 02: SLRUs
-
-fn slru_dir_to_key(kind: SlruKind) -> Key {
-    Key {
-        field1: 0x01,
-        field2: match kind {
-            SlruKind::Clog => 0x00,
-            SlruKind::MultiXactMembers => 0x01,
-            SlruKind::MultiXactOffsets => 0x02,
-        },
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }
-}
-
-fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
-    Key {
-        field1: 0x01,
-        field2: match kind {
-            SlruKind::Clog => 0x00,
-            SlruKind::MultiXactMembers => 0x01,
-            SlruKind::MultiXactOffsets => 0x02,
-        },
-        field3: 1,
-        field4: segno,
-        field5: 0,
-        field6: blknum,
-    }
-}
-
-fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
-    Key {
-        field1: 0x01,
-        field2: match kind {
-            SlruKind::Clog => 0x00,
-            SlruKind::MultiXactMembers => 0x01,
-            SlruKind::MultiXactOffsets => 0x02,
-        },
-        field3: 1,
-        field4: segno,
-        field5: 0,
-        field6: 0xffffffff,
-    }
-}
-
-fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
-    let field2 = match kind {
-        SlruKind::Clog => 0x00,
-        SlruKind::MultiXactMembers => 0x01,
-        SlruKind::MultiXactOffsets => 0x02,
-    };
-
-    Key {
-        field1: 0x01,
-        field2,
-        field3: 1,
-        field4: segno,
-        field5: 0,
-        field6: 0,
-    }..Key {
-        field1: 0x01,
-        field2,
-        field3: 1,
-        field4: segno,
-        field5: 1,
-        field6: 0,
-    }
-}
-
-//-- Section 03: pg_twophase
-
-const TWOPHASEDIR_KEY: Key = Key {
-    field1: 0x02,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 0,
-};
-
-fn twophase_file_key(xid: TransactionId) -> Key {
-    Key {
-        field1: 0x02,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: xid,
-    }
-}
-
-fn twophase_key_range(xid: TransactionId) -> Range<Key> {
-    let (next_xid, overflowed) = xid.overflowing_add(1);
-
-    Key {
-        field1: 0x02,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: xid,
-    }..Key {
-        field1: 0x02,
-        field2: 0,
-        field3: 0,
-        field4: 0,
-        field5: u8::from(overflowed),
-        field6: next_xid,
-    }
-}
-
-//-- Section 03: Control file
-const CONTROLFILE_KEY: Key = Key {
-    field1: 0x03,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 0,
-};
-
-const CHECKPOINT_KEY: Key = Key {
-    field1: 0x03,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 1,
-};
-
-const AUX_FILES_KEY: Key = Key {
-    field1: 0x03,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 2,
-};
-
-// Reverse mappings for a few Keys.
-// These are needed by WAL redo manager.
-
-// AUX_FILES currently stores only data for logical replication (slots etc), and
-// we don't preserve these on a branch because safekeepers can't follow timeline
-// switch (and generally it likely should be optional), so ignore these.
-pub fn is_inherited_key(key: Key) -> bool {
-    key != AUX_FILES_KEY
-}
-
-pub fn is_rel_fsm_block_key(key: Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
-}
-
-pub fn is_rel_vm_block_key(key: Key) -> bool {
-    key.field1 == 0x00
-        && key.field4 != 0
-        && key.field5 == VISIBILITYMAP_FORKNUM
-        && key.field6 != 0xffffffff
-}
-
-pub fn key_to_slru_block(key: Key) -> anyhow::Result<(SlruKind, u32, BlockNumber)> {
-    Ok(match key.field1 {
-        0x01 => {
-            let kind = match key.field2 {
-                0x00 => SlruKind::Clog,
-                0x01 => SlruKind::MultiXactMembers,
-                0x02 => SlruKind::MultiXactOffsets,
-                _ => anyhow::bail!("unrecognized slru kind 0x{:02x}", key.field2),
-            };
-            let segno = key.field4;
-            let blknum = key.field6;
-
-            (kind, segno, blknum)
-        }
-        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
-    })
-}
-
-fn is_slru_block_key(key: Key) -> bool {
-    key.field1 == 0x01                // SLRU-related
-        && key.field3 == 0x00000001   // but not SlruDir
-        && key.field6 != 0xffffffff // and not SlruSegSize
-}
-
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -18,16 +18,13 @@ use enumset::EnumSet;
 use futures::stream::FuturesUnordered;
 use futures::FutureExt;
 use futures::StreamExt;
-use nix::unistd::Pid;
 use pageserver_api::models;
 use pageserver_api::models::TimelineState;
 use pageserver_api::shard::ShardIdentity;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
-use tokio::signal::unix::Signal;
 use std::fmt;
-use std::os::unix::process::CommandExt;
 use storage_broker::BrokerClientChannel;
 use tokio::io::BufReader;
 use tokio::runtime::Handle;
@@ -94,7 +91,6 @@ use std::fs;
 use std::fs::File;
 use std::io;
 use std::ops::Bound::Included;
-use std::process::Stdio;
 use std::sync::atomic::AtomicU64;
 use std::sync::atomic::Ordering;
 use std::sync::Arc;
@@ -631,9 +627,15 @@ impl Tenant {
            deletion_queue_client,
        ));

+        // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
+        // we shut down while attaching.
+        let Ok(attach_gate_guard) = tenant.gate.enter() else {
+            // We just created the Tenant: nothing else can have shut it down yet
+            unreachable!();
+        };
+
        // Do all the hard work in the background
        let tenant_clone = Arc::clone(&tenant);
-
        let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn);
        task_mgr::spawn(
            &tokio::runtime::Handle::current(),
@@ -643,6 +645,8 @@ impl Tenant {
            "attach tenant",
            false,
            async move {
+                let _gate_guard = attach_gate_guard;
+
                // Is this tenant being spawned as part of process startup?
                let starting_up = init_order.is_some();
                scopeguard::defer! {
@@ -719,6 +723,10 @@ impl Tenant {
                            // stayed in Activating for such a long time that shutdown found it in
                            // that state.
                            tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation");
+                            // Make the tenant broken so that set_stopping will not hang waiting for it to leave
+                            // the Attaching state.  This is an over-reaction (nothing really broke, the tenant is
+                            // just shutting down), but ensures progress.
+                            make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
                            return Ok(());
                        },
                    )
@@ -813,7 +821,7 @@ impl Tenant {
                    SpawnMode::Create => None,
                    SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
                };
-                match tenant_clone.attach(preload, &ctx).await {
+                match tenant_clone.attach(preload, mode, &ctx).await {
                    Ok(()) => {
                        info!("attach finished, activating");
                        if let Some(t)=  attach_timer {t.observe_duration();}
@@ -900,15 +908,20 @@ impl Tenant {
    async fn attach(
        self: &Arc<Tenant>,
        preload: Option<TenantPreload>,
+        mode: SpawnMode,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        span::debug_assert_current_span_has_tenant_id();

        failpoint_support::sleep_millis_async!("before-attaching-tenant");

-        let preload = match preload {
-            Some(p) => p,
-            None => {
+        let preload = match (preload, mode) {
+            (Some(p), _) => p,
+            (None, SpawnMode::Create) => TenantPreload {
+                deleting: false,
+                timelines: HashMap::new(),
+            },
+            (None, SpawnMode::Normal) => {
                // Deprecated dev mode: load from local disk state instead of remote storage
                // https://github.com/neondatabase/neon/issues/5624
                return self.load_local(ctx).await;
@@ -1016,7 +1029,10 @@ impl Tenant {
        // IndexPart is the source of truth.
        self.clean_up_timelines(&existent_timelines)?;

-        failpoint_support::sleep_millis_async!("attach-before-activate", &self.cancel);
+        fail::fail_point!("attach-before-activate", |_| {
+            anyhow::bail!("attach-before-activate");
+        });
+        failpoint_support::sleep_millis_async!("attach-before-activate-sleep", &self.cancel);

        info!("Done");

@@ -1680,9 +1696,13 @@ impl Tenant {
        ctx: &RequestContext,
    ) -> Result<Arc<Timeline>, CreateTimelineError> {
        if !self.is_active() {
-            return Err(CreateTimelineError::Other(anyhow::anyhow!(
-                "Cannot create timelines on inactive tenant"
-            )));
+            if matches!(self.current_state(), TenantState::Stopping { .. }) {
+                return Err(CreateTimelineError::ShuttingDown);
+            } else {
+                return Err(CreateTimelineError::Other(anyhow::anyhow!(
+                    "Cannot create timelines on inactive tenant"
+                )));
+            }
        }

        let _gate = self
@@ -3749,17 +3769,7 @@ async fn run_initdb(

    let _permit = INIT_DB_SEMAPHORE.acquire().await;

-    let mut initdb_command_std = std::process::Command::new(&initdb_bin_path);
-    // The process_group function is unstable as tokio's MSRV is 1.63,
-    // and process_group was stabilized in 1.64. This is the officially
-    // recommended workaround.
-    // Setting pgroup to 0 makes the pgroupid be that of the child, as explained in
-    // https://github.com/microsoft/WSL/issues/2997 (unrelated bug, but explains it)
-    // We use need the pgid to be set for pkill to work during cancellation, to also
-    // get the child processes of initdb.
-    initdb_command_std.process_group(0);
-
-    let mut initdb_command = tokio::process::Command::from(initdb_command_std)
+    let initdb_command = tokio::process::Command::new(&initdb_bin_path)
        .args(["-D", initdb_target_dir.as_ref()])
        .args(["-U", &conf.superuser])
        .args(["-E", "utf8"])
@@ -3768,39 +3778,25 @@ async fn run_initdb(
        .env_clear()
        .env("LD_LIBRARY_PATH", &initdb_lib_dir)
        .env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
-        .stdout(Stdio::piped())
-        .stderr(Stdio::piped())
-        // If the `select!` below doesn't finish the `wait_with_output`,
-        // let the task get `wait()`ed for asynchronously by tokio.
-        // This means there is a slim chance we can go over the INIT_DB_SEMAPHORE.
-        // TODO: fix for this is non-trivial, see
-        // https://github.com/neondatabase/neon/pull/5921#pullrequestreview-1750858021
-        //
-        .kill_on_drop(true)
        .spawn()?;

-    tokio::select! {
-        exit_status = initdb_command.wait() => {
-            let exit_status = exit_status?;
-            if !exit_status.success() {
-                let mut stderr = initdb_command.stderr.take().unwrap();
-                let mut stderr_vec = Vec::new();
-                tokio::io::copy(&mut stderr, &mut stderr_vec).await?;
-                return Err(InitdbError::Failed(exit_status, stderr_vec));
-            }
-        }
-        _ = cancel.cancelled() => {
-            if let Some(pid) = initdb_command.id() {
-                warn!("Doing killpg...");
-                nix::sys::signal::killpg(Pid::from_raw(pid as i32), Signal::SIGKILL)
-                    .map_err(|e| InitdbError::Other(anyhow::anyhow!(e)))?;
-                initdb_command.wait().await?;
-            } else {
-                warn!("Couldn't obtain initdb pid, killing initdb process only.");
-                initdb_command.kill().await?;
-            }
-            return Err(InitdbError::Cancelled);
-        }
+    // Ideally we'd select here with the cancellation token, but the problem is that
+    // we can't safely terminate initdb: it launches processes of its own, and killing
+    // initdb doesn't kill them. After we return from this function, we want the target
+    // directory to be able to be cleaned up.
+    // See https://github.com/neondatabase/neon/issues/6385
+    let initdb_output = initdb_command.wait_with_output().await?;
+    if !initdb_output.status.success() {
+        return Err(InitdbError::Failed(
+            initdb_output.status,
+            initdb_output.stderr,
+        ));
+    }
+
+    // This isn't true cancellation support, see above. Still return an error to
+    // exercise the cancellation code path.
+    if cancel.is_cancelled() {
+        return Err(InitdbError::Cancelled);
    }

    Ok(())
@@ -4056,7 +4052,7 @@ pub(crate) mod harness {
                        .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                        .await?;
                    tenant
-                        .attach(Some(preload), ctx)
+                        .attach(Some(preload), SpawnMode::Normal, ctx)
                        .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                        .await?;
                }
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -409,7 +409,10 @@ impl DeleteTenantFlow {
            .await
            .expect("cant be stopping or broken");

-        tenant.attach(preload, ctx).await.context("attach")?;
+        tenant
+            .attach(preload, super::SpawnMode::Normal, ctx)
+            .await
+            .context("attach")?;

        Self::background(
            guard,
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -283,15 +283,15 @@ impl LayerMap {
    ///
    /// This is used for garbage collection, to determine if an old layer can
    /// be deleted.
-    pub fn image_layer_exists(&self, key: &Range<Key>, lsn: &Range<Lsn>) -> Result<bool> {
+    pub fn image_layer_exists(&self, key: &Range<Key>, lsn: &Range<Lsn>) -> bool {
        if key.is_empty() {
            // Vacuously true. There's a newer image for all 0 of the kerys in the range.
-            return Ok(true);
+            return true;
        }

        let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
            Some(v) => v,
-            None => return Ok(false),
+            None => return false,
        };

        let start = key.start.to_i128();
@@ -304,17 +304,17 @@ impl LayerMap {

        // Check the start is covered
        if !layer_covers(version.image_coverage.query(start)) {
-            return Ok(false);
+            return false;
        }

        // Check after all changes of coverage
        for (_, change_val) in version.image_coverage.range(start..end) {
            if !layer_covers(change_val) {
-                return Ok(false);
+                return false;
            }
        }

-        Ok(true)
+        true
    }

    pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<PersistentLayerDesc>> {
@@ -325,18 +325,14 @@ impl LayerMap {
    /// Divide the whole given range of keys into sub-ranges based on the latest
    /// image layer that covers each range at the specified lsn (inclusive).
    /// This is used when creating  new image layers.
-    ///
-    // FIXME: clippy complains that the result type is very complex. She's probably
-    // right...
-    #[allow(clippy::type_complexity)]
    pub fn image_coverage(
        &self,
        key_range: &Range<Key>,
        lsn: Lsn,
-    ) -> Result<Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)>> {
+    ) -> Vec<(Range<Key>, Option<Arc<PersistentLayerDesc>>)> {
        let version = match self.historic.get().unwrap().get_version(lsn.0) {
            Some(v) => v,
-            None => return Ok(vec![]),
+            None => return vec![],
        };

        let start = key_range.start.to_i128();
@@ -359,7 +355,7 @@ impl LayerMap {
        let kr = Key::from_i128(current_key)..Key::from_i128(end);
        coverage.push((kr, current_val.take()));

-        Ok(coverage)
+        coverage
    }

    pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
@@ -410,24 +406,19 @@ impl LayerMap {
    /// This number is used to compute the largest number of deltas that
    /// we'll need to visit for any page reconstruction in this region.
    /// We use this heuristic to decide whether to create an image layer.
-    pub fn count_deltas(
-        &self,
-        key: &Range<Key>,
-        lsn: &Range<Lsn>,
-        limit: Option<usize>,
-    ) -> Result<usize> {
+    pub fn count_deltas(&self, key: &Range<Key>, lsn: &Range<Lsn>, limit: Option<usize>) -> usize {
        // We get the delta coverage of the region, and for each part of the coverage
        // we recurse right underneath the delta. The recursion depth is limited by
        // the largest result this function could return, which is in practice between
        // 3 and 10 (since we usually try to create an image when the number gets larger).

        if lsn.is_empty() || key.is_empty() || limit == Some(0) {
-            return Ok(0);
+            return 0;
        }

        let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) {
            Some(v) => v,
-            None => return Ok(0),
+            None => return 0,
        };

        let start = key.start.to_i128();
@@ -448,8 +439,7 @@ impl LayerMap {
                    if !kr.is_empty() {
                        let base_count = Self::is_reimage_worthy(&val, key) as usize;
                        let new_limit = limit.map(|l| l - base_count);
-                        let max_stacked_deltas_underneath =
-                            self.count_deltas(&kr, &lr, new_limit)?;
+                        let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
                        max_stacked_deltas = std::cmp::max(
                            max_stacked_deltas,
                            base_count + max_stacked_deltas_underneath,
@@ -471,7 +461,7 @@ impl LayerMap {
                if !kr.is_empty() {
                    let base_count = Self::is_reimage_worthy(&val, key) as usize;
                    let new_limit = limit.map(|l| l - base_count);
-                    let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
+                    let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit);
                    max_stacked_deltas = std::cmp::max(
                        max_stacked_deltas,
                        base_count + max_stacked_deltas_underneath,
@@ -480,7 +470,7 @@ impl LayerMap {
            }
        }

-        Ok(max_stacked_deltas)
+        max_stacked_deltas
    }

    /// Count how many reimage-worthy layers we need to visit for given key-lsn pair.
@@ -592,10 +582,7 @@ impl LayerMap {
                    if limit == Some(difficulty) {
                        break;
                    }
-                    for (img_range, last_img) in self
-                        .image_coverage(range, lsn)
-                        .expect("why would this err?")
-                    {
+                    for (img_range, last_img) in self.image_coverage(range, lsn) {
                        if limit == Some(difficulty) {
                            break;
                        }
@@ -606,9 +593,7 @@ impl LayerMap {
                        };

                        if img_lsn < lsn {
-                            let num_deltas = self
-                                .count_deltas(&img_range, &(img_lsn..lsn), limit)
-                                .expect("why would this err lol?");
+                            let num_deltas = self.count_deltas(&img_range, &(img_lsn..lsn), limit);
                            difficulty = std::cmp::max(difficulty, num_deltas);
                        }
                    }
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -7,6 +7,7 @@ use pageserver_api::models::ShardParameters;
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
+use std::cmp::Ordering;
 use std::collections::{BTreeMap, HashMap};
 use std::ops::Deref;
 use std::sync::Arc;
@@ -32,7 +33,8 @@ use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{
-    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
+    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
+    TenantConfOpt,
 };
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
@@ -466,6 +468,26 @@ pub async fn init_tenant_mgr(
            // We have a generation map: treat it as the authority for whether
            // this tenant is really attached.
            if let Some(gen) = generations.get(&tenant_shard_id) {
+                if let LocationMode::Attached(attached) = &location_conf.mode {
+                    if attached.generation > *gen {
+                        tracing::error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+                            "Control plane gave decreasing generation ({gen:?}) in re-attach response for tenant that was attached in generation {:?}, demoting to secondary",
+                            attached.generation
+                        );
+
+                        // We cannot safely attach this tenant given a bogus generation number, but let's avoid throwing away
+                        // local disk content: demote to secondary rather than detaching.
+                        tenants.insert(
+                            tenant_shard_id,
+                            TenantSlot::Secondary(SecondaryTenant::new(
+                                tenant_shard_id,
+                                location_conf.shard,
+                                location_conf.tenant_conf,
+                                &SecondaryLocationConfig { warm: false },
+                            )),
+                        );
+                    }
+                }
                *gen
            } else {
                match &location_conf.mode {
@@ -721,7 +743,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
        tokio::select! {
            Some(joined) = join_set.join_next() => {
                match joined {
-                    Ok(()) => {}
+                    Ok(()) => {},
                    Err(join_error) if join_error.is_cancelled() => {
                        unreachable!("we are not cancelling any of the tasks");
                    }
@@ -882,7 +904,7 @@ impl TenantManager {
        tenant_shard_id: TenantShardId,
        new_location_config: LocationConf,
        flush: Option<Duration>,
-        spawn_mode: SpawnMode,
+        mut spawn_mode: SpawnMode,
        ctx: &RequestContext,
    ) -> Result<Option<Arc<Tenant>>, UpsertLocationError> {
        debug_assert_current_span_has_tenant_id();
@@ -902,19 +924,29 @@ impl TenantManager {
                tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
            match (&new_location_config.mode, peek_slot) {
                (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
-                    if attach_conf.generation == tenant.generation {
-                        // A transition from Attached to Attached in the same generation, we may
-                        // take our fast path and just provide the updated configuration
-                        // to the tenant.
-                        tenant.set_new_location_config(
-                            AttachedTenantConf::try_from(new_location_config.clone())
-                                .map_err(UpsertLocationError::BadRequest)?,
-                        );
+                    match attach_conf.generation.cmp(&tenant.generation) {
+                        Ordering::Equal => {
+                            // A transition from Attached to Attached in the same generation, we may
+                            // take our fast path and just provide the updated configuration
+                            // to the tenant.
+                            tenant.set_new_location_config(
+                                AttachedTenantConf::try_from(new_location_config.clone())
+                                    .map_err(UpsertLocationError::BadRequest)?,
+                            );

-                        Some(FastPathModified::Attached(tenant.clone()))
-                    } else {
-                        // Different generations, fall through to general case
-                        None
+                            Some(FastPathModified::Attached(tenant.clone()))
+                        }
+                        Ordering::Less => {
+                            return Err(UpsertLocationError::BadRequest(anyhow::anyhow!(
+                                "Generation {:?} is less than existing {:?}",
+                                attach_conf.generation,
+                                tenant.generation
+                            )));
+                        }
+                        Ordering::Greater => {
+                            // Generation advanced, fall through to general case of replacing `Tenant` object
+                            None
+                        }
                    }
                }
                (
@@ -1019,6 +1051,12 @@ impl TenantManager {
                    }
                }
                slot_guard.drop_old_value().expect("We just shut it down");
+
+                // Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then
+                // the caller thinks they're creating but the tenant already existed.  We must switch to
+                // Normal mode so that when starting this Tenant we properly probe remote storage for timelines,
+                // rather than assuming it to be empty.
+                spawn_mode = SpawnMode::Normal;
            }
            Some(TenantSlot::Secondary(state)) => {
                info!("Shutting down secondary tenant");
@@ -1102,14 +1140,46 @@ impl TenantManager {
            None
        };

-        slot_guard.upsert(new_slot).map_err(|e| match e {
-            TenantSlotUpsertError::InternalError(e) => {
-                UpsertLocationError::Other(anyhow::anyhow!(e))
+        match slot_guard.upsert(new_slot) {
+            Err(TenantSlotUpsertError::InternalError(e)) => {
+                Err(UpsertLocationError::Other(anyhow::anyhow!(e)))
            }
-            TenantSlotUpsertError::MapState(e) => UpsertLocationError::Unavailable(e),
-        })?;
+            Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)),
+            Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => {
+                // If we just called tenant_spawn() on a new tenant, and can't insert it into our map, then
+                // we must not leak it: this would violate the invariant that after shutdown_all_tenants, all tenants
+                // are shutdown.
+                //
+                // We must shut it down inline here.
+                match new_slot {
+                    TenantSlot::InProgress(_) => {
+                        // Unreachable because we never insert an InProgress
+                        unreachable!()
+                    }
+                    TenantSlot::Attached(tenant) => {
+                        let (_guard, progress) = utils::completion::channel();
+                        info!("Shutting down just-spawned tenant, because tenant manager is shut down");
+                        match tenant.shutdown(progress, false).await {
+                            Ok(()) => {
+                                info!("Finished shutting down just-spawned tenant");
+                            }
+                            Err(barrier) => {
+                                info!("Shutdown already in progress, waiting for it to complete");
+                                barrier.wait().await;
+                            }
+                        }
+                    }
+                    TenantSlot::Secondary(secondary_tenant) => {
+                        secondary_tenant.shutdown().await;
+                    }
+                }

-        Ok(attached_tenant)
+                Err(UpsertLocationError::Unavailable(
+                    TenantMapError::ShuttingDown,
+                ))
+            }
+            Ok(()) => Ok(attached_tenant),
+        }
    }

    /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
@@ -1728,14 +1798,31 @@ pub(crate) enum TenantSlotError {

 /// Superset of TenantMapError: issues that can occur when using a SlotGuard
 /// to insert a new value.
-#[derive(Debug, thiserror::Error)]
-pub enum TenantSlotUpsertError {
+#[derive(thiserror::Error)]
+pub(crate) enum TenantSlotUpsertError {
    /// An error where the slot is in an unexpected state, indicating a code bug
    #[error("Internal error updating Tenant")]
    InternalError(Cow<'static, str>),

    #[error(transparent)]
-    MapState(#[from] TenantMapError),
+    MapState(TenantMapError),
+
+    // If we encounter TenantManager shutdown during upsert, we must carry the Completion
+    // from the SlotGuard, so that the caller can hold it while they clean up: otherwise
+    // TenantManager shutdown might race ahead before we're done cleaning up any Tenant that
+    // was protected by the SlotGuard.
+    #[error("Shutting down")]
+    ShuttingDown((TenantSlot, utils::completion::Completion)),
+}
+
+impl std::fmt::Debug for TenantSlotUpsertError { // manual impl: this enum does not derive Debug
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match self {
+            Self::InternalError(reason) => write!(f, "Internal Error {reason}"),
+            Self::MapState(map_error) => write!(f, "Tenant map state: {map_error:?}"),
+            Self::ShuttingDown(_completion) => write!(f, "Tenant map shutting down"), // the (TenantSlot, Completion) payload is not printed
+        }
+    }
+}

 #[derive(Debug, thiserror::Error)]
@@ -1784,7 +1871,7 @@ pub struct SlotGuard {

    /// [`TenantSlot::InProgress`] carries the corresponding Barrier: it will
    /// release any waiters as soon as this SlotGuard is dropped.
-    _completion: utils::completion::Completion,
+    completion: utils::completion::Completion,
 }

 impl SlotGuard {
@@ -1797,7 +1884,7 @@ impl SlotGuard {
            tenant_shard_id,
            old_value,
            upserted: false,
-            _completion: completion,
+            completion,
        }
    }

@@ -1830,9 +1917,16 @@ impl SlotGuard {
            }

            let m = match &mut *locked {
-                TenantsMap::Initializing => return Err(TenantMapError::StillInitializing.into()),
+                TenantsMap::Initializing => {
+                    return Err(TenantSlotUpsertError::MapState(
+                        TenantMapError::StillInitializing,
+                    ))
+                }
                TenantsMap::ShuttingDown(_) => {
-                    return Err(TenantMapError::ShuttingDown.into());
+                    return Err(TenantSlotUpsertError::ShuttingDown((
+                        new_value,
+                        self.completion.clone(),
+                    )));
                }
                TenantsMap::Open(m) => m,
            };
@@ -1880,7 +1974,9 @@ impl SlotGuard {
                Err(TenantSlotUpsertError::InternalError(_)) => {
                    // We already logged the error, nothing else we can do.
                }
-                Err(TenantSlotUpsertError::MapState(_)) => {
+                Err(
+                    TenantSlotUpsertError::MapState(_) | TenantSlotUpsertError::ShuttingDown(_),
+                ) => {
                    // If the map is shutting down, we need not replace anything
                }
                Ok(()) => {}
@@ -1978,18 +2074,22 @@ fn tenant_map_peek_slot<'a>(
    tenant_shard_id: &TenantShardId,
    mode: TenantSlotPeekMode,
 ) -> Result<Option<&'a TenantSlot>, TenantMapError> {
-    let m = match tenants.deref() {
-        TenantsMap::Initializing => return Err(TenantMapError::StillInitializing),
+    match tenants.deref() {
+        TenantsMap::Initializing => Err(TenantMapError::StillInitializing),
        TenantsMap::ShuttingDown(m) => match mode {
-            TenantSlotPeekMode::Read => m,
-            TenantSlotPeekMode::Write => {
-                return Err(TenantMapError::ShuttingDown);
-            }
+            TenantSlotPeekMode::Read => Ok(Some(
+                // When reading in ShuttingDown state, we must translate None results
+                // into a ShuttingDown error, because absence of a tenant shard ID in the map
+                // isn't a reliable indicator of the tenant being gone: it might have been
+                // InProgress when shutdown started, and cleaned up from that state such
+                // that it's now no longer in the map.  Callers will have to wait until
+                // we next start up to get a proper answer.  This avoids incorrect 404 API responses.
+                m.get(tenant_shard_id).ok_or(TenantMapError::ShuttingDown)?,
+            )),
+            TenantSlotPeekMode::Write => Err(TenantMapError::ShuttingDown),
        },
-        TenantsMap::Open(m) => m,
-    };
-
-    Ok(m.get(tenant_shard_id))
+        TenantsMap::Open(m) => Ok(m.get(tenant_shard_id)),
+    }
 }

 enum TenantSlotAcquireMode {
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -257,6 +257,8 @@ pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;

 pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";

+pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
+
 /// Default buffer size when interfacing with [`tokio::fs::File`].
 pub(crate) const BUFFER_SIZE: usize = 32 * 1024;

@@ -1066,6 +1068,28 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    pub(crate) async fn preserve_initdb_archive(
+        self: &Arc<Self>,
+        tenant_id: &TenantId,
+        timeline_id: &TimelineId,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
+        backoff::retry( // retry wrapper around upload::preserve_initdb_archive
+            || async {
+                upload::preserve_initdb_archive(&self.storage_impl, tenant_id, timeline_id, cancel)
+                    .await
+            },
+            |_e| false, // NOTE(review): presumably the "is this error permanent?" predicate, so every failure is retried - confirm against backoff::retry
+            FAILED_DOWNLOAD_WARN_THRESHOLD, // NOTE(review): download threshold used for a copy; FAILED_UPLOAD_WARN_THRESHOLD may be intended - confirm
+            FAILED_REMOTE_OP_RETRIES,
+            "preserve_initdb_tar_zst",
+            backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled!")),
+        )
+        .await
+        .context("backing up initdb archive")?;
+        Ok(())
+    }
+
    /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set.
    /// The function deletes layer files one by one, then lists the prefix to see if we leaked something
    /// deletes leaked files if any and proceeds with deletion of index file at the end.
@@ -1101,6 +1125,14 @@ impl RemoteTimelineClient {
        let layer_deletion_count = layers.len();
        self.deletion_queue_client.push_immediate(layers).await?;

+        // Delete the initdb.tar.zst, which is not always present, but deletion attempts of
+        // nonexistent objects are not considered errors.
+        let initdb_path =
+            remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &self.timeline_id);
+        self.deletion_queue_client
+            .push_immediate(vec![initdb_path])
+            .await?;
+
        // Do not delete index part yet, it is needed for possible retry. If we remove it first
        // and retry will arrive to different pageserver there wont be any traces of it on remote storage
        let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);
@@ -1148,10 +1180,8 @@ impl RemoteTimelineClient {
                if p == &latest_index {
                    return false;
                }
-                if let Some(name) = p.object_name() {
-                    if name == INITDB_PATH {
-                        return false;
-                    }
+                if p.object_name() == Some(INITDB_PRESERVED_PATH) {
+                    return false;
                }
                true
            })
@@ -1724,6 +1754,16 @@ pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId
    .expect("Failed to construct path")
 }

+pub fn remote_initdb_preserved_archive_path(
+    tenant_id: &TenantId,
+    timeline_id: &TimelineId,
+) -> RemotePath {
+    RemotePath::from_string(&format!( // remote path of the timeline's "preserved" initdb archive
+        "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PRESERVED_PATH}"
+    ))
+    .expect("Failed to construct path") // panics if from_string fails
+}
+
 pub fn remote_index_path(
    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -32,7 +32,8 @@ use utils::id::TimelineId;
 use super::index::{IndexPart, LayerFileMetadata};
 use super::{
    parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
-    FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
+    remote_initdb_preserved_archive_path, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
+    INITDB_PATH,
 };

 ///
@@ -430,6 +431,9 @@ pub(crate) async fn download_initdb_tar_zst(

    let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id);

+    let remote_preserved_path =
+        remote_initdb_preserved_archive_path(&tenant_shard_id.tenant_id, timeline_id);
+
    let timeline_path = conf.timelines_path(tenant_shard_id);

    if !timeline_path.exists() {
@@ -456,8 +460,16 @@ pub(crate) async fn download_initdb_tar_zst(
                .with_context(|| format!("tempfile creation {temp_path}"))
                .map_err(DownloadError::Other)?;

-            let download =
-                download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;
+            let download = match download_cancellable(&cancel_inner, storage.download(&remote_path))
+                .await
+            {
+                Ok(dl) => dl,
+                Err(DownloadError::NotFound) => {
+                    download_cancellable(&cancel_inner, storage.download(&remote_preserved_path))
+                        .await?
+                }
+                Err(other) => Err(other)?,
+            };
            let mut download = tokio_util::io::StreamReader::new(download.download_stream);
            let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);

--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -13,8 +13,8 @@ use super::Generation;
 use crate::{
    config::PageServerConf,
    tenant::remote_timeline_client::{
-        index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path,
-        upload_cancellable,
+        index::IndexPart, remote_index_path, remote_initdb_archive_path,
+        remote_initdb_preserved_archive_path, remote_path, upload_cancellable,
    },
 };
 use remote_storage::GenericRemoteStorage;
@@ -144,3 +144,16 @@ pub(crate) async fn upload_initdb_dir(
    .await
    .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
 }
+
+pub(crate) async fn preserve_initdb_archive(
+    storage: &GenericRemoteStorage,
+    tenant_id: &TenantId,
+    timeline_id: &TimelineId,
+    cancel: &CancellationToken,
+) -> anyhow::Result<()> {
+    let source_path = remote_initdb_archive_path(tenant_id, timeline_id); // existing initdb archive
+    let dest_path = remote_initdb_preserved_archive_path(tenant_id, timeline_id); // its "preserved" copy
+    upload_cancellable(cancel, storage.copy_object(&source_path, &dest_path)) // no retry loop in this function
+        .await
+        .with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'"))
+}
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -9,6 +9,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::TENANT_TASK_EVENTS;
 use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
+use crate::tenant::timeline::CompactionError;
 use crate::tenant::{Tenant, TenantState};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -181,8 +182,11 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                    );
                    error_run_count += 1;
                    let wait_duration = Duration::from_secs_f64(wait_duration);
-                    error!(
-                        "Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
+                    log_compaction_error(
+                        &e,
+                        error_run_count,
+                        &wait_duration,
+                        cancel.is_cancelled(),
                    );
                    wait_duration
                } else {
@@ -210,6 +214,58 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
 }

+fn log_compaction_error(
+    e: &CompactionError,
+    error_run_count: u32,
+    sleep_duration: &std::time::Duration,
+    task_cancelled: bool,
+) {
+    use crate::tenant::upload_queue::NotInitialized;
+    use crate::tenant::PageReconstructError;
+    use CompactionError::*;
+
+    enum LooksLike {
+        Info,
+        Error,
+    }
+
+    let decision = match e {
+        ShuttingDown => None,
+        _ if task_cancelled => Some(LooksLike::Info),
+        Other(e) => {
+            let root_cause = e.root_cause();
+
+            let is_stopping = {
+                let upload_queue = root_cause
+                    .downcast_ref::<NotInitialized>()
+                    .is_some_and(|e| e.is_stopping());
+
+                let timeline = root_cause
+                    .downcast_ref::<PageReconstructError>()
+                    .is_some_and(|e| e.is_stopping());
+
+                upload_queue || timeline
+            };
+
+            if is_stopping {
+                Some(LooksLike::Info)
+            } else {
+                Some(LooksLike::Error)
+            }
+        }
+    };
+
+    match decision {
+        Some(LooksLike::Info) => info!(
+            "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:#}",
+        ),
+        Some(LooksLike::Error) => error!(
+            "Compaction failed {error_run_count} times, retrying in {sleep_duration:?}: {e:?}",
+        ),
+        None => {}
+    }
+}
+
 ///
 /// GC task's main loop
 ///
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -14,6 +14,7 @@ use enumset::EnumSet;
 use fail::fail_point;
 use itertools::Itertools;
 use pageserver_api::{
+    keyspace::{key_range_size, KeySpaceAccum},
    models::{
        DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
        LayerMapInfo, TimelineState,
@@ -32,7 +33,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::sync::gate::Gate;

-use std::collections::{BinaryHeap, HashMap, HashSet};
+use std::collections::{BTreeMap, BinaryHeap, HashMap, HashSet};
 use std::ops::{Deref, Range};
 use std::pin::pin;
 use std::sync::atomic::Ordering as AtomicOrdering;
@@ -73,8 +74,8 @@ use crate::metrics::{
    TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
 };
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
-use crate::pgdatadir_mapping::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
 use crate::tenant::config::TenantConfOpt;
+use pageserver_api::key::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
 use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::ShardIndex;

@@ -391,8 +392,7 @@ pub(crate) enum PageReconstructError {
    #[error("Ancestor LSN wait error: {0}")]
    AncestorLsnTimeout(#[from] WaitLsnError),

-    /// The operation was cancelled
-    #[error("Cancelled")]
+    #[error("timeline shutting down")]
    Cancelled,

    /// The ancestor of this is being stopped
@@ -404,6 +404,34 @@ pub(crate) enum PageReconstructError {
    WalRedo(anyhow::Error),
 }

+impl PageReconstructError {
+    /// Returns true if this error indicates a tenant/timeline shutdown-like situation
+    pub(crate) fn is_stopping(&self) -> bool {
+        use PageReconstructError::*;
+        match self {
+            Other(_) => false,
+            AncestorLsnTimeout(_) => false,
+            Cancelled | AncestorStopping(_) => true,
+            WalRedo(_) => false,
+        }
+    }
+}
+
+#[derive(thiserror::Error, Debug)]
+enum CreateImageLayersError {
+    #[error("timeline shutting down")]
+    Cancelled,
+
+    #[error(transparent)]
+    GetVectoredError(GetVectoredError),
+
+    #[error(transparent)]
+    PageReconstructError(PageReconstructError),
+
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
 #[derive(thiserror::Error, Debug)]
 enum FlushLayerError {
    /// Timeline cancellation token was cancelled
@@ -411,12 +439,24 @@ enum FlushLayerError {
    Cancelled,

    #[error(transparent)]
-    PageReconstructError(#[from] PageReconstructError),
+    CreateImageLayersError(CreateImageLayersError),

    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }

+#[derive(thiserror::Error, Debug)]
+pub(crate) enum GetVectoredError {
+    #[error("timeline shutting down")]
+    Cancelled,
+
+    #[error("Requested too many keys: {0} > {}", Timeline::MAX_GET_VECTORED_KEYS)]
+    Oversized(u64),
+
+    #[error("Requested at invalid LSN: {0}")]
+    InvalidLsn(Lsn),
+}
+
 #[derive(Clone, Copy)]
 pub enum LogicalSizeCalculationCause {
    Initial,
@@ -456,6 +496,45 @@ pub(crate) enum WaitLsnError {
    Timeout(String),
 }

+// The impls below achieve cancellation mapping for errors.
+// Perhaps there's a way of achieving this with less cruft.
+
+impl From<CreateImageLayersError> for CompactionError {
+    fn from(e: CreateImageLayersError) -> Self {
+        match e {
+            CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
+            _ => CompactionError::Other(e.into()),
+        }
+    }
+}
+
+impl From<CreateImageLayersError> for FlushLayerError {
+    fn from(e: CreateImageLayersError) -> Self {
+        match e {
+            CreateImageLayersError::Cancelled => FlushLayerError::Cancelled,
+            any => FlushLayerError::CreateImageLayersError(any),
+        }
+    }
+}
+
+impl From<PageReconstructError> for CreateImageLayersError {
+    fn from(e: PageReconstructError) -> Self {
+        match e {
+            PageReconstructError::Cancelled => CreateImageLayersError::Cancelled,
+            _ => CreateImageLayersError::PageReconstructError(e),
+        }
+    }
+}
+
+impl From<GetVectoredError> for CreateImageLayersError {
+    fn from(e: GetVectoredError) -> Self {
+        match e {
+            GetVectoredError::Cancelled => CreateImageLayersError::Cancelled,
+            _ => CreateImageLayersError::GetVectoredError(e),
+        }
+    }
+}
+
 /// Public interface functions
 impl Timeline {
    /// Get the LSN where this branch was created
@@ -575,6 +654,53 @@ impl Timeline {
        res
    }

+    pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32;
+
+    /// Look up multiple page versions at a given LSN, one `get` call per key.
+    /// Per-key errors are returned in the map, except shutdown-like ones, which
+    /// abort the whole call with `GetVectoredError::Cancelled`. This naive
+    /// implementation will be replaced by one that vectorizes the read path.
+    pub(crate) async fn get_vectored(
+        &self,
+        key_ranges: &[Range<Key>],
+        lsn: Lsn,
+        ctx: &RequestContext,
+    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
+        if !lsn.is_valid() {
+            return Err(GetVectoredError::InvalidLsn(lsn));
+        }
+
+        let key_count = key_ranges
+            .iter()
+            .map(|range| key_range_size(range) as u64)
+            .sum();
+        if key_count > Timeline::MAX_GET_VECTORED_KEYS {
+            return Err(GetVectoredError::Oversized(key_count));
+        }
+
+        let mut values = BTreeMap::new();
+        for range in key_ranges {
+            let mut key = range.start;
+            while key < range.end { // `<`, not `!=`: an inverted range must not loop forever
+                assert!(!self.shard_identity.is_key_disposable(&key));
+
+                let block = self.get(key, lsn, ctx).await;
+
+                if matches!(
+                    block,
+                    Err(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
+                ) {
+                    return Err(GetVectoredError::Cancelled);
+                }
+
+                values.insert(key, block);
+                key = key.next();
+            }
+        }
+
+        Ok(values)
+    }
+
    /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
    pub fn get_last_record_lsn(&self) -> Lsn {
        self.last_record_lsn.load().last
@@ -2582,7 +2708,7 @@ impl Timeline {
                        return;
                    }
                    err @ Err(
-                        FlushLayerError::Other(_) | FlushLayerError::PageReconstructError(_),
+                        FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_),
                    ) => {
                        error!("could not flush frozen layer: {err:?}");
                        break err;
@@ -2859,6 +2985,21 @@ impl Timeline {
        Ok(())
    }

+    pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> {
+        if let Some(remote_client) = &self.remote_client {
+            remote_client
+                .preserve_initdb_archive(
+                    &self.tenant_shard_id.tenant_id,
+                    &self.timeline_id,
+                    &self.cancel,
+                )
+                .await?;
+        } else {
+            bail!("No remote storage configured, but was asked to backup the initdb archive for {} / {}", self.tenant_shard_id.tenant_id, self.timeline_id);
+        }
+        Ok(())
+    }
+
    // Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked
    // in layer map immediately. The caller is responsible to put it into the layer map.
    async fn create_delta_layer(
@@ -2950,11 +3091,7 @@ impl Timeline {
    }

    // Is it time to create a new image layer for the given partition?
-    async fn time_for_new_image_layer(
-        &self,
-        partition: &KeySpace,
-        lsn: Lsn,
-    ) -> anyhow::Result<bool> {
+    async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
        let threshold = self.get_image_creation_threshold();

        let guard = self.layers.read().await;
@@ -2974,20 +3111,20 @@ impl Timeline {
                    // but the range is already covered by image layers at more recent LSNs. Before we
                    // create a new image layer, check if the range is already covered at more recent LSNs.
                    if !layers
-                        .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))?
+                        .image_layer_exists(&img_range, &(Lsn::min(lsn, *cutoff_lsn)..lsn + 1))
                    {
                        debug!(
                            "Force generation of layer {}-{} wanted by GC, cutoff={}, lsn={})",
                            img_range.start, img_range.end, cutoff_lsn, lsn
                        );
-                        return Ok(true);
+                        return true;
                    }
                }
            }
        }

        for part_range in &partition.ranges {
-            let image_coverage = layers.image_coverage(part_range, lsn)?;
+            let image_coverage = layers.image_coverage(part_range, lsn);
            for (img_range, last_img) in image_coverage {
                let img_lsn = if let Some(last_img) = last_img {
                    last_img.get_lsn_range().end
@@ -3008,7 +3145,7 @@ impl Timeline {
                // after we read last_record_lsn, which is passed here in the 'lsn' argument.
                if img_lsn < lsn {
                    let num_deltas =
-                        layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?;
+                        layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold));

                    max_deltas = max_deltas.max(num_deltas);
                    if num_deltas >= threshold {
@@ -3016,7 +3153,7 @@ impl Timeline {
                            "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
                            img_range.start, img_range.end, num_deltas, img_lsn, lsn
                        );
-                        return Ok(true);
+                        return true;
                    }
                }
            }
@@ -3026,7 +3163,7 @@ impl Timeline {
            max_deltas,
            "none of the partitioned ranges had >= {threshold} deltas"
        );
-        Ok(false)
+        false
    }

    #[tracing::instrument(skip_all, fields(%lsn, %force))]
@@ -3036,7 +3173,7 @@ impl Timeline {
        lsn: Lsn,
        force: bool,
        ctx: &RequestContext,
-    ) -> Result<Vec<ResidentLayer>, PageReconstructError> {
+    ) -> Result<Vec<ResidentLayer>, CreateImageLayersError> {
        let timer = self.metrics.create_images_time_histo.start_timer();
        let mut image_layers = Vec::new();

@@ -3054,7 +3191,7 @@ impl Timeline {
        for partition in partitioning.parts.iter() {
            let img_range = start..partition.ranges.last().unwrap().end;
            start = img_range.end;
-            if force || self.time_for_new_image_layer(partition, lsn).await? {
+            if force || self.time_for_new_image_layer(partition, lsn).await {
                let mut image_layer_writer = ImageLayerWriter::new(
                    self.conf,
                    self.timeline_id,
@@ -3065,10 +3202,12 @@ impl Timeline {
                .await?;

                fail_point!("image-layer-writer-fail-before-finish", |_| {
-                    Err(PageReconstructError::Other(anyhow::anyhow!(
+                    Err(CreateImageLayersError::Other(anyhow::anyhow!(
                        "failpoint image-layer-writer-fail-before-finish"
                    )))
                });
+
+                let mut key_request_accum = KeySpaceAccum::new();
                for range in &partition.ranges {
                    let mut key = range.start;
                    while key < range.end {
@@ -3081,34 +3220,55 @@ impl Timeline {
                            key = key.next();
                            continue;
                        }
-                        let img = match self.get(key, lsn, ctx).await {
-                            Ok(img) => img,
-                            Err(err) => {
-                                // If we fail to reconstruct a VM or FSM page, we can zero the
-                                // page without losing any actual user data. That seems better
-                                // than failing repeatedly and getting stuck.
-                                //
-                                // We had a bug at one point, where we truncated the FSM and VM
-                                // in the pageserver, but the Postgres didn't know about that
-                                // and continued to generate incremental WAL records for pages
-                                // that didn't exist in the pageserver. Trying to replay those
-                                // WAL records failed to find the previous image of the page.
-                                // This special case allows us to recover from that situation.
-                                // See https://github.com/neondatabase/neon/issues/2601.
-                                //
-                                // Unfortunately we cannot do this for the main fork, or for
-                                // any metadata keys, keys, as that would lead to actual data
-                                // loss.
-                                if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
-                                    warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
-                                    ZERO_PAGE.clone()
-                                } else {
-                                    return Err(err);
-                                }
-                            }
-                        };

-                        image_layer_writer.put_image(key, &img).await?;
+                        key_request_accum.add_key(key);
+                        if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS
+                            || key.next() == range.end
+                        {
+                            let results = self
+                                .get_vectored(
+                                    &key_request_accum.consume_keyspace().ranges,
+                                    lsn,
+                                    ctx,
+                                )
+                                .await?;
+
+                            for (img_key, img) in results {
+                                let img = match img {
+                                    Ok(img) => img,
+                                    Err(err) => {
+                                        // If we fail to reconstruct a VM or FSM page, we can zero the
+                                        // page without losing any actual user data. That seems better
+                                        // than failing repeatedly and getting stuck.
+                                        //
+                                        // We had a bug at one point, where we truncated the FSM and VM
+                                        // in the pageserver, but the Postgres didn't know about that
+                                        // and continued to generate incremental WAL records for pages
+                                        // that didn't exist in the pageserver. Trying to replay those
+                                        // WAL records failed to find the previous image of the page.
+                                        // This special case allows us to recover from that situation.
+                                        // See https://github.com/neondatabase/neon/issues/2601.
+                                        //
+                                        // Unfortunately we cannot do this for the main fork, or for
+                                        // any metadata keys, as that would lead to actual data
+                                        // loss.
+                                        if is_rel_fsm_block_key(img_key)
+                                            || is_rel_vm_block_key(img_key)
+                                        {
+                                            warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
+                                            ZERO_PAGE.clone()
+                                        } else {
+                                            return Err(
+                                                CreateImageLayersError::PageReconstructError(err),
+                                            );
+                                        }
+                                    }
+                                };
+
+                                image_layer_writer.put_image(img_key, &img).await?;
+                            }
+                        }
+
                        key = key.next();
                    }
                }
@@ -3484,7 +3644,7 @@ impl Timeline {
                    // has not so much sense, because largest holes will corresponds field1/field2 changes.
                    // But we are mostly interested to eliminate holes which cause generation of excessive image layers.
                    // That is why it is better to measure size of hole as number of covering image layers.
-                    let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len();
+                    let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len();
                    if coverage_size >= min_hole_coverage_size {
                        heap.push(Hole {
                            key_range,
@@ -4110,7 +4270,7 @@ impl Timeline {
            // we cannot remove C, even though it's older than 2500, because
            // the delta layer 2000-3000 depends on it.
            if !layers
-                .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))?
+                .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))
            {
                debug!("keeping {} because it is the latest layer", l.filename());
                // Collect delta key ranges that need image layers to allow garbage
@@ -4240,7 +4400,7 @@ impl Timeline {
                    .walredo_mgr
                    .request_redo(key, request_lsn, data.img, data.records, self.pg_version)
                    .await
-                    .context("Failed to reconstruct a page image:")
+                    .context("reconstruct a page image")
                {
                    Ok(img) => img,
                    Err(e) => return Err(PageReconstructError::WalRedo(e)),
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -126,6 +126,27 @@ pub(super) struct UploadQueueStopped {
    pub(super) deleted_at: SetDeletedFlagProgress,
 }

+#[derive(thiserror::Error, Debug)]
+pub(crate) enum NotInitialized {
+    #[error("queue is in state Uninitialized")]
+    Uninitialized,
+    #[error("queue is in state Stopped")]
+    Stopped,
+    #[error("queue is shutting down")]
+    ShuttingDown,
+}
+
+impl NotInitialized {
+    pub(crate) fn is_stopping(&self) -> bool {
+        use NotInitialized::*;
+        match self {
+            Uninitialized => false,
+            Stopped => true,
+            ShuttingDown => true,
+        }
+    }
+}
+
 impl UploadQueue {
    pub(crate) fn initialize_empty_remote(
        &mut self,
@@ -214,17 +235,17 @@ impl UploadQueue {
    }

    pub(crate) fn initialized_mut(&mut self) -> anyhow::Result<&mut UploadQueueInitialized> {
+        use UploadQueue::*;
        match self {
-            UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
-                anyhow::bail!("queue is in state {}", self.as_str())
-            }
-            UploadQueue::Initialized(x) => {
-                if !x.shutting_down {
-                    Ok(x)
+            Uninitialized => Err(NotInitialized::Uninitialized.into()),
+            Initialized(x) => {
+                if x.shutting_down {
+                    Err(NotInitialized::ShuttingDown.into())
                } else {
-                    anyhow::bail!("queue is shutting down")
+                    Ok(x)
                }
            }
+            Stopped(_) => Err(NotInitialized::Stopped.into()),
        }
    }

--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -356,12 +356,7 @@ impl VirtualFile {
        Ok(vfile)
    }

-    /// Writes a file to the specified `final_path` in a crash safe fasion
-    ///
-    /// The file is first written to the specified tmp_path, and in a second
-    /// step, the tmp path is renamed to the final path. As renames are
-    /// atomic, a crash during the write operation will never leave behind a
-    /// partially written file.
+    /// Async & [`VirtualFile`]-enabled version of [`::utils::crashsafe::overwrite`].
    pub async fn crashsafe_overwrite(
        final_path: &Utf8Path,
        tmp_path: &Utf8Path,
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -33,11 +33,12 @@ use utils::failpoint_support;

 use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;
-use crate::pgdatadir_mapping::*;
+use crate::pgdatadir_mapping::{DatadirModification, Version};
 use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use crate::walrecord::*;
 use crate::ZERO_PAGE;
+use pageserver_api::key::rel_block_to_key;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -47,11 +47,10 @@ use crate::metrics::{
    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM,
    WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
 };
-use crate::pgdatadir_mapping::key_to_slru_block;
 use crate::repository::Key;
 use crate::walrecord::NeonWalRecord;

-use pageserver_api::key::key_to_rel_block;
+use pageserver_api::key::{key_to_rel_block, key_to_slru_block};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
@@ -837,9 +836,8 @@ impl WalRedoProcess {
        let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
        let mut nwrite = 0usize;

-        let mut stdin_pollfds = [PollFd::new(proc.stdin.as_raw_fd(), PollFlags::POLLOUT)];
-
        while nwrite < writebuf.len() {
+            let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
            let n = loop {
                match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
                    Err(nix::errno::Errno::EINTR) => continue,
@@ -878,7 +876,6 @@ impl WalRedoProcess {
        // advancing processed responses number.

        let mut output = self.stdout.lock().unwrap();
-        let mut stdout_pollfds = [PollFd::new(output.stdout.as_raw_fd(), PollFlags::POLLIN)];
        let n_processed_responses = output.n_processed_responses;
        while n_processed_responses + output.pending_responses.len() <= request_no {
            // We expect the WAL redo process to respond with an 8k page image. We read it
@@ -886,6 +883,7 @@ impl WalRedoProcess {
            let mut resultbuf = vec![0; BLCKSZ.into()];
            let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
            while nresult < BLCKSZ.into() {
+                let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
                // We do two things simultaneously: reading response from stdout
                // and forward any logging information that the child writes to its stderr to the page server's log.
                let n = loop {
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -637,7 +637,7 @@ HandleAlterRole(AlterRoleStmt *stmt)
 	ListCell   *option;
 	const char *role_name = stmt->role->rolename;

-	if (RoleIsNeonSuperuser(role_name))
+	if (RoleIsNeonSuperuser(role_name) && !superuser())
 		elog(ERROR, "can't ALTER neon_superuser");

 	foreach(option, stmt->options)
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -15,6 +15,7 @@
 #include "postgres.h"

 #include "access/xlog.h"
+#include "common/hashfn.h"
 #include "fmgr.h"
 #include "libpq-fe.h"
 #include "libpq/libpq.h"
@@ -38,17 +39,6 @@
 #define MIN_RECONNECT_INTERVAL_USEC 1000
 #define MAX_RECONNECT_INTERVAL_USEC 1000000

-bool		connected = false;
-PGconn	   *pageserver_conn = NULL;
-
-/*
- * WaitEventSet containing:
- * - WL_SOCKET_READABLE on pageserver_conn,
- * - WL_LATCH_SET on MyLatch, and
- * - WL_EXIT_ON_PM_DEATH.
- */
-WaitEventSet *pageserver_conn_wes = NULL;
-
 /* GUCs */
 char	   *neon_timeline;
 char	   *neon_tenant;
@@ -59,16 +49,40 @@ char	   *neon_auth_token;
 int			readahead_buffer_size = 128;
 int			flush_every_n_requests = 8;

-static int n_reconnect_attempts = 0;
-static int max_reconnect_attempts = 60;
-
-#define MAX_PAGESERVER_CONNSTRING_SIZE 256
+static int	n_reconnect_attempts = 0;
+static int	max_reconnect_attempts = 60;
+static int	stripe_size;

 typedef struct
 {
-	LWLockId	lock;
-	pg_atomic_uint64 update_counter;
-	char		pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
+	char		connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE];
+	size_t		num_shards;
+} ShardMap;
+
+/*
+ * PagestoreShmemState is kept in shared memory. It contains the connection
+ * strings for each shard.
+ *
+ * The "neon.pageserver_connstring" GUC is marked with the PGC_SIGHUP option,
+ * allowing it to be changed using pg_reload_conf(). The control plane can
+ * update the connection string if the pageserver crashes, is relocated, or
+ * new shards are added. A parsed copy of the current value of the GUC is kept
+ * in shared memory, updated by the postmaster, because regular backends don't
+ * reload the config during query execution, but we might need to re-establish
+ * the pageserver connection with the new connection string even in the middle
+ * of a query.
+ *
+ * The shared memory copy is protected by a lockless algorithm using two
+ * atomic counters. The counters allow a backend to quickly check if the value
+ * has changed since last access, and to detect and retry copying the value if
+ * the postmaster changes the value concurrently. (Postmaster doesn't have a
+ * PGPROC entry and therefore cannot use LWLocks.)
+ */
+typedef struct
+{
+	pg_atomic_uint64 begin_update_counter;
+	pg_atomic_uint64 end_update_counter;
+	ShardMap	shard_map;
 } PagestoreShmemState;

 #if PG_VERSION_NUM >= 150000
@@ -78,76 +92,242 @@ static void walproposer_shmem_request(void);
 static shmem_startup_hook_type prev_shmem_startup_hook;
 static PagestoreShmemState *pagestore_shared;
 static uint64 pagestore_local_counter = 0;
-static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];

-static bool pageserver_flush(void);
-static void pageserver_disconnect(void);
+/* This backend's per-shard connections */
+typedef struct
+{
+	PGconn	   *conn;
+
+	/*---
+	 * WaitEventSet containing:
+	 * - WL_SOCKET_READABLE on 'conn'
+	 * - WL_LATCH_SET on MyLatch, and
+	 * - WL_EXIT_ON_PM_DEATH.
+	 */
+	WaitEventSet *wes;
+} PageServer;
+
+static PageServer page_servers[MAX_SHARDS];
+
+static bool pageserver_flush(shardno_t shard_no);
+static void pageserver_disconnect(shardno_t shard_no);

 static bool
-PagestoreShmemIsValid()
+PagestoreShmemIsValid(void)
 {
 	return pagestore_shared && UsedShmemSegAddr;
 }

+/*
+ * Parse a comma-separated list of connection strings into a ShardMap.
+ *
+ * If 'result' is NULL, just checks that the input is valid. If the input is
+ * not valid, returns false. The contents of *result are undefined in
+ * that case, and must not be relied on.
+ */
+static bool
+ParseShardMap(const char *connstr, ShardMap *result)
+{
+	const char *p;
+	int			nshards = 0;
+
+	if (result)
+		memset(result, 0, sizeof(ShardMap));
+
+	p = connstr;
+	nshards = 0;
+	for (;;)
+	{
+		const char *sep;
+		size_t		connstr_len;
+
+		sep = strchr(p, ',');
+		connstr_len = sep != NULL ? sep - p : strlen(p);
+
+		if (connstr_len == 0 && sep == NULL)
+			break;				/* ignore trailing comma */
+
+		if (nshards >= MAX_SHARDS)
+		{
+			neon_log(LOG, "Too many shards");
+			return false;
+		}
+		if (connstr_len >= MAX_PAGESERVER_CONNSTRING_SIZE)
+		{
+			neon_log(LOG, "Connection string too long");
+			return false;
+		}
+		if (result)
+		{
+			memcpy(result->connstring[nshards], p, connstr_len);
+			result->connstring[nshards][connstr_len] = '\0';
+		}
+		nshards++;
+
+		if (sep == NULL)
+			break;
+		p = sep + 1;
+	}
+	if (result)
+		result->num_shards = nshards;
+
+	return true;
+}
+
 static bool
 CheckPageserverConnstring(char **newval, void **extra, GucSource source)
 {
-	return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
+	char	   *p = *newval;
+
+	return ParseShardMap(p, NULL);
 }

 static void
 AssignPageserverConnstring(const char *newval, void *extra)
 {
-	if (!PagestoreShmemIsValid())
+	ShardMap	shard_map;
+
+	/*
+	 * Only postmaster updates the copy in shared memory.
+	 */
+	if (!PagestoreShmemIsValid() || IsUnderPostmaster)
 		return;
-	LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
-	strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
-	pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
-	LWLockRelease(pagestore_shared->lock);
-}
-
-static bool
-CheckConnstringUpdated()
-{
-	if (!PagestoreShmemIsValid())
-		return false;
-	return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
+
+	if (!ParseShardMap(newval, &shard_map))
+	{
+		/*
+		 * shouldn't happen, because we already checked the value in
+		 * CheckPageserverConnstring
+		 */
+		elog(ERROR, "could not parse shard map");
+	}
+
+	if (memcmp(&pagestore_shared->shard_map, &shard_map, sizeof(ShardMap)) != 0)
+	{
+		pg_atomic_add_fetch_u64(&pagestore_shared->begin_update_counter, 1);
+		pg_write_barrier();
+		memcpy(&pagestore_shared->shard_map, &shard_map, sizeof(ShardMap));
+		pg_write_barrier();
+		pg_atomic_add_fetch_u64(&pagestore_shared->end_update_counter, 1);
+	}
+	else
+	{
+		/* no change */
+	}
 }

+/*
+ * Get the current number of shards, and/or the connection string for a
+ * particular shard from the shard map in shared memory.
+ *
+ * If num_shards_p is not NULL, it is set to the current number of shards.
+ *
+ * If connstr_p is not NULL, the connection string for 'shard_no' is copied to
+ * it. It must point to a buffer at least MAX_PAGESERVER_CONNSTRING_SIZE bytes
+ * long.
+ *
+ * As a side-effect, if the shard map in shared memory had changed since the
+ * last call, terminates all existing connections to all pageservers.
+ */
 static void
-ReloadConnstring()
+load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p)
 {
-	if (!PagestoreShmemIsValid())
-		return;
-	LWLockAcquire(pagestore_shared->lock, LW_SHARED);
-	strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
-	pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
-	LWLockRelease(pagestore_shared->lock);
+	uint64		begin_update_counter;
+	uint64		end_update_counter;
+	ShardMap   *shard_map = &pagestore_shared->shard_map;
+	shardno_t	num_shards;
+
+	/*
+	 * Postmaster can update the shared memory values concurrently, in which
+	 * case we would copy a garbled mix of the old and new values. We will
+	 * detect it because the counters won't match, and retry. But it's
+	 * important that we don't do anything within the retry-loop that would
+	 * depend on the string having valid contents.
+	 */
+	do
+	{
+		begin_update_counter = pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
+		end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter);
+
+		num_shards = shard_map->num_shards;
+		if (connstr_p && shard_no < MAX_SHARDS)
+			strlcpy(connstr_p, shard_map->connstring[shard_no], MAX_PAGESERVER_CONNSTRING_SIZE);
+		pg_memory_barrier();
+	}
+	while (begin_update_counter != end_update_counter
+		   || begin_update_counter != pg_atomic_read_u64(&pagestore_shared->begin_update_counter)
+		   || end_update_counter != pg_atomic_read_u64(&pagestore_shared->end_update_counter));
+
+	if (connstr_p && shard_no >= num_shards)
+		neon_log(ERROR, "Shard %d is greater or equal than number of shards %d",
+				 shard_no, num_shards);
+
+	/*
+	 * If any of the connection strings changed, reset all connections.
+	 */
+	if (pagestore_local_counter != end_update_counter)
+	{
+		for (shardno_t i = 0; i < MAX_SHARDS; i++)
+		{
+			if (page_servers[i].conn)
+				pageserver_disconnect(i);
+		}
+		pagestore_local_counter = end_update_counter;
+	}
+
+	if (num_shards_p)
+		*num_shards_p = num_shards;
+}
+
+#define MB (1024*1024)
+
+shardno_t
+get_shard_number(BufferTag *tag)
+{
+	shardno_t	n_shards;
+	uint32		hash;
+
+	load_shard_map(0, NULL, &n_shards);
+
+#if PG_MAJORVERSION_NUM < 16
+	hash = murmurhash32(tag->rnode.relNode);
+	hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
+#else
+	hash = murmurhash32(tag->relNumber);
+	hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
+#endif
+
+	return hash % n_shards;
 }

 static bool
-pageserver_connect(int elevel)
+pageserver_connect(shardno_t shard_no, int elevel)
 {
 	char	   *query;
 	int			ret;
 	const char *keywords[3];
 	const char *values[3];
 	int			n;
+	PGconn	   *conn;
+	WaitEventSet *wes;
+	char		connstr[MAX_PAGESERVER_CONNSTRING_SIZE];

 	static TimestampTz last_connect_time = 0;
 	static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
 	TimestampTz now;
-        uint64_t us_since_last_connect;
+	uint64_t	us_since_last_connect;

-	Assert(!connected);
+	Assert(page_servers[shard_no].conn == NULL);

-	if (CheckConnstringUpdated())
-	{
-		ReloadConnstring();
-	}
+	/*
+	 * Get the connection string for this shard. If the shard map has been
+	 * updated since we last looked, this will also disconnect any existing
+	 * pageserver connections as a side effect.
+	 */
+	load_shard_map(shard_no, connstr, NULL);

 	now = GetCurrentTimestamp();
-        us_since_last_connect = now - last_connect_time;
+	us_since_last_connect = now - last_connect_time;
 	if (us_since_last_connect < delay_us)
 	{
 		pg_usleep(delay_us - us_since_last_connect);
@@ -180,76 +360,84 @@ pageserver_connect(int elevel)
 		n++;
 	}
 	keywords[n] = "dbname";
-	values[n] = local_pageserver_connstring;
+	values[n] = connstr;
 	n++;
 	keywords[n] = NULL;
 	values[n] = NULL;
 	n++;
-	pageserver_conn = PQconnectdbParams(keywords, values, 1);
+	conn = PQconnectdbParams(keywords, values, 1);

-	if (PQstatus(pageserver_conn) == CONNECTION_BAD)
+	if (PQstatus(conn) == CONNECTION_BAD)
 	{
-		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+		char	   *msg = pchomp(PQerrorMessage(conn));

-		PQfinish(pageserver_conn);
-		pageserver_conn = NULL;
+		PQfinish(conn);

 		ereport(elevel,
 				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
-				 errmsg(NEON_TAG "could not establish connection to pageserver"),
+				 errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no),
 				 errdetail_internal("%s", msg)));
+		pfree(msg);
 		return false;
 	}
-
 	query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
-	ret = PQsendQuery(pageserver_conn, query);
+	ret = PQsendQuery(conn, query);
+	pfree(query);
 	if (ret != 1)
 	{
-		PQfinish(pageserver_conn);
-		pageserver_conn = NULL;
-		neon_log(elevel, "could not send pagestream command to pageserver");
+		PQfinish(conn);
+		neon_shard_log(shard_no, elevel, "could not send pagestream command to pageserver");
 		return false;
 	}

-	pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
-	AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
+	wes = CreateWaitEventSet(TopMemoryContext, 3);
+	AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
 					  MyLatch, NULL);
-	AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+	AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
 					  NULL, NULL);
-	AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);
+	AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);

-	while (PQisBusy(pageserver_conn))
+	PG_TRY();
 	{
-		WaitEvent	event;
-
-		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
-		ResetLatch(MyLatch);
-
-		CHECK_FOR_INTERRUPTS();
-
-		/* Data available in socket? */
-		if (event.events & WL_SOCKET_READABLE)
+		while (PQisBusy(conn))
 		{
-			if (!PQconsumeInput(pageserver_conn))
+			WaitEvent	event;
+
+			/* Sleep until there's something to do */
+			(void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+			ResetLatch(MyLatch);
+
+			CHECK_FOR_INTERRUPTS();
+
+			/* Data available in socket? */
+			if (event.events & WL_SOCKET_READABLE)
 			{
-				char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+				if (!PQconsumeInput(conn))
+				{
+					char	   *msg = pchomp(PQerrorMessage(conn));

-				PQfinish(pageserver_conn);
-				pageserver_conn = NULL;
-				FreeWaitEventSet(pageserver_conn_wes);
-				pageserver_conn_wes = NULL;
+					PQfinish(conn);
+					FreeWaitEventSet(wes);

-				neon_log(elevel, "could not complete handshake with pageserver: %s",
-						 msg);
-				return false;
+					neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
+								   msg);
+					return false;
+				}
 			}
 		}
 	}
+	PG_CATCH();
+	{
+		PQfinish(conn);
+		FreeWaitEventSet(wes);
+		PG_RE_THROW();
+	}
+	PG_END_TRY();

-	neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring);
+	neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr);
+	page_servers[shard_no].conn = conn;
+	page_servers[shard_no].wes = wes;

-	connected = true;
 	return true;
 }

@@ -257,9 +445,10 @@ pageserver_connect(int elevel)
 * A wrapper around PQgetCopyData that checks for interrupts while sleeping.
 */
 static int
-call_PQgetCopyData(char **buffer)
+call_PQgetCopyData(shardno_t shard_no, char **buffer)
 {
 	int			ret;
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;

 retry:
 	ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
@@ -269,7 +458,7 @@ retry:
 		WaitEvent	event;

 		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+		(void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);

 		CHECK_FOR_INTERRUPTS();
@@ -281,7 +470,7 @@ retry:
 			{
 				char	   *msg = pchomp(PQerrorMessage(pageserver_conn));

-				neon_log(LOG, "could not get response from pageserver: %s", msg);
+				neon_shard_log(shard_no, LOG, "could not get response from pageserver: %s", msg);
 				pfree(msg);
 				return -1;
 			}
@@ -295,7 +484,7 @@ retry:


 static void
-pageserver_disconnect(void)
+pageserver_disconnect(shardno_t shard_no)
 {
 	/*
 	 * If anything goes wrong while we were sending a request, it's not clear
@@ -304,38 +493,38 @@ pageserver_disconnect(void)
 	 * time later after we have already sent a new unrelated request. Close
 	 * the connection to avoid getting confused.
 	 */
-	if (connected)
+	if (page_servers[shard_no].conn)
 	{
-		neon_log(LOG, "dropping connection to page server due to error");
-		PQfinish(pageserver_conn);
-		pageserver_conn = NULL;
-		connected = false;
+		neon_shard_log(shard_no, LOG, "dropping connection to page server due to error");
+		PQfinish(page_servers[shard_no].conn);
+		page_servers[shard_no].conn = NULL;

+		/*
+		 * If the connection to any pageserver is lost, we throw away the
+		 * whole prefetch queue, even for other pageservers. It should not
+		 * cause big problems, because connection loss is supposed to be a
+		 * rare event.
+		 */
 		prefetch_on_ps_disconnect();
 	}
-	if (pageserver_conn_wes != NULL)
+	if (page_servers[shard_no].wes != NULL)
 	{
-		FreeWaitEventSet(pageserver_conn_wes);
-		pageserver_conn_wes = NULL;
+		FreeWaitEventSet(page_servers[shard_no].wes);
+		page_servers[shard_no].wes = NULL;
 	}
 }

 static bool
-pageserver_send(NeonRequest *request)
+pageserver_send(shardno_t shard_no, NeonRequest *request)
 {
 	StringInfoData req_buff;
-
-	if (CheckConnstringUpdated())
-	{
-		pageserver_disconnect();
-		ReloadConnstring();
-	}
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;

 	/* If the connection was lost for some reason, reconnect */
-	if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
+	if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
 	{
-		neon_log(LOG, "pageserver_send disconnect bad connection");
-		pageserver_disconnect();
+		neon_shard_log(shard_no, LOG, "pageserver_send disconnect bad connection");
+		pageserver_disconnect(shard_no);
 	}

 	req_buff = nm_pack_request(request);
@@ -349,9 +538,9 @@ pageserver_send(NeonRequest *request)
 	 * https://github.com/neondatabase/neon/issues/1138 So try to reestablish
 	 * connection in case of failure.
 	 */
-	if (!connected)
+	if (!page_servers[shard_no].conn)
 	{
-		while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
+		while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
 		{
 			HandleMainLoopInterrupts();
 			n_reconnect_attempts += 1;
@@ -359,6 +548,8 @@ pageserver_send(NeonRequest *request)
 		n_reconnect_attempts = 0;
 	}

+	pageserver_conn = page_servers[shard_no].conn;
+
 	/*
 	 * Send request.
 	 *
@@ -371,8 +562,8 @@ pageserver_send(NeonRequest *request)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));

-		pageserver_disconnect();
-		neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
+		pageserver_disconnect(shard_no);
+		neon_shard_log(shard_no, LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
 		pfree(msg);
 		pfree(req_buff.data);
 		return false;
@@ -384,19 +575,20 @@ pageserver_send(NeonRequest *request)
 	{
 		char	   *msg = nm_to_string((NeonMessage *) request);

-		neon_log(PageStoreTrace, "sent request: %s", msg);
+		neon_shard_log(shard_no, PageStoreTrace, "sent request: %s", msg);
 		pfree(msg);
 	}
 	return true;
 }

 static NeonResponse *
-pageserver_receive(void)
+pageserver_receive(shardno_t shard_no)
 {
 	StringInfoData resp_buff;
 	NeonResponse *resp;
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;

-	if (!connected)
+	if (!pageserver_conn)
 		return NULL;

 	PG_TRY();
@@ -404,7 +596,7 @@ pageserver_receive(void)
 		/* read response */
 		int			rc;

-		rc = call_PQgetCopyData(&resp_buff.data);
+		rc = call_PQgetCopyData(shard_no, &resp_buff.data);
 		if (rc >= 0)
 		{
 			resp_buff.len = rc;
@@ -416,33 +608,33 @@ pageserver_receive(void)
 			{
 				char	   *msg = nm_to_string((NeonMessage *) resp);

-				neon_log(PageStoreTrace, "got response: %s", msg);
+				neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg);
 				pfree(msg);
 			}
 		}
 		else if (rc == -1)
 		{
-			neon_log(LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
-			pageserver_disconnect();
+			neon_shard_log(shard_no, LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
+			pageserver_disconnect(shard_no);
 			resp = NULL;
 		}
 		else if (rc == -2)
 		{
 			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));

-			pageserver_disconnect();
-			neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
+			pageserver_disconnect(shard_no);
+			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
 		}
 		else
 		{
-			pageserver_disconnect();
-			neon_log(ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
+			pageserver_disconnect(shard_no);
+			neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
 		}
 	}
 	PG_CATCH();
 	{
-		neon_log(LOG, "pageserver_receive disconnect due to caught exception");
-		pageserver_disconnect();
+		neon_shard_log(shard_no, LOG, "pageserver_receive disconnect due to caught exception");
+		pageserver_disconnect(shard_no);
 		PG_RE_THROW();
 	}
 	PG_END_TRY();
@@ -452,11 +644,13 @@ pageserver_receive(void)


 static bool
-pageserver_flush(void)
+pageserver_flush(shardno_t shard_no)
 {
-	if (!connected)
+	PGconn	   *pageserver_conn = page_servers[shard_no].conn;
+
+	if (!pageserver_conn)
 	{
-		neon_log(WARNING, "Tried to flush while disconnected");
+		neon_shard_log(shard_no, WARNING, "Tried to flush while disconnected");
 	}
 	else
 	{
@@ -464,8 +658,8 @@ pageserver_flush(void)
 		{
 			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));

-			pageserver_disconnect();
-			neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
+			pageserver_disconnect(shard_no);
+			neon_shard_log(shard_no, LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
 			pfree(msg);
 			return false;
 		}
@@ -505,8 +699,9 @@ PagestoreShmemInit(void)
 									   &found);
 	if (!found)
 	{
-		pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
-		pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
+		pg_atomic_init_u64(&pagestore_shared->begin_update_counter, 0);
+		pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0);
+		memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
 		AssignPageserverConnstring(page_server_connstring, NULL);
 	}
 	LWLockRelease(AddinShmemInitLock);
@@ -531,7 +726,6 @@ pagestore_shmem_request(void)
 #endif

 	RequestAddinShmemSpace(PagestoreShmemSize());
-	RequestNamedLWLockTranche("neon_libpagestore", 1);
 }

 static void
@@ -582,6 +776,15 @@ pg_init_libpagestore(void)
 							   0,	/* no flags required */
 							   check_neon_id, NULL, NULL);

+	DefineCustomIntVariable("neon.stripe_size",
+							"sharding stripe size",
+							NULL,
+							&stripe_size,
+							32768, 1, INT_MAX,
+							PGC_SIGHUP,
+							GUC_UNIT_BLOCKS,
+							NULL, NULL, NULL);
+
 	DefineCustomIntVariable("neon.max_cluster_size",
 							"cluster size limit",
 							NULL,
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -20,9 +20,13 @@
 #include "lib/stringinfo.h"
 #include "libpq/pqformat.h"
 #include "storage/block.h"
+#include "storage/buf_internals.h"
 #include "storage/smgr.h"
 #include "utils/memutils.h"

+#define MAX_SHARDS 128
+#define MAX_PAGESERVER_CONNSTRING_SIZE 256
+
 typedef enum
 {
 	/* pagestore_client -> pagestore */
@@ -51,6 +55,9 @@ typedef struct
 #define neon_log(tag, fmt, ...) ereport(tag,                                  \
 										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
 										 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
+#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag,	\
+														(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
+														 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))

 /*
 * supertype of all the Neon*Request structs below
@@ -141,11 +148,13 @@ extern char *nm_to_string(NeonMessage *msg);
 * API
 */

+typedef unsigned shardno_t;
+
 typedef struct
 {
-	bool		(*send) (NeonRequest *request);
-	NeonResponse *(*receive) (void);
-	bool		(*flush) (void);
+	bool		(*send) (shardno_t  shard_no, NeonRequest * request);
+	NeonResponse *(*receive) (shardno_t shard_no);
+	bool		(*flush) (shardno_t shard_no);
 } page_server_api;

 extern void prefetch_on_ps_disconnect(void);
@@ -159,6 +168,8 @@ extern char *neon_timeline;
 extern char *neon_tenant;
 extern int32 max_cluster_size;

+extern shardno_t get_shard_number(BufferTag* tag);
+
 extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo);
 extern void smgr_init_neon(void);
 extern void readahead_buffer_resize(int newsize, void *extra);
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -172,6 +172,7 @@ typedef struct PrefetchRequest
 	XLogRecPtr	actual_request_lsn;
 	NeonResponse *response;		/* may be null */
 	PrefetchStatus status;
+	shardno_t   shard_no;
 	uint64		my_ring_index;
 } PrefetchRequest;

@@ -239,10 +240,17 @@ typedef struct PrefetchState
 								 * also unused */

 	/* the buffers */
-	prfh_hash  *prf_hash;
+	prfh_hash	*prf_hash;
+	int			max_shard_no;
+	/* Mark shards involved in prefetch */
+	uint8		shard_bitmap[(MAX_SHARDS + 7)/8];
 	PrefetchRequest prf_buffer[];	/* prefetch buffers */
 } PrefetchState;

+#define BITMAP_ISSET(bm, bit) ((bm)[(bit) >> 3] & (1 << ((bit) & 7)))
+#define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7))
+#define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7))
+
 static PrefetchState *MyPState;

 #define GetPrfSlot(ring_index) ( \
@@ -327,6 +335,7 @@ compact_prefetch_buffers(void)
 		Assert(target_slot->status == PRFS_UNUSED);

 		target_slot->buftag = source_slot->buftag;
+		target_slot->shard_no = source_slot->shard_no;
 		target_slot->status = source_slot->status;
 		target_slot->response = source_slot->response;
 		target_slot->effective_request_lsn = source_slot->effective_request_lsn;
@@ -494,6 +503,23 @@ prefetch_cleanup_trailing_unused(void)
 	}
 }

+
+static bool
+prefetch_flush_requests(void)
+{
+	for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++)
+	{
+		if (BITMAP_ISSET(MyPState->shard_bitmap, shard_no))
+		{
+			if (!page_server->flush(shard_no))
+				return false;
+			BITMAP_CLR(MyPState->shard_bitmap, shard_no);
+		}
+	}
+	MyPState->max_shard_no = 0;
+	return true;
+}
+
 /*
 * Wait for slot of ring_index to have received its response.
 * The caller is responsible for making sure the request buffer is flushed.
@@ -509,7 +535,7 @@ prefetch_wait_for(uint64 ring_index)
 	if (MyPState->ring_flush <= ring_index &&
 		MyPState->ring_unused > MyPState->ring_flush)
 	{
-		if (!page_server->flush())
+		if (!prefetch_flush_requests())
 			return false;
 		MyPState->ring_flush = MyPState->ring_unused;
 	}
@@ -547,7 +573,7 @@ prefetch_read(PrefetchRequest *slot)
 	Assert(slot->my_ring_index == MyPState->ring_receive);

 	old = MemoryContextSwitchTo(MyPState->errctx);
-	response = (NeonResponse *) page_server->receive();
+	response = (NeonResponse *) page_server->receive(slot->shard_no);
 	MemoryContextSwitchTo(old);
 	if (response)
 	{
@@ -704,12 +730,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_unused);

-	while (!page_server->send((NeonRequest *) &request));
+	while (!page_server->send(slot->shard_no, (NeonRequest *) &request));

 	/* update prefetch state */
 	MyPState->n_requests_inflight += 1;
 	MyPState->n_unused -= 1;
 	MyPState->ring_unused += 1;
+	BITMAP_SET(MyPState->shard_bitmap, slot->shard_no);
+	MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no);

 	/* update slot state */
 	slot->status = PRFS_REQUESTED;
@@ -880,6 +908,7 @@ Retry:
 	 * function reads the buffer tag from the slot.
 	 */
 	slot->buftag = tag;
+	slot->shard_no = get_shard_number(&tag);
 	slot->my_ring_index = ring_index;

 	prefetch_do_request(slot, force_latest, force_lsn);
@@ -890,7 +919,7 @@ Retry:
 	if (flush_every_n_requests > 0 &&
 		MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
 	{
-		if (!page_server->flush())
+		if (!prefetch_flush_requests())
 		{
 			/*
 			 * Prefetch set is reset in case of error, so we should try to
@@ -908,13 +937,44 @@ static NeonResponse *
 page_server_request(void const *req)
 {
 	NeonResponse *resp;
+	BufferTag tag = {0};
+	shardno_t shard_no;
+
+	switch (((NeonRequest *) req)->tag)
+	{
+		case T_NeonExistsRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo);
+			break;
+		case T_NeonNblocksRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo);
+			break;
+		case T_NeonDbSizeRequest:
+			NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode;
+			break;
+		case T_NeonGetPageRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo);
+			tag.blockNum = ((NeonGetPageRequest *) req)->blkno;
+			break;
+		default:
+			neon_log(ERROR, "Unexpected request tag: %d", ((NeonRequest *) req)->tag);
+	}
+	shard_no = get_shard_number(&tag);
+
+
+	/*
+	 * Current sharding model assumes that all metadata is present only at shard 0.
+	 * We still need to call get_shard_number() to check if shard map is up-to-date.
+	 */
+	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
+	{
+		shard_no = 0;
+	}

 	do
 	{
-		while (!page_server->send((NeonRequest *) req) || !page_server->flush());
-		MyPState->ring_flush = MyPState->ring_unused;
+		while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no));
 		consume_prefetch_responses();
-		resp = page_server->receive();
+		resp = page_server->receive(shard_no);
 	} while (resp == NULL);
 	return resp;

@@ -2098,8 +2158,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		case T_NeonErrorResponse:
 			ereport(ERROR,
 					(errcode(ERRCODE_IO_ERROR),
-					 errmsg(NEON_TAG "could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
-							blkno,
+					 errmsg(NEON_TAG "[shard %d] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
+							slot->shard_no, blkno,
 							RelFileInfoFmt(rinfo),
 							forkNum,
 							(uint32) (request_lsn >> 32), (uint32) request_lsn),
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -4,7 +4,9 @@ pub mod backend;
 pub use backend::BackendType;

 mod credentials;
-pub use credentials::{check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint};
+pub use credentials::{
+    check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, IpPattern,
+};

 mod password_hack;
 pub use password_hack::parse_endpoint_param;
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -3,7 +3,6 @@ mod hacks;
 mod link;

 pub use link::LinkAuthError;
-use smol_str::SmolStr;
 use tokio_postgres::config::AuthKeys;

 use crate::auth::credentials::check_peer_addr_is_in_list;
@@ -16,7 +15,6 @@ use crate::context::RequestMonitoring;
 use crate::proxy::connect_compute::handle_try_wake;
 use crate::proxy::retry::retry_after;
 use crate::proxy::NeonOptions;
-use crate::scram;
 use crate::stream::Stream;
 use crate::{
    auth::{self, ComputeUserInfoMaybeEndpoint},
@@ -28,6 +26,7 @@ use crate::{
    },
    stream, url,
 };
+use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
 use futures::TryFutureExt;
 use std::borrow::Cow;
 use std::ops::ControlFlow;
@@ -35,6 +34,8 @@ use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{error, info, warn};

+use super::IpPattern;
+
 /// This type serves two purposes:
 ///
 /// * When `T` is `()`, it's just a regular auth backend selector
@@ -55,7 +56,7 @@ pub enum BackendType<'a, T> {

 pub trait TestBackend: Send + Sync + 'static {
    fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
-    fn get_allowed_ips(&self) -> Result<Vec<SmolStr>, console::errors::GetAuthInfoError>;
+    fn get_allowed_ips(&self) -> Result<Vec<IpPattern>, console::errors::GetAuthInfoError>;
 }

 impl std::fmt::Display for BackendType<'_, ()> {
@@ -128,19 +129,19 @@ pub struct ComputeCredentials<T> {

 #[derive(Debug, Clone)]
 pub struct ComputeUserInfoNoEndpoint {
-    pub user: SmolStr,
+    pub user: RoleName,
    pub options: NeonOptions,
 }

 #[derive(Debug, Clone)]
 pub struct ComputeUserInfo {
-    pub endpoint: SmolStr,
-    pub user: SmolStr,
+    pub endpoint: EndpointId,
+    pub user: RoleName,
    pub options: NeonOptions,
 }

 impl ComputeUserInfo {
-    pub fn endpoint_cache_key(&self) -> SmolStr {
+    pub fn endpoint_cache_key(&self) -> EndpointCacheKey {
        self.options.get_cache_key(&self.endpoint)
    }
 }
@@ -156,7 +157,7 @@ impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
    type Error = ComputeUserInfoNoEndpoint;

    fn try_from(user_info: ComputeUserInfoMaybeEndpoint) -> Result<Self, Self::Error> {
-        match user_info.project {
+        match user_info.endpoint_id {
            None => Err(ComputeUserInfoNoEndpoint {
                user: user_info.user,
                options: user_info.options,
@@ -202,21 +203,18 @@ async fn auth_quirks(
    if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
        return Err(auth::AuthError::ip_address_not_allowed());
    }
-    let maybe_secret = api.get_role_secret(ctx, &info).await?;
+    let cached_secret = api.get_role_secret(ctx, &info).await?;

-    let cached_secret = maybe_secret.unwrap_or_else(|| {
+    let secret = cached_secret.value.clone().unwrap_or_else(|| {
        // If we don't have an authentication secret, we mock one to
        // prevent malicious probing (possible due to missing protocol steps).
        // This mocked secret will never lead to successful authentication.
        info!("authentication info not found, mocking it");
-        Cached::new_uncached(AuthSecret::Scram(scram::ServerSecret::mock(
-            &info.user,
-            rand::random(),
-        )))
+        AuthSecret::Scram(scram::ServerSecret::mock(&info.user, rand::random()))
    });
    match authenticate_with_secret(
        ctx,
-        cached_secret.value.clone(),
+        secret,
        info,
        client,
        unauthenticated_password,
@@ -318,11 +316,11 @@ async fn auth_and_wake_compute(

 impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
    /// Get compute endpoint name from the credentials.
-    pub fn get_endpoint(&self) -> Option<SmolStr> {
+    pub fn get_endpoint(&self) -> Option<EndpointId> {
        use BackendType::*;

        match self {
-            Console(_, user_info) => user_info.project.clone(),
+            Console(_, user_info) => user_info.endpoint_id.clone(),
            Link(_) => Some("link".into()),
            #[cfg(test)]
            Test(_) => Some("test".into()),
@@ -356,7 +354,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
            Console(api, user_info) => {
                info!(
                    user = &*user_info.user,
-                    project = user_info.project(),
+                    project = user_info.endpoint(),
                    "performing authentication using the console"
                );

--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -2,12 +2,12 @@

 use crate::{
    auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError,
-    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions,
+    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, EndpointId, RoleName,
 };
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
 use smol_str::SmolStr;
-use std::{collections::HashSet, net::IpAddr};
+use std::{collections::HashSet, net::IpAddr, str::FromStr};
 use thiserror::Error;
 use tracing::{info, warn};

@@ -21,7 +21,10 @@ pub enum ComputeUserInfoParseError {
         SNI ('{}') and project option ('{}').",
        .domain, .option,
    )]
-    InconsistentProjectNames { domain: SmolStr, option: SmolStr },
+    InconsistentProjectNames {
+        domain: EndpointId,
+        option: EndpointId,
+    },

    #[error(
        "Common name inferred from SNI ('{}') is not known",
@@ -30,7 +33,7 @@ pub enum ComputeUserInfoParseError {
    UnknownCommonName { cn: String },

    #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
-    MalformedProjectName(SmolStr),
+    MalformedProjectName(EndpointId),
 }

 impl UserFacingError for ComputeUserInfoParseError {}
@@ -39,17 +42,15 @@ impl UserFacingError for ComputeUserInfoParseError {}
 /// Note that we don't store any kind of client key or password here.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct ComputeUserInfoMaybeEndpoint {
-    pub user: SmolStr,
-    // TODO: this is a severe misnomer! We should think of a new name ASAP.
-    pub project: Option<SmolStr>,
-
+    pub user: RoleName,
+    pub endpoint_id: Option<EndpointId>,
    pub options: NeonOptions,
 }

 impl ComputeUserInfoMaybeEndpoint {
    #[inline]
-    pub fn project(&self) -> Option<&str> {
-        self.project.as_deref()
+    pub fn endpoint(&self) -> Option<&str> {
+        self.endpoint_id.as_deref()
    }
 }

@@ -79,15 +80,15 @@ impl ComputeUserInfoMaybeEndpoint {

        // Some parameters are stored in the startup message.
        let get_param = |key| params.get(key).ok_or(MissingKey(key));
-        let user: SmolStr = get_param("user")?.into();
+        let user: RoleName = get_param("user")?.into();

        // record the values if we have them
        ctx.set_application(params.get("application_name").map(SmolStr::from));
        ctx.set_user(user.clone());
-        ctx.set_endpoint_id(sni.map(SmolStr::from));
+        ctx.set_endpoint_id(sni.map(EndpointId::from));

        // Project name might be passed via PG's command-line options.
-        let project_option = params
+        let endpoint_option = params
            .options_raw()
            .and_then(|options| {
                // We support both `project` (deprecated) and `endpoint` options for backward compatibility.
@@ -100,9 +101,9 @@ impl ComputeUserInfoMaybeEndpoint {
            })
            .map(|name| name.into());

-        let project_from_domain = if let Some(sni_str) = sni {
+        let endpoint_from_domain = if let Some(sni_str) = sni {
            if let Some(cn) = common_names {
-                Some(SmolStr::from(endpoint_sni(sni_str, cn)?))
+                Some(EndpointId::from(endpoint_sni(sni_str, cn)?))
            } else {
                None
            }
@@ -110,7 +111,7 @@ impl ComputeUserInfoMaybeEndpoint {
            None
        };

-        let project = match (project_option, project_from_domain) {
+        let endpoint = match (endpoint_option, endpoint_from_domain) {
            // Invariant: if we have both project name variants, they should match.
            (Some(option), Some(domain)) if option != domain => {
                Some(Err(InconsistentProjectNames { domain, option }))
@@ -123,13 +124,13 @@ impl ComputeUserInfoMaybeEndpoint {
        }
        .transpose()?;

-        info!(%user, project = project.as_deref(), "credentials");
+        info!(%user, project = endpoint.as_deref(), "credentials");
        if sni.is_some() {
            info!("Connection with sni");
            NUM_CONNECTION_ACCEPTED_BY_SNI
                .with_label_values(&["sni"])
                .inc();
-        } else if project.is_some() {
+        } else if endpoint.is_some() {
            NUM_CONNECTION_ACCEPTED_BY_SNI
                .with_label_values(&["no_sni"])
                .inc();
@@ -145,36 +146,57 @@ impl ComputeUserInfoMaybeEndpoint {

        Ok(Self {
            user,
-            project,
+            endpoint_id: endpoint.map(EndpointId::from),
            options,
        })
    }
 }

-pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &Vec<SmolStr>) -> bool {
-    if ip_list.is_empty() {
-        return true;
-    }
-    for ip in ip_list {
-        // We expect that all ip addresses from control plane are correct.
-        // However, if some of them are broken, we still can check the others.
-        match parse_ip_pattern(ip) {
-            Ok(pattern) => {
-                if check_ip(peer_addr, &pattern) {
-                    return true;
-                }
-            }
-            Err(err) => warn!("Cannot parse ip: {}; err: {}", ip, err),
-        }
-    }
-    false
+/// Decides whether `peer_addr` passes the allow-list `ip_list`.
+///
+/// An empty list admits every address; otherwise the address must match at
+/// least one pattern according to `check_ip`.
+pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &[IpPattern]) -> bool {
+    match ip_list {
+        [] => true,
+        patterns => patterns.iter().any(|pattern| check_ip(peer_addr, pattern)),
+    }
 }

 #[derive(Debug, Clone, Eq, PartialEq)]
-enum IpPattern {
+pub enum IpPattern {
+    /// Matches every address contained in the network.
    Subnet(ipnet::IpNet),
+    /// Matches every address between the two bounds, both inclusive.
    Range(IpAddr, IpAddr),
+    /// Matches exactly this address.
    Single(IpAddr),
+    /// Stand-in for an entry that could not be parsed during deserialization;
+    /// it never matches any address.
+    None,
+}
+
+/// Deserializes a single allow-list entry from a string.
+///
+/// Deserialization is deliberately lenient: a string that `parse_ip_pattern`
+/// rejects is logged with `warn!` and mapped to [`IpPattern::None`] instead of
+/// producing an error, so one malformed entry does not fail the whole payload.
+impl<'de> serde::de::Deserialize<'de> for IpPattern {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct StrVisitor;
+        impl<'de> serde::de::Visitor<'de> for StrVisitor {
+            type Value = IpPattern;
+
+            // Completes serde's "invalid type: ..., expected <this>" message.
+            // Each visited value is ONE pattern string, not a comma separated list.
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                formatter.write_str(
+                    "a string with an ip address, ip address range, or ip address subnet mask",
+                )
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                // Never fail here: degrade an unparsable entry to a pattern that
+                // matches nothing and leave a trace in the logs.
+                Ok(parse_ip_pattern(v).unwrap_or_else(|e| {
+                    warn!("Cannot parse ip pattern {v}: {e}");
+                    IpPattern::None
+                }))
+            }
+        }
+        deserializer.deserialize_str(StrVisitor)
+    }
+}
+
+/// Strict string parsing: delegates to `parse_ip_pattern` and returns its
+/// error unchanged when the input is malformed.
+impl FromStr for IpPattern {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        parse_ip_pattern(s)
+    }
 }

 fn parse_ip_pattern(pattern: &str) -> anyhow::Result<IpPattern> {
@@ -196,6 +218,7 @@ fn check_ip(ip: &IpAddr, pattern: &IpPattern) -> bool {
        IpPattern::Subnet(subnet) => subnet.contains(ip),
        IpPattern::Range(start, end) => start <= ip && ip <= end,
        IpPattern::Single(addr) => addr == ip,
+        IpPattern::None => false,
    }
 }

@@ -206,6 +229,7 @@ fn project_name_valid(name: &str) -> bool {
 #[cfg(test)]
 mod tests {
    use super::*;
+    use serde_json::json;
    use ComputeUserInfoParseError::*;

    #[test]
@@ -215,7 +239,7 @@ mod tests {
        let mut ctx = RequestMonitoring::test();
        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project, None);
+        assert_eq!(user_info.endpoint_id, None);

        Ok(())
    }
@@ -230,7 +254,7 @@ mod tests {
        let mut ctx = RequestMonitoring::test();
        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project, None);
+        assert_eq!(user_info.endpoint_id, None);

        Ok(())
    }
@@ -246,7 +270,7 @@ mod tests {
        let user_info =
            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
        assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("foo"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("foo"));
        assert_eq!(user_info.options.get_cache_key("foo"), "foo");

        Ok(())
@@ -262,7 +286,7 @@ mod tests {
        let mut ctx = RequestMonitoring::test();
        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("bar"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));

        Ok(())
    }
@@ -277,7 +301,7 @@ mod tests {
        let mut ctx = RequestMonitoring::test();
        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("bar"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("bar"));

        Ok(())
    }
@@ -295,7 +319,7 @@ mod tests {
        let mut ctx = RequestMonitoring::test();
        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
-        assert!(user_info.project.is_none());
+        assert!(user_info.endpoint_id.is_none());

        Ok(())
    }
@@ -310,7 +334,7 @@ mod tests {
        let mut ctx = RequestMonitoring::test();
        let user_info = ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, None, None)?;
        assert_eq!(user_info.user, "john_doe");
-        assert!(user_info.project.is_none());
+        assert!(user_info.endpoint_id.is_none());

        Ok(())
    }
@@ -326,7 +350,7 @@ mod tests {
        let user_info =
            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
        assert_eq!(user_info.user, "john_doe");
-        assert_eq!(user_info.project.as_deref(), Some("baz"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("baz"));

        Ok(())
    }
@@ -340,14 +364,14 @@ mod tests {
        let mut ctx = RequestMonitoring::test();
        let user_info =
            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
-        assert_eq!(user_info.project.as_deref(), Some("p1"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));

        let common_names = Some(["a.com".into(), "b.com".into()].into());
        let sni = Some("p1.b.com");
        let mut ctx = RequestMonitoring::test();
        let user_info =
            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
-        assert_eq!(user_info.project.as_deref(), Some("p1"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("p1"));

        Ok(())
    }
@@ -404,7 +428,7 @@ mod tests {
        let mut ctx = RequestMonitoring::test();
        let user_info =
            ComputeUserInfoMaybeEndpoint::parse(&mut ctx, &options, sni, common_names.as_ref())?;
-        assert_eq!(user_info.project.as_deref(), Some("project"));
+        assert_eq!(user_info.endpoint_id.as_deref(), Some("project"));
        assert_eq!(
            user_info.options.get_cache_key("project"),
            "project endpoint_type:read_write lsn:0/2"
@@ -415,21 +439,17 @@ mod tests {

    #[test]
    fn test_check_peer_addr_is_in_list() {
-        let peer_addr = IpAddr::from([127, 0, 0, 1]);
-        assert!(check_peer_addr_is_in_list(&peer_addr, &vec![]));
-        assert!(check_peer_addr_is_in_list(
-            &peer_addr,
-            &vec!["127.0.0.1".into()]
-        ));
-        assert!(!check_peer_addr_is_in_list(
-            &peer_addr,
-            &vec!["8.8.8.8".into()]
-        ));
+        // Builds the list by deserializing `v` as `Vec<IpPattern>`, so malformed
+        // entries become `IpPattern::None` instead of aborting, then tests
+        // 127.0.0.1 against the result.
+        fn check(v: serde_json::Value) -> bool {
+            let peer_addr = IpAddr::from([127, 0, 0, 1]);
+            let ip_list: Vec<IpPattern> = serde_json::from_value(v).unwrap();
+            check_peer_addr_is_in_list(&peer_addr, &ip_list)
+        }
+
+        assert!(check(json!([])));
+        assert!(check(json!(["127.0.0.1"])));
+        assert!(!check(json!(["8.8.8.8"])));
        // If there is an incorrect address, it will be skipped.
-        assert!(check_peer_addr_is_in_list(
-            &peer_addr,
-            &vec!["88.8.8".into(), "127.0.0.1".into()]
-        ));
+        assert!(check(json!(["88.8.8", "127.0.0.1"])));
    }
    #[test]
    fn test_parse_ip_v4() -> anyhow::Result<()> {
--- a/proxy/src/auth/password_hack.rs
+++ b/proxy/src/auth/password_hack.rs
@@ -4,10 +4,11 @@
 //! UPDATE (Mon Aug  8 13:20:34 UTC 2022): the payload format has been simplified.

 use bstr::ByteSlice;
-use smol_str::SmolStr;
+
+use crate::EndpointId;

 pub struct PasswordHackPayload {
-    pub endpoint: SmolStr,
+    pub endpoint: EndpointId,
    pub password: Vec<u8>,
 }

--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -11,13 +11,16 @@ use smol_str::SmolStr;
 use tokio::time::Instant;
 use tracing::{debug, info};

-use crate::{config::ProjectInfoCacheOptions, console::AuthSecret};
+use crate::{
+    auth::IpPattern, config::ProjectInfoCacheOptions, console::AuthSecret, EndpointId, ProjectId,
+    RoleName,
+};

 use super::{Cache, Cached};

 pub trait ProjectInfoCache {
-    fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr);
-    fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr);
+    fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId);
+    fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName);
    fn enable_ttl(&self);
    fn disable_ttl(&self);
 }
@@ -44,8 +47,8 @@ impl<T> From<T> for Entry<T> {

 #[derive(Default)]
 struct EndpointInfo {
-    secret: std::collections::HashMap<SmolStr, Entry<AuthSecret>>,
-    allowed_ips: Option<Entry<Arc<Vec<SmolStr>>>>,
+    secret: std::collections::HashMap<RoleName, Entry<Option<AuthSecret>>>,
+    allowed_ips: Option<Entry<Arc<Vec<IpPattern>>>>,
 }

 impl EndpointInfo {
@@ -57,10 +60,10 @@ impl EndpointInfo {
    }
    pub fn get_role_secret(
        &self,
-        role_name: &SmolStr,
+        role_name: &RoleName,
        valid_since: Instant,
        ignore_cache_since: Option<Instant>,
-    ) -> Option<(AuthSecret, bool)> {
+    ) -> Option<(Option<AuthSecret>, bool)> {
        if let Some(secret) = self.secret.get(role_name) {
            if valid_since < secret.created_at {
                return Some((
@@ -76,7 +79,7 @@ impl EndpointInfo {
        &self,
        valid_since: Instant,
        ignore_cache_since: Option<Instant>,
-    ) -> Option<(Arc<Vec<SmolStr>>, bool)> {
+    ) -> Option<(Arc<Vec<IpPattern>>, bool)> {
        if let Some(allowed_ips) = &self.allowed_ips {
            if valid_since < allowed_ips.created_at {
                return Some((
@@ -90,7 +93,7 @@ impl EndpointInfo {
    pub fn invalidate_allowed_ips(&mut self) {
        self.allowed_ips = None;
    }
-    pub fn invalidate_role_secret(&mut self, role_name: &SmolStr) {
+    pub fn invalidate_role_secret(&mut self, role_name: &RoleName) {
        self.secret.remove(role_name);
    }
 }
@@ -103,9 +106,9 @@ impl EndpointInfo {
 /// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available?
 /// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache.
 pub struct ProjectInfoCacheImpl {
-    cache: DashMap<SmolStr, EndpointInfo>,
+    cache: DashMap<EndpointId, EndpointInfo>,

-    project2ep: DashMap<SmolStr, HashSet<SmolStr>>,
+    project2ep: DashMap<ProjectId, HashSet<EndpointId>>,
    config: ProjectInfoCacheOptions,

    start_time: Instant,
@@ -113,7 +116,7 @@ pub struct ProjectInfoCacheImpl {
 }

 impl ProjectInfoCache for ProjectInfoCacheImpl {
-    fn invalidate_allowed_ips_for_project(&self, project_id: &SmolStr) {
+    fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId) {
        info!("invalidating allowed ips for project `{}`", project_id);
        let endpoints = self
            .project2ep
@@ -126,7 +129,7 @@ impl ProjectInfoCache for ProjectInfoCacheImpl {
            }
        }
    }
-    fn invalidate_role_secret_for_project(&self, project_id: &SmolStr, role_name: &SmolStr) {
+    fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName) {
        info!(
            "invalidating role secret for project_id `{}` and role_name `{}`",
            project_id, role_name
@@ -167,9 +170,9 @@ impl ProjectInfoCacheImpl {

    pub fn get_role_secret(
        &self,
-        endpoint_id: &SmolStr,
-        role_name: &SmolStr,
-    ) -> Option<Cached<&Self, AuthSecret>> {
+        endpoint_id: &EndpointId,
+        role_name: &RoleName,
+    ) -> Option<Cached<&Self, Option<AuthSecret>>> {
        let (valid_since, ignore_cache_since) = self.get_cache_times();
        let endpoint_info = self.cache.get(endpoint_id)?;
        let (value, ignore_cache) =
@@ -188,8 +191,8 @@ impl ProjectInfoCacheImpl {
    }
    pub fn get_allowed_ips(
        &self,
-        endpoint_id: &SmolStr,
-    ) -> Option<Cached<&Self, Arc<Vec<SmolStr>>>> {
+        endpoint_id: &EndpointId,
+    ) -> Option<Cached<&Self, Arc<Vec<IpPattern>>>> {
        let (valid_since, ignore_cache_since) = self.get_cache_times();
        let endpoint_info = self.cache.get(endpoint_id)?;
        let value = endpoint_info.get_allowed_ips(valid_since, ignore_cache_since);
@@ -205,10 +208,10 @@ impl ProjectInfoCacheImpl {
    }
    pub fn insert_role_secret(
        &self,
-        project_id: &SmolStr,
-        endpoint_id: &SmolStr,
-        role_name: &SmolStr,
-        secret: AuthSecret,
+        project_id: &ProjectId,
+        endpoint_id: &EndpointId,
+        role_name: &RoleName,
+        secret: Option<AuthSecret>,
    ) {
        if self.cache.len() >= self.config.size {
            // If there are too many entries, wait until the next gc cycle.
@@ -222,9 +225,9 @@ impl ProjectInfoCacheImpl {
    }
    pub fn insert_allowed_ips(
        &self,
-        project_id: &SmolStr,
-        endpoint_id: &SmolStr,
-        allowed_ips: Arc<Vec<SmolStr>>,
+        project_id: &ProjectId,
+        endpoint_id: &EndpointId,
+        allowed_ips: Arc<Vec<IpPattern>>,
    ) {
        if self.cache.len() >= self.config.size {
            // If there are too many entries, wait until the next gc cycle.
@@ -236,7 +239,7 @@ impl ProjectInfoCacheImpl {
            .or_default()
            .allowed_ips = Some(allowed_ips.into());
    }
-    fn inser_project2endpoint(&self, project_id: &SmolStr, endpoint_id: &SmolStr) {
+    fn inser_project2endpoint(&self, project_id: &ProjectId, endpoint_id: &EndpointId) {
        if let Some(mut endpoints) = self.project2ep.get_mut(project_id) {
            endpoints.insert(endpoint_id.clone());
        } else {
@@ -297,18 +300,18 @@ impl ProjectInfoCacheImpl {
 /// This is used to invalidate cache entries.
 pub struct CachedLookupInfo {
    /// Search by this key.
-    endpoint_id: SmolStr,
+    endpoint_id: EndpointId,
    lookup_type: LookupType,
 }

 impl CachedLookupInfo {
-    pub(self) fn new_role_secret(endpoint_id: SmolStr, role_name: SmolStr) -> Self {
+    pub(self) fn new_role_secret(endpoint_id: EndpointId, role_name: RoleName) -> Self {
        Self {
            endpoint_id,
            lookup_type: LookupType::RoleSecret(role_name),
        }
    }
-    pub(self) fn new_allowed_ips(endpoint_id: SmolStr) -> Self {
+    pub(self) fn new_allowed_ips(endpoint_id: EndpointId) -> Self {
        Self {
            endpoint_id,
            lookup_type: LookupType::AllowedIps,
@@ -317,7 +320,7 @@ impl CachedLookupInfo {
 }

 enum LookupType {
-    RoleSecret(SmolStr),
+    RoleSecret(RoleName),
    AllowedIps,
 }

@@ -348,7 +351,6 @@ impl Cache for ProjectInfoCacheImpl {
 mod tests {
    use super::*;
    use crate::{console::AuthSecret, scram::ServerSecret};
-    use smol_str::SmolStr;
    use std::{sync::Arc, time::Duration};

    #[tokio::test]
@@ -362,11 +364,17 @@ mod tests {
        });
        let project_id = "project".into();
        let endpoint_id = "endpoint".into();
-        let user1: SmolStr = "user1".into();
-        let user2: SmolStr = "user2".into();
-        let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32]));
-        let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32]));
-        let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
+        let user1: RoleName = "user1".into();
+        let user2: RoleName = "user2".into();
+        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user1.as_str(),
+            [1; 32],
+        )));
+        let secret2 = None;
+        let allowed_ips = Arc::new(vec![
+            "127.0.0.1".parse().unwrap(),
+            "127.0.0.2".parse().unwrap(),
+        ]);
        cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
        cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
        cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
@@ -379,8 +387,11 @@ mod tests {
        assert_eq!(cached.value, secret2);

        // Shouldn't add more than 2 roles.
-        let user3: SmolStr = "user3".into();
-        let secret3 = AuthSecret::Scram(ServerSecret::mock(user3.as_str(), [3; 32]));
+        let user3: RoleName = "user3".into();
+        let secret3 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user3.as_str(),
+            [3; 32],
+        )));
        cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone());
        assert!(cache.get_role_secret(&endpoint_id, &user3).is_none());

@@ -411,11 +422,20 @@ mod tests {

        let project_id = "project".into();
        let endpoint_id = "endpoint".into();
-        let user1: SmolStr = "user1".into();
-        let user2: SmolStr = "user2".into();
-        let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32]));
-        let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32]));
-        let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
+        let user1: RoleName = "user1".into();
+        let user2: RoleName = "user2".into();
+        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user1.as_str(),
+            [1; 32],
+        )));
+        let secret2 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user2.as_str(),
+            [2; 32],
+        )));
+        let allowed_ips = Arc::new(vec![
+            "127.0.0.1".parse().unwrap(),
+            "127.0.0.2".parse().unwrap(),
+        ]);
        cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
        cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone());
        cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone());
@@ -457,11 +477,20 @@ mod tests {

        let project_id = "project".into();
        let endpoint_id = "endpoint".into();
-        let user1: SmolStr = "user1".into();
-        let user2: SmolStr = "user2".into();
-        let secret1 = AuthSecret::Scram(ServerSecret::mock(user1.as_str(), [1; 32]));
-        let secret2 = AuthSecret::Scram(ServerSecret::mock(user2.as_str(), [2; 32]));
-        let allowed_ips = Arc::new(vec!["allowed_ip1".into(), "allowed_ip2".into()]);
+        let user1: RoleName = "user1".into();
+        let user2: RoleName = "user2".into();
+        let secret1 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user1.as_str(),
+            [1; 32],
+        )));
+        let secret2 = Some(AuthSecret::Scram(ServerSecret::mock(
+            user2.as_str(),
+            [2; 32],
+        )));
+        let allowed_ips = Arc::new(vec![
+            "127.0.0.1".parse().unwrap(),
+            "127.0.0.2".parse().unwrap(),
+        ]);
        cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone());
        cache.clone().disable_ttl();
        tokio::time::advance(Duration::from_millis(100)).await;
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -1,7 +1,10 @@
 use serde::Deserialize;
-use smol_str::SmolStr;
 use std::fmt;

+use crate::auth::IpPattern;
+
+use crate::{BranchId, EndpointId, ProjectId};
+
 /// Generic error response with human-readable description.
 /// Note that we can't always present it to user as is.
 #[derive(Debug, Deserialize)]
@@ -14,8 +17,8 @@ pub struct ConsoleError {
 #[derive(Deserialize)]
 pub struct GetRoleSecret {
    pub role_secret: Box<str>,
-    pub allowed_ips: Option<Vec<Box<str>>>,
-    pub project_id: Option<Box<str>>,
+    pub allowed_ips: Option<Vec<IpPattern>>,
+    pub project_id: Option<ProjectId>,
 }

 // Manually implement debug to omit sensitive info.
@@ -92,9 +95,9 @@ impl fmt::Debug for DatabaseInfo {
 /// Also known as `ProxyMetricsAuxInfo` in the console.
 #[derive(Debug, Deserialize, Clone, Default)]
 pub struct MetricsAuxInfo {
-    pub endpoint_id: SmolStr,
-    pub project_id: SmolStr,
-    pub branch_id: SmolStr,
+    pub endpoint_id: EndpointId,
+    pub project_id: ProjectId,
+    pub branch_id: BranchId,
 }

 impl MetricsAuxInfo {
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -4,16 +4,15 @@ pub mod neon;

 use super::messages::MetricsAuxInfo;
 use crate::{
-    auth::backend::ComputeUserInfo,
+    auth::{backend::ComputeUserInfo, IpPattern},
    cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru},
    compute,
    config::{CacheOptions, ProjectInfoCacheOptions},
    context::RequestMonitoring,
-    scram,
+    scram, EndpointCacheKey, ProjectId,
 };
 use async_trait::async_trait;
 use dashmap::DashMap;
-use smol_str::SmolStr;
 use std::{sync::Arc, time::Duration};
 use tokio::sync::{OwnedSemaphorePermit, Semaphore};
 use tokio::time::Instant;
@@ -212,9 +211,9 @@ pub enum AuthSecret {
 pub struct AuthInfo {
    pub secret: Option<AuthSecret>,
    /// List of IP addresses allowed for the autorization.
-    pub allowed_ips: Vec<SmolStr>,
+    pub allowed_ips: Vec<IpPattern>,
    /// Project ID. This is used for cache invalidation.
-    pub project_id: Option<SmolStr>,
+    pub project_id: Option<ProjectId>,
 }

 /// Info for establishing a connection to a compute node.
@@ -233,10 +232,10 @@ pub struct NodeInfo {
    pub allow_self_signed_compute: bool,
 }

-pub type NodeInfoCache = TimedLru<SmolStr, NodeInfo>;
+pub type NodeInfoCache = TimedLru<EndpointCacheKey, NodeInfo>;
 pub type CachedNodeInfo = Cached<&'static NodeInfoCache>;
-pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, AuthSecret>;
-pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<SmolStr>>>;
+pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
+pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPattern>>>;

 /// This will allocate per each call, but the http requests alone
 /// already require a few allocations, so it should be fine.
@@ -249,7 +248,7 @@ pub trait Api {
        &self,
        ctx: &mut RequestMonitoring,
        user_info: &ComputeUserInfo,
-    ) -> Result<Option<CachedRoleSecret>, errors::GetAuthInfoError>;
+    ) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;

    async fn get_allowed_ips(
        &self,
@@ -280,7 +279,7 @@ impl Api for ConsoleBackend {
        &self,
        ctx: &mut RequestMonitoring,
        user_info: &ComputeUserInfo,
-    ) -> Result<Option<CachedRoleSecret>, errors::GetAuthInfoError> {
+    ) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
        use ConsoleBackend::*;
        match self {
            Console(api) => api.get_role_secret(ctx, user_info).await,
@@ -345,7 +344,7 @@ impl ApiCaches {
 /// Various caches for [`console`](super).
 pub struct ApiLocks {
    name: &'static str,
-    node_locks: DashMap<SmolStr, Arc<Semaphore>>,
+    node_locks: DashMap<EndpointCacheKey, Arc<Semaphore>>,
    permits: usize,
    timeout: Duration,
    registered: prometheus::IntCounter,
@@ -413,7 +412,7 @@ impl ApiLocks {

    pub async fn get_wake_compute_permit(
        &self,
-        key: &SmolStr,
+        key: &EndpointCacheKey,
    ) -> Result<WakeComputePermit, errors::WakeComputeError> {
        if self.permits == 0 {
            return Ok(WakeComputePermit { permit: None });
--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -4,14 +4,13 @@ use super::{
    errors::{ApiError, GetAuthInfoError, WakeComputeError},
    AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo,
 };
-use crate::cache::Cached;
 use crate::console::provider::{CachedAllowedIps, CachedRoleSecret};
 use crate::context::RequestMonitoring;
 use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
+use crate::{auth::IpPattern, cache::Cached};
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use smol_str::SmolStr;
-use std::sync::Arc;
+use std::{str::FromStr, sync::Arc};
 use thiserror::Error;
 use tokio_postgres::{config::SslMode, Client};
 use tracing::{error, info, info_span, warn, Instrument};
@@ -88,7 +87,9 @@ impl Api {
            {
                Some(s) => {
                    info!("got allowed_ips: {s}");
-                    s.split(',').map(String::from).collect()
+                    s.split(',')
+                        .map(|s| IpPattern::from_str(s).unwrap())
+                        .collect()
                }
                None => vec![],
            };
@@ -100,7 +101,7 @@ impl Api {
        .await?;
        Ok(AuthInfo {
            secret,
-            allowed_ips: allowed_ips.iter().map(SmolStr::from).collect(),
+            allowed_ips,
            project_id: None,
        })
    }
@@ -150,12 +151,10 @@ impl super::Api for Api {
        &self,
        _ctx: &mut RequestMonitoring,
        user_info: &ComputeUserInfo,
-    ) -> Result<Option<CachedRoleSecret>, GetAuthInfoError> {
-        Ok(self
-            .do_get_auth_info(user_info)
-            .await?
-            .secret
-            .map(CachedRoleSecret::new_uncached))
+    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
+        Ok(CachedRoleSecret::new_uncached(
+            self.do_get_auth_info(user_info).await?.secret,
+        ))
    }

    async fn get_allowed_ips(
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -14,8 +14,6 @@ use crate::{
 };
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use itertools::Itertools;
-use smol_str::SmolStr;
 use std::sync::Arc;
 use tokio::time::Instant;
 use tokio_postgres::config::SslMode;
@@ -86,20 +84,20 @@ impl Api {
                },
            };

-            let secret = scram::ServerSecret::parse(&body.role_secret)
-                .map(AuthSecret::Scram)
-                .ok_or(GetAuthInfoError::BadSecret)?;
-            let allowed_ips = body
-                .allowed_ips
-                .into_iter()
-                .flatten()
-                .map(SmolStr::from)
-                .collect_vec();
+            let secret = if body.role_secret.is_empty() {
+                None
+            } else {
+                let secret = scram::ServerSecret::parse(&body.role_secret)
+                    .map(AuthSecret::Scram)
+                    .ok_or(GetAuthInfoError::BadSecret)?;
+                Some(secret)
+            };
+            let allowed_ips = body.allowed_ips.unwrap_or_default();
            ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64);
            Ok(AuthInfo {
-                secret: Some(secret),
+                secret,
                allowed_ips,
-                project_id: body.project_id.map(SmolStr::from),
+                project_id: body.project_id,
            })
        }
        .map_err(crate::error::log_error)
@@ -172,19 +170,20 @@ impl super::Api for Api {
        &self,
        ctx: &mut RequestMonitoring,
        user_info: &ComputeUserInfo,
-    ) -> Result<Option<CachedRoleSecret>, GetAuthInfoError> {
+    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
        let ep = &user_info.endpoint;
        let user = &user_info.user;
        if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) {
-            return Ok(Some(role_secret));
+            return Ok(role_secret);
        }
        let auth_info = self.do_get_auth_info(ctx, user_info).await?;
        if let Some(project_id) = auth_info.project_id {
-            if let Some(secret) = &auth_info.secret {
-                self.caches
-                    .project_info
-                    .insert_role_secret(&project_id, ep, user, secret.clone())
-            }
+            self.caches.project_info.insert_role_secret(
+                &project_id,
+                ep,
+                user,
+                auth_info.secret.clone(),
+            );
            self.caches.project_info.insert_allowed_ips(
                &project_id,
                ep,
@@ -192,7 +191,7 @@ impl super::Api for Api {
            );
        }
        // When we just got a secret, we don't need to invalidate it.
-        Ok(auth_info.secret.map(Cached::new_uncached))
+        Ok(Cached::new_uncached(auth_info.secret))
    }

    async fn get_allowed_ips(
@@ -214,11 +213,12 @@ impl super::Api for Api {
        let allowed_ips = Arc::new(auth_info.allowed_ips);
        let user = &user_info.user;
        if let Some(project_id) = auth_info.project_id {
-            if let Some(secret) = &auth_info.secret {
-                self.caches
-                    .project_info
-                    .insert_role_secret(&project_id, ep, user, secret.clone())
-            }
+            self.caches.project_info.insert_role_secret(
+                &project_id,
+                ep,
+                user,
+                auth_info.secret.clone(),
+            );
            self.caches
                .project_info
                .insert_allowed_ips(&project_id, ep, allowed_ips.clone());
@@ -238,7 +238,7 @@ impl super::Api for Api {
        // for some time (highly depends on the console's scale-to-zero policy);
        // The connection info remains the same during that period of time,
        // which means that we might cache it to reduce the load and latency.
-        if let Some(cached) = self.caches.node_info.get(&*key) {
+        if let Some(cached) = self.caches.node_info.get(&key) {
            info!(key = &*key, "found cached compute node info");
            return Ok(cached);
        }
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -7,7 +7,10 @@ use std::net::IpAddr;
 use tokio::sync::mpsc;
 use uuid::Uuid;

-use crate::{console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer};
+use crate::{
+    console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer, BranchId,
+    EndpointId, ProjectId, RoleName,
+};

 pub mod parquet;

@@ -26,10 +29,10 @@ pub struct RequestMonitoring {
    region: &'static str,

    // filled in as they are discovered
-    project: Option<SmolStr>,
-    branch: Option<SmolStr>,
-    endpoint_id: Option<SmolStr>,
-    user: Option<SmolStr>,
+    project: Option<ProjectId>,
+    branch: Option<BranchId>,
+    endpoint_id: Option<EndpointId>,
+    user: Option<RoleName>,
    application: Option<SmolStr>,
    error_kind: Option<ErrorKind>,
    success: bool,
@@ -86,7 +89,7 @@ impl RequestMonitoring {
        self.project = Some(x.project_id);
    }

-    pub fn set_endpoint_id(&mut self, endpoint_id: Option<SmolStr>) {
+    pub fn set_endpoint_id(&mut self, endpoint_id: Option<EndpointId>) {
        self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone());
    }

@@ -94,7 +97,7 @@ impl RequestMonitoring {
        self.application = app.or_else(|| self.application.clone());
    }

-    pub fn set_user(&mut self, user: SmolStr) {
+    pub fn set_user(&mut self, user: RoleName) {
        self.user = Some(user);
    }

--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -62,3 +62,79 @@ pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<Infallib
 pub fn flatten_err<T>(r: Result<anyhow::Result<T>, JoinError>) -> anyhow::Result<T> {
    r.context("join error").and_then(|x| x)
 }
+
+macro_rules! smol_str_wrapper {
+    ($name:ident) => {
+        #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Default)]
+        pub struct $name(smol_str::SmolStr);
+
+        impl $name {
+            pub fn as_str(&self) -> &str {
+                self.0.as_str()
+            }
+        }
+
+        impl std::fmt::Display for $name {
+            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                self.0.fmt(f)
+            }
+        }
+
+        impl<T> std::cmp::PartialEq<T> for $name
+        where
+            smol_str::SmolStr: std::cmp::PartialEq<T>,
+        {
+            fn eq(&self, other: &T) -> bool {
+                self.0.eq(other)
+            }
+        }
+
+        impl<T> From<T> for $name
+        where
+            smol_str::SmolStr: From<T>,
+        {
+            fn from(x: T) -> Self {
+                Self(x.into())
+            }
+        }
+
+        impl AsRef<str> for $name {
+            fn as_ref(&self) -> &str {
+                self.0.as_ref()
+            }
+        }
+
+        impl std::ops::Deref for $name {
+            type Target = str;
+            fn deref(&self) -> &str {
+                &*self.0
+            }
+        }
+
+        impl<'de> serde::de::Deserialize<'de> for $name {
+            fn deserialize<D: serde::de::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
+                <smol_str::SmolStr as serde::de::Deserialize<'de>>::deserialize(d).map(Self)
+            }
+        }
+
+        impl serde::Serialize for $name {
+            fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
+                self.0.serialize(s)
+            }
+        }
+    };
+}
+
+// 90% of role name strings are 20 characters or less.
+smol_str_wrapper!(RoleName);
+// 50% of endpoint strings are 23 characters or less.
+smol_str_wrapper!(EndpointId);
+// 50% of branch strings are 23 characters or less.
+smol_str_wrapper!(BranchId);
+// 90% of project strings are 23 characters or less.
+smol_str_wrapper!(ProjectId);
+
+// will usually equal endpoint ID
+smol_str_wrapper!(EndpointCacheKey);
+
+smol_str_wrapper!(DbName);
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -19,6 +19,7 @@ use crate::{
    rate_limiter::EndpointRateLimiter,
    stream::{PqStream, Stream},
    usage_metrics::{Ids, USAGE_METRICS},
+    EndpointCacheKey,
 };
 use anyhow::{bail, Context};
 use futures::TryFutureExt;
@@ -26,7 +27,7 @@ use itertools::Itertools;
 use once_cell::sync::OnceCell;
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
 use regex::Regex;
-use smol_str::SmolStr;
+use smol_str::{format_smolstr, SmolStr};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
@@ -516,20 +517,21 @@ impl NeonOptions {
        Self(options)
    }

-    pub fn get_cache_key(&self, prefix: &str) -> SmolStr {
+    pub fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey {
        // prefix + format!(" {k}:{v}")
        // kinda jank because SmolStr is immutable
        std::iter::once(prefix)
            .chain(self.0.iter().flat_map(|(k, v)| [" ", &**k, ":", &**v]))
-            .collect()
+            .collect::<SmolStr>()
+            .into()
    }

    /// <https://swagger.io/docs/specification/serialization/> DeepObject format
    /// `paramName[prop1]=value1&paramName[prop2]=value2&...`
-    pub fn to_deep_object(&self) -> Vec<(String, SmolStr)> {
+    pub fn to_deep_object(&self) -> Vec<(SmolStr, SmolStr)> {
        self.0
            .iter()
-            .map(|(k, v)| (format!("options[{}]", k), v.clone()))
+            .map(|(k, v)| (format_smolstr!("options[{}]", k), v.clone()))
            .collect()
    }
 }
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -6,13 +6,13 @@ use super::connect_compute::ConnectMechanism;
 use super::retry::ShouldRetry;
 use super::*;
 use crate::auth::backend::{ComputeUserInfo, TestBackend};
+use crate::auth::IpPattern;
 use crate::config::CertResolver;
 use crate::console::{self, CachedNodeInfo, NodeInfo};
 use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
 use crate::{auth, http, sasl, scram};
 use async_trait::async_trait;
 use rstest::rstest;
-use smol_str::SmolStr;
 use tokio_postgres::config::SslMode;
 use tokio_postgres::tls::{MakeTlsConnect, NoTls};
 use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream};
@@ -471,7 +471,7 @@ impl TestBackend for TestConnectMechanism {
        }
    }

-    fn get_allowed_ips(&self) -> Result<Vec<SmolStr>, console::errors::GetAuthInfoError> {
+    fn get_allowed_ips(&self) -> Result<Vec<IpPattern>, console::errors::GetAuthInfoError> {
        unimplemented!("not used in tests")
    }
 }
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -11,11 +11,12 @@ use anyhow::bail;
 use dashmap::DashMap;
 use itertools::Itertools;
 use rand::{rngs::StdRng, Rng, SeedableRng};
-use smol_str::SmolStr;
 use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
 use tokio::time::{timeout, Duration, Instant};
 use tracing::info;

+use crate::EndpointId;
+
 use super::{
    limit_algorithm::{LimitAlgorithm, Sample},
    RateLimiterConfig,
@@ -33,7 +34,7 @@ use super::{
 // does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now
 // I went with a more expensive way that yields user-friendlier error messages.
 pub struct EndpointRateLimiter<Rand = StdRng, Hasher = RandomState> {
-    map: DashMap<SmolStr, Vec<RateBucket>, Hasher>,
+    map: DashMap<EndpointId, Vec<RateBucket>, Hasher>,
    info: &'static [RateBucketInfo],
    access_count: AtomicUsize,
    rand: Mutex<Rand>,
@@ -146,7 +147,7 @@ impl<R: Rng, S: BuildHasher + Clone> EndpointRateLimiter<R, S> {
    }

    /// Check that number of connections to the endpoint is below `max_rps` rps.
-    pub fn check(&self, endpoint: SmolStr) -> bool {
+    pub fn check(&self, endpoint: EndpointId) -> bool {
        // do a partial GC every 2k requests. This cleans up ~ 1/64th of the map.
        // worst case memory usage is about:
        //    = 2 * 2048 * 64 * (48B + 72B)
@@ -493,11 +494,13 @@ mod tests {
    use futures::{task::noop_waker_ref, Future};
    use rand::SeedableRng;
    use rustc_hash::FxHasher;
-    use smol_str::SmolStr;
    use tokio::time;

    use super::{EndpointRateLimiter, Limiter, Outcome};
-    use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm};
+    use crate::{
+        rate_limiter::{RateBucketInfo, RateLimitAlgorithm},
+        EndpointId,
+    };

    #[tokio::test]
    async fn it_works() {
@@ -654,7 +657,7 @@ mod tests {
        RateBucketInfo::validate(&mut rates).unwrap();
        let limiter = EndpointRateLimiter::new(Vec::leak(rates));

-        let endpoint = SmolStr::from("ep-my-endpoint-1234");
+        let endpoint = EndpointId::from("ep-my-endpoint-1234");

        time::pause();

--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -3,9 +3,8 @@ use std::{convert::Infallible, sync::Arc};
 use futures::StreamExt;
 use redis::aio::PubSub;
 use serde::Deserialize;
-use smol_str::SmolStr;

-use crate::cache::project_info::ProjectInfoCache;
+use crate::{cache::project_info::ProjectInfoCache, ProjectId, RoleName};

 const CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
 const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20);
@@ -46,12 +45,12 @@ enum Notification {
 }
 #[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
 struct AllowedIpsUpdate {
-    project_id: SmolStr,
+    project_id: ProjectId,
 }
 #[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
 struct PasswordUpdate {
-    project_id: SmolStr,
-    role_name: SmolStr,
+    project_id: ProjectId,
+    role_name: RoleName,
 }
 fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result<T, D::Error>
 where
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -31,6 +31,7 @@ use crate::{
    metrics::NUM_DB_CONNECTIONS_GAUGE,
    proxy::connect_compute::ConnectMechanism,
    usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
+    DbName, EndpointCacheKey, RoleName,
 };
 use crate::{compute, config};

@@ -42,17 +43,17 @@ pub const APP_NAME: SmolStr = SmolStr::new_inline("/sql_over_http");
 #[derive(Debug, Clone)]
 pub struct ConnInfo {
    pub user_info: ComputeUserInfo,
-    pub dbname: SmolStr,
+    pub dbname: DbName,
    pub password: SmolStr,
 }

 impl ConnInfo {
    // hm, change to hasher to avoid cloning?
-    pub fn db_and_user(&self) -> (SmolStr, SmolStr) {
+    pub fn db_and_user(&self) -> (DbName, RoleName) {
        (self.dbname.clone(), self.user_info.user.clone())
    }

-    pub fn endpoint_cache_key(&self) -> SmolStr {
+    pub fn endpoint_cache_key(&self) -> EndpointCacheKey {
        self.user_info.endpoint_cache_key()
    }
 }
@@ -79,14 +80,14 @@ struct ConnPoolEntry {
 // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
 // Number of open connections is limited by the `max_conns_per_endpoint`.
 pub struct EndpointConnPool {
-    pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>,
+    pools: HashMap<(DbName, RoleName), DbUserConnPool>,
    total_conns: usize,
    max_conns: usize,
    _guard: IntCounterPairGuard,
 }

 impl EndpointConnPool {
-    fn get_conn_entry(&mut self, db_user: (SmolStr, SmolStr)) -> Option<ConnPoolEntry> {
+    fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option<ConnPoolEntry> {
        let Self {
            pools, total_conns, ..
        } = self;
@@ -95,7 +96,7 @@ impl EndpointConnPool {
            .and_then(|pool_entries| pool_entries.get_conn_entry(total_conns))
    }

-    fn remove_client(&mut self, db_user: (SmolStr, SmolStr), conn_id: uuid::Uuid) -> bool {
+    fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool {
        let Self {
            pools, total_conns, ..
        } = self;
@@ -196,7 +197,7 @@ pub struct GlobalConnPool {
    //
    // That should be a fairly conteded map, so return reference to the per-endpoint
    // pool as early as possible and release the lock.
-    global_pool: DashMap<SmolStr, Arc<RwLock<EndpointConnPool>>>,
+    global_pool: DashMap<EndpointCacheKey, Arc<RwLock<EndpointConnPool>>>,

    /// Number of endpoint-connection pools
    ///
@@ -440,7 +441,10 @@ impl GlobalConnPool {
        Ok(Client::new(new_client, conn_info, endpoint_pool).await)
    }

-    fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc<RwLock<EndpointConnPool>> {
+    fn get_or_create_endpoint_pool(
+        &self,
+        endpoint: &EndpointCacheKey,
+    ) -> Arc<RwLock<EndpointConnPool>> {
        // fast path
        if let Some(pool) = self.global_pool.get(endpoint) {
            return pool.clone();
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -13,7 +13,6 @@ use hyper::{Body, HeaderMap, Request};
 use serde_json::json;
 use serde_json::Map;
 use serde_json::Value;
-use smol_str::SmolStr;
 use tokio_postgres::error::DbError;
 use tokio_postgres::error::ErrorPosition;
 use tokio_postgres::types::Kind;
@@ -36,6 +35,8 @@ use crate::config::TlsConfig;
 use crate::context::RequestMonitoring;
 use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 use crate::proxy::NeonOptions;
+use crate::EndpointId;
+use crate::RoleName;

 use super::conn_pool::ConnInfo;
 use super::conn_pool::GlobalConnPool;
@@ -155,7 +156,7 @@ fn get_conn_info(
        .next()
        .ok_or(anyhow::anyhow!("invalid database name"))?;

-    let username = SmolStr::from(connection_url.username());
+    let username = RoleName::from(connection_url.username());
    if username.is_empty() {
        return Err(anyhow::anyhow!("missing username"));
    }
@@ -189,7 +190,7 @@ fn get_conn_info(

    let endpoint = endpoint_sni(hostname, &tls.common_names)?;

-    let endpoint: SmolStr = endpoint.into();
+    let endpoint: EndpointId = endpoint.into();
    ctx.set_endpoint_id(Some(endpoint.clone()));

    let pairs = connection_url.query_pairs();
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -1,12 +1,11 @@
 //! Periodically collect proxy consumption metrics
 //! and push them to a HTTP endpoint.
-use crate::{config::MetricCollectionConfig, http};
+use crate::{config::MetricCollectionConfig, http, BranchId, EndpointId};
 use chrono::{DateTime, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use dashmap::{mapref::entry::Entry, DashMap};
 use once_cell::sync::Lazy;
 use serde::{Deserialize, Serialize};
-use smol_str::SmolStr;
 use std::{
    convert::Infallible,
    sync::{
@@ -30,8 +29,8 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
 /// because we enrich the event with project_id in the control-plane endpoint.
 #[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
 pub struct Ids {
-    pub endpoint_id: SmolStr,
-    pub branch_id: SmolStr,
+    pub endpoint_id: EndpointId,
+    pub branch_id: BranchId,
 }

 #[derive(Debug)]
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -110,7 +110,7 @@ pub static REMOVED_WAL_SEGMENTS: Lazy<IntCounter> = Lazy::new(|| {
 pub static BACKED_UP_SEGMENTS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "safekeeper_backed_up_segments_total",
-        "Number of WAL segments backed up to the broker"
+        "Number of WAL segments backed up to the S3"
    )
    .expect("Failed to register safekeeper_backed_up_segments_total counter")
 });
@@ -337,6 +337,7 @@ pub struct TimelineCollector {
    flushed_wal_seconds: GaugeVec,
    collect_timeline_metrics: Gauge,
    timelines_count: IntGauge,
+    active_timelines_count: IntGauge,
 }

 impl Default for TimelineCollector {
@@ -520,6 +521,13 @@ impl TimelineCollector {
        .unwrap();
        descs.extend(timelines_count.desc().into_iter().cloned());

+        let active_timelines_count = IntGauge::new(
+            "safekeeper_active_timelines",
+            "Total number of active timelines",
+        )
+        .unwrap();
+        descs.extend(active_timelines_count.desc().into_iter().cloned());
+
        TimelineCollector {
            descs,
            commit_lsn,
@@ -540,6 +548,7 @@ impl TimelineCollector {
            flushed_wal_seconds,
            collect_timeline_metrics,
            timelines_count,
+            active_timelines_count,
        }
    }
 }
@@ -572,6 +581,7 @@ impl Collector for TimelineCollector {

        let timelines = GlobalTimelines::get_all();
        let timelines_count = timelines.len();
+        let mut active_timelines_count = 0;

        // Prometheus Collector is sync, and data is stored under async lock. To
        // bridge the gap with a crutch, collect data in spawned thread with
@@ -590,6 +600,10 @@ impl Collector for TimelineCollector {
            let timeline_id = tli.ttid.timeline_id.to_string();
            let labels = &[tenant_id.as_str(), timeline_id.as_str()];

+            if tli.timeline_is_active {
+                active_timelines_count += 1;
+            }
+
            self.commit_lsn
                .with_label_values(labels)
                .set(tli.mem_state.commit_lsn.into());
@@ -681,6 +695,10 @@ impl Collector for TimelineCollector {

        // report total number of timelines
        self.timelines_count.set(timelines_count as i64);
+        self.active_timelines_count
+            .set(active_timelines_count as i64);
        mfs.extend(self.timelines_count.collect());
+        // the active-timelines gauge is advertised in `descs`, so it has to be collected as well
+        mfs.extend(self.active_timelines_count.collect());

        mfs
--- a/safekeeper/src/remove_wal.rs
+++ b/safekeeper/src/remove_wal.rs
@@ -7,12 +7,21 @@ use tracing::*;

 use crate::{GlobalTimelines, SafeKeeperConf};

+const ALLOW_INACTIVE_TIMELINES: bool = true;
+
 pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
    let wal_removal_interval = Duration::from_millis(5000);
    loop {
+        let now = tokio::time::Instant::now();
+        let mut active_timelines = 0;
+
        let tlis = GlobalTimelines::get_all();
        for tli in &tlis {
-            if !tli.is_active().await {
+            let is_active = tli.is_active().await;
+            if is_active {
+                active_timelines += 1;
+            }
+            if !ALLOW_INACTIVE_TIMELINES && !is_active {
                continue;
            }
            let ttid = tli.ttid;
@@ -27,6 +36,17 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
            .instrument(info_span!("WAL removal", ttid = %ttid))
            .await;
        }
+
+        let elapsed = now.elapsed();
+        let total_timelines = tlis.len();
+
+        if elapsed > wal_removal_interval {
+            info!(
+                "WAL removal is too long, processed {} active timelines ({} total) in {:?}",
+                active_timelines, total_timelines, elapsed
+            );
+        }
+
        sleep(wal_removal_interval).await;
    }
 }
--- a/scripts/ps_ec2_setup_instance_store
+++ b/scripts/ps_ec2_setup_instance_store
@@ -39,6 +39,9 @@ SETUP COMPLETE
 To run your local neon.git build on the instance store volume,
 run the following commands from the top of the neon.git checkout

+    # raise file descriptor limit of your shell and its child processes
+    sudo prlimit -p \$\$ --nofile=800000:800000
+
    # test suite run
    export TEST_OUTPUT="$TEST_OUTPUT"
    DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_latency.py
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -10,16 +10,18 @@ import shutil
 import subprocess
 import tempfile
 import textwrap
+import threading
 import time
 import uuid
 from contextlib import closing, contextmanager
 from dataclasses import dataclass, field
 from datetime import datetime
+from fcntl import LOCK_EX, LOCK_UN, flock
 from functools import cached_property
 from itertools import chain, product
 from pathlib import Path
 from types import TracebackType
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
+from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
 from urllib.parse import urlparse

 import asyncpg
@@ -49,7 +51,10 @@ from fixtures.pageserver.allowed_errors import (
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.types import IndexPartDump
-from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
+from fixtures.pageserver.utils import (
+    wait_for_last_record_lsn,
+    wait_for_upload,
+)
 from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import (
@@ -424,6 +429,7 @@ class NeonEnvBuilder:
        pg_distrib_dir: Path,
        pg_version: PgVersion,
        test_name: str,
+        top_output_dir: Path,
        test_output_dir: Path,
        test_overlay_dir: Optional[Path] = None,
        pageserver_remote_storage: Optional[RemoteStorage] = None,
@@ -473,6 +479,7 @@ class NeonEnvBuilder:
        self.test_overlay_dir = test_overlay_dir
        self.overlay_mounts_created_by_us: List[Tuple[str, Path]] = []
        self.config_init_force: Optional[str] = None
+        self.top_output_dir = top_output_dir

        assert test_name.startswith(
            "test_"
@@ -526,6 +533,64 @@ class NeonEnvBuilder:

        return env

+    def build_and_use_snapshot(
+        self, global_ident: str, create_env_for_snapshot: Callable[[NeonEnvBuilder], NeonEnv]
+    ) -> NeonEnv:
+        if os.getenv("CI", "false") == "true":
+            log.info("do not use snapshots in ephemeral CI environment")
+            env = create_env_for_snapshot(self)
+            env.stop(immediate=True, ps_assert_metric_no_errors=False)
+            return env
+
+        with shared_snapshot_dir(self.top_output_dir, global_ident) as snapshot_dir:
+            if not snapshot_dir.is_initialized():
+                self._build_and_use_snapshot_impl(snapshot_dir, create_env_for_snapshot)
+                assert snapshot_dir.is_initialized()
+
+            return self.from_repo_dir(snapshot_dir.path)
+
+    def _build_and_use_snapshot_impl(
+        self,
+        snapshot_dir: SnapshotDirLocked,
+        create_env_for_snapshot: Callable[[NeonEnvBuilder], NeonEnv],
+    ):
+        if snapshot_dir.path.exists():
+            shutil.rmtree(snapshot_dir.path)
+
+        if self.test_overlay_dir is not None:
+            # Make repo_dir an overlayfs mount with lowerdir being the empty snapshot_dir.
+            # When we're done filling up repo_dir, tear everything down, unmount the overlayfs, and use
+            # the upperdir as the snapshot. This is equivalent to docker `FROM scratch`.
+            assert not self.repo_dir.exists()
+            assert self.repo_dir.parent.exists()
+            snapshot_dir.path.mkdir()
+            self.overlay_mount("create-snapshot-repo-dir", snapshot_dir.path, self.repo_dir)
+            self.config_init_force = "empty-dir-ok"
+
+        env = create_env_for_snapshot(self)
+        assert self.env is not None
+        assert self.env == env
+
+        # shut down everything for snapshot
+        env.stop(immediate=True, ps_assert_metric_no_errors=True)
+
+        # TODO: all kinds of assertions to ensure the env is unused
+
+        if self.test_overlay_dir is None:
+            log.info("take snapshot by moving repo dir")
+            env.repo_dir.rename(snapshot_dir.path)
+        else:
+            log.info("take snapshot by using overlayfs upperdir")
+            self.overlay_unmount_and_move("create-snapshot-repo-dir", snapshot_dir.path)
+            log.info("remove empty repo_dir (previously mountpoint) for snapshot overlay_mount")
+            env.repo_dir.rmdir()
+            # TODO from here on, we should be able to reset / goto top where snapshot_dir.is_initialized()
+            log.info("make repo_dir an overlayfs mount of the snapshot we just created")
+        assert not env.repo_dir.exists(), "both branches above should remove it"
+        snapshot_dir.set_initialized()
+
+        self.env = None  # so that from_repo_dir works again
+
    def from_repo_dir(
        self,
        repo_dir: Path,
@@ -557,10 +622,15 @@ class NeonEnvBuilder:
            tenants_from_dir = ps_dir / "tenants"
            tenants_to_dir = self.repo_dir / ps_dir.name / "tenants"

-            log.info(f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}")
            if self.test_overlay_dir is None:
+                log.info(
+                    f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}"
+                )
                shutil.copytree(tenants_from_dir, tenants_to_dir)
            else:
+                log.info(
+                    f"Creating overlayfs mount of pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}"
+                )
                self.overlay_mount(f"{ps_dir.name}:tenants", tenants_from_dir, tenants_to_dir)

        for sk_from_dir in (repo_dir / "safekeepers").glob("sk*"):
@@ -571,10 +641,12 @@ class NeonEnvBuilder:

        shutil.rmtree(self.repo_dir / "local_fs_remote_storage", ignore_errors=True)
        if self.test_overlay_dir is None:
+            log.info("Copying local_fs_remote_storage directory from snapshot")
            shutil.copytree(
                repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage"
            )
        else:
+            log.info("Creating overlayfs mount of local_fs_remote_storage directory from snapshot")
            self.overlay_mount(
                "local_fs_remote_storage",
                repo_dir / "local_fs_remote_storage",
@@ -631,6 +703,54 @@ class NeonEnvBuilder:
        )
        self.overlay_mounts_created_by_us.append((ident, dstdir))

+    def _overlay_umount(self, mountpoint: Path):
+        cmd = ["sudo", "umount", str(mountpoint)]
+        assert mountpoint.is_mount()
+        subprocess_capture(
+            self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True
+        )
+
+    def overlay_unmount_and_move(self, ident: str, dst: Path):
+        """
+        Unmount the overlayfs mount that was registered under `ident` and move its upperdir to `dst`.
+        If `dst` is an empty directory, it gets replaced; a non-empty `dst` makes `rmdir()` raise.
+        Caller is responsible for ensuring the unmount will succeed, i.e., that there aren't any nested mounts.
+
+        Asserts that self.test_overlay_dir is set; raises RuntimeError if `ident` has no recorded mount.
+        """
+        assert self.test_overlay_dir is not None
+        # not mutating state yet, make checks
+        ident_state_dir = self.test_overlay_dir / ident
+        assert ident_state_dir.is_dir()
+        upper = ident_state_dir / "upper"
+        work = ident_state_dir / "work"
+        assert upper.is_dir()
+        assert work.is_dir()
+        assert (
+            self.test_overlay_dir not in dst.parents
+        ), "otherwise workdir cleanup below wouldn't work"
+        # find index, still not mutating state
+        idxmap = {
+            existing_ident: idx
+            for idx, (existing_ident, _) in enumerate(self.overlay_mounts_created_by_us)
+        }
+        idx = idxmap.get(ident)
+        if idx is None:
+            raise RuntimeError(f"cannot find mount for ident {ident}")
+
+        if dst.is_dir():
+            dst.rmdir()  # raises exception if not empty, which is what we want
+
+        _, mountpoint = self.overlay_mounts_created_by_us.pop(idx)
+        self._overlay_umount(mountpoint)
+        upper.rename(dst)
+        # we moved the upperdir, clean up workdir and then its parent ident_state_dir
+        cmd = ["sudo", "rm", "-rf", str(work)]
+        subprocess_capture(
+            self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True
+        )
+        ident_state_dir.rmdir()  # should be empty since we moved `upper` out
+
    def overlay_cleanup_teardown(self):
        """
        Unmount the overlayfs mounts created by `self.overlay_mount()`.
@@ -641,13 +761,10 @@ class NeonEnvBuilder:
        while len(self.overlay_mounts_created_by_us) > 0:
            (ident, mountpoint) = self.overlay_mounts_created_by_us.pop()
            ident_state_dir = self.test_overlay_dir / ident
-            cmd = ["sudo", "umount", str(mountpoint)]
            log.info(
-                f"Unmounting overlayfs mount created during setup for ident {ident} at {mountpoint}: {cmd}"
-            )
-            subprocess_capture(
-                self.test_output_dir, cmd, check=True, echo_stderr=True, echo_stdout=True
+                f"Unmounting overlayfs mount created during setup for ident {ident} at {mountpoint}"
            )
+            self._overlay_umount(mountpoint)
            log.info(
                f"Cleaning up overlayfs state dir (owned by root user) for ident {ident} at {ident_state_dir}"
            )
@@ -725,8 +842,15 @@ class NeonEnvBuilder:
        if self.preserve_database_files:
            return

+        overlayfs_mounts = {mountpoint for _, mountpoint in self.overlay_mounts_created_by_us}
+
        directories_to_clean: List[Path] = []
        for test_entry in Path(self.repo_dir).glob("**/*"):
+            # Skip overlayfs mountpoints and everything below them.
+            if test_entry in overlayfs_mounts:
+                continue
+            if any(parent in overlayfs_mounts for parent in test_entry.parents):
+                continue
            if test_entry.is_file():
                test_file = test_entry
                if ATTACHMENT_NAME_REGEX.fullmatch(test_file.name):
@@ -775,13 +899,6 @@ class NeonEnvBuilder:
                    log.error(f"Error during remote storage scrub: {e}")
                    cleanup_error = e

-            try:
-                self.overlay_cleanup_teardown()
-            except Exception as e:
-                log.error(f"Error cleaning up overlay state: {e}")
-                if cleanup_error is not None:
-                    cleanup_error = e
-
            try:
                self.cleanup_remote_storage()
            except Exception as e:
@@ -802,6 +919,13 @@ class NeonEnvBuilder:
            for pageserver in self.env.pageservers:
                pageserver.assert_no_errors()

+        try:
+            self.overlay_cleanup_teardown()
+        except Exception as e:
+            log.error(f"Error cleaning up overlay state: {e}")
+            if cleanup_error is not None:
+                cleanup_error = e
+

 class NeonEnv:
    """
@@ -971,7 +1095,9 @@ class NeonEnv:
        assert that there is only one. Tests with multiple pageservers should always use
        get_pageserver with an explicit ID.
        """
-        assert len(self.pageservers) == 1
+        assert (
+            len(self.pageservers) == 1
+        ), "env.pageserver must only be used with single pageserver NeonEnv"
        return self.pageservers[0]

    def get_pageserver(self, id: Optional[int]) -> NeonPageserver:
@@ -1082,6 +1208,7 @@ def _shared_simple_env(
        shutil.rmtree(repo_dir, ignore_errors=True)

    with NeonEnvBuilder(
+        top_output_dir=top_output_dir,
        repo_dir=repo_dir,
        port_distributor=port_distributor,
        broker=default_broker,
@@ -1130,6 +1257,7 @@ def neon_env_builder(
    run_id: uuid.UUID,
    request: FixtureRequest,
    test_overlay_dir: Path,
+    top_output_dir: Path,
 ) -> Iterator[NeonEnvBuilder]:
    """
    Fixture to create a Neon environment for test.
@@ -1149,6 +1277,7 @@ def neon_env_builder(

    # Return the builder to the caller
    with NeonEnvBuilder(
+        top_output_dir=top_output_dir,
        repo_dir=Path(repo_dir),
        port_distributor=port_distributor,
        mock_s3_server=mock_s3_server,
@@ -2914,6 +3043,7 @@ class Endpoint(PgProtocol):

        # Write it back updated
        with open(config_path, "w") as file:
+            log.info(json.dumps(dict(data_dict, **kwargs)))
            json.dump(dict(data_dict, **kwargs), file, indent=4)

    # Mock the extension part of spec passed from control plane for local testing
@@ -3486,6 +3616,10 @@ def get_test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
    return _get_test_dir(request, top_output_dir, "overlay-")


+def get_shared_snapshot_dir_path(top_output_dir: Path, snapshot_name: str) -> Path:
+    return top_output_dir / "shared-snapshots" / snapshot_name
+
+
 def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
    return get_test_output_dir(request, top_output_dir) / "repo"

@@ -3532,6 +3666,75 @@ def test_output_dir(
    allure_attach_from_dir(test_dir)


+class FileAndThreadLock:
+    def __init__(self, path: Path):
+        self.path = path
+        self.thread_lock = threading.Lock()
+        self.fd: Optional[int] = None
+
+    def __enter__(self):
+        fd = os.open(self.path, os.O_CREAT | os.O_WRONLY)
+        # only touch self.fd under the thread lock: a waiter must not clobber the holder's fd
+        self.thread_lock.acquire()
+        flock(fd, LOCK_EX)
+        self.fd = fd
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        assert self.fd is not None
+        assert self.thread_lock.locked()  # ... by us
+        flock(self.fd, LOCK_UN)
+        os.close(self.fd)
+        self.fd = None
+        self.thread_lock.release()  # last, so that self.fd is ours until fully reset
+
+
+class SnapshotDirLocked:
+    def __init__(self, parent: SnapshotDir):
+        self._parent = parent
+
+    def is_initialized(self):
+        # TODO: in the future, take a `tag` as argument and store it in the marker in set_initialized.
+        # Then, in this function, compare marker file contents with the tag to invalidate the snapshot if the tag changed.
+        return self._parent._marker_file_path.exists()
+
+    def set_initialized(self):
+        self._parent._marker_file_path.write_text("")
+
+    @property
+    def path(self) -> Path:
+        return self._parent._path / "snapshot"
+
+
+class SnapshotDir:
+    _path: Path
+
+    def __init__(self, path: Path):
+        self._path = path
+        assert self._path.is_dir()
+        self._lock = FileAndThreadLock(self._lock_file_path)
+
+    @property
+    def _lock_file_path(self) -> Path:
+        return self._path / "initializing.flock"
+
+    @property
+    def _marker_file_path(self) -> Path:
+        return self._path / "initialized.marker"
+
+    def __enter__(self) -> SnapshotDirLocked:
+        self._lock.__enter__()
+        return SnapshotDirLocked(self)
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        self._lock.__exit__(exc_type, exc_value, exc_traceback)
+
+
+def shared_snapshot_dir(top_output_dir, ident: str) -> SnapshotDir:
+    snapshot_dir_path = get_shared_snapshot_dir_path(top_output_dir, ident)
+    snapshot_dir_path.mkdir(exist_ok=True, parents=True)
+    return SnapshotDir(snapshot_dir_path)
+
+
@pytest.fixture(scope="function")
 def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[Path]:
    """
@@ -3541,7 +3744,7 @@ def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[
    The procedure cleans up after previous runs that were aborted (e.g. due to Ctrl-C, OOM kills, etc).
    """

-    if os.getenv("NEON_ENV_BUILDER_FROM_REPO_DIR_USE_OVERLAYFS") is None:
+    if os.getenv("NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS") is None:
        return None

    overlay_dir = get_test_overlay_dir(request, top_output_dir)
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -20,6 +20,7 @@ from fixtures.utils import Fn
 class PageserverApiException(Exception):
    def __init__(self, message, status_code: int):
        super().__init__(message)
+        self.message = message
        self.status_code = status_code


@@ -261,12 +262,18 @@ class PageserverHttpClient(requests.Session):
        )
        self.verbose_error(res)

-    def tenant_detach(self, tenant_id: TenantId, detach_ignored=False):
+    def tenant_detach(self, tenant_id: TenantId, detach_ignored=False, timeout_secs=None):
        params = {}
        if detach_ignored:
            params["detach_ignored"] = "true"

-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
+        kwargs = {}
+        if timeout_secs is not None:
+            kwargs["timeout"] = timeout_secs
+
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params, **kwargs
+        )
        self.verbose_error(res)

    def tenant_reset(self, tenant_id: Union[TenantId, TenantShardId], drop_cache: bool):
@@ -526,6 +533,17 @@ class PageserverHttpClient(requests.Session):
        res_json = res.json()
        assert res_json is None

+    def timeline_preserve_initdb_archive(
+        self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
+    ):
+        log.info(
+            f"Requesting initdb archive preservation for tenant {tenant_id} and timeline {timeline_id}"
+        )
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive",
+        )
+        self.verbose_error(res)
+
    def timeline_get_lsn_by_timestamp(
        self,
        tenant_id: Union[TenantId, TenantShardId],
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -0,0 +1,85 @@
+import concurrent.futures
+import time
+from typing import Any, Callable, Dict, Tuple
+
+import fixtures.pageserver.remote_storage
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+)
+from fixtures.pageserver.utils import (
+    wait_until_tenant_state,
+)
+from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
+from fixtures.types import TenantId, TimelineId
+
+
+def single_timeline(
+    neon_env_builder: NeonEnvBuilder,
+    setup_template: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]],
+    ncopies: int,
+) -> NeonEnv:
+    """
+    Create `ncopies` duplicates of a template tenant that has a single timeline.
+    """
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+
+    env = neon_env_builder.init_start()
+
+    remote_storage = env.pageserver_remote_storage
+    assert isinstance(remote_storage, LocalFsStorage)
+
+    ps_http = env.pageserver.http_client()
+    # clean up the useless default tenant
+    ps_http.tenant_delete(env.initial_tenant)
+
+    log.info("invoking callback to create template tenant")
+    template_tenant, template_timeline, template_config = setup_template(env)
+    log.info(
+        f"template tenant is template_tenant={template_tenant} template_timeline={template_timeline}"
+    )
+
+    log.info("detach template tenant form pageserver")
+    env.pageserver.tenant_detach(template_tenant)
+    env.pageserver.allowed_errors.append(
+        # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+        ".*Dropped remote consistent LSN updates.*",
+    )
+
+    log.info(f"duplicating template tenant {ncopies} times in S3")
+    tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies)
+
+    log.info("attach duplicated tenants to pageserver")
+    # In theory we could just attach all the tenants, force on-demand downloads via mgmt API, and be done.
+    # However, on-demand downloads are quite slow ATM.
+    # => do the on-demand downloads in Python.
+    assert ps_http.tenant_list() == []
+    # make the attach fail after it created enough on-disk state to retry loading
+    # the tenant next startup, but before it can start background loops that would start download
+    ps_http.configure_failpoints(("attach-before-activate", "return"))
+    env.pageserver.allowed_errors.append(
+        ".*attach failed, setting tenant state to Broken: attach-before-activate.*"
+    )
+
+    def attach_broken(tenant):
+        env.pageserver.tenant_attach(
+            tenant,
+            config=template_config.copy(),
+        )
+        time.sleep(0.1)
+        wait_until_tenant_state(ps_http, tenant, "Broken", 10)
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
+        list(executor.map(attach_broken, tenants))  # consume: map() raises worker errors lazily
+
+    env.pageserver.stop(
+        immediate=True
+    )  # clears the failpoint as a side-effect; immediate to avoid hitting neon_local's timeout
+    tenant_timelines = list(map(lambda tenant: (tenant, template_timeline), tenants))
+    log.info("python-side on-demand download the layer files into local tenant dir")
+    fixtures.pageserver.remote_storage.copy_all_remote_layer_files_to_local_tenant_dir(
+        env, tenant_timelines
+    )
+
+    return env
--- a/test_runner/fixtures/pageserver/remote_storage.py
+++ b/test_runner/fixtures/pageserver/remote_storage.py
@@ -0,0 +1,116 @@
+import concurrent.futures
+import os
+import queue
+import shutil
+import threading
+from pathlib import Path
+from typing import Any, List, Tuple
+
+from fixtures.neon_fixtures import NeonEnv, Pagectl
+from fixtures.pageserver.types import (
+    InvalidFileName,
+    parse_layer_file_name,
+)
+from fixtures.remote_storage import LocalFsStorage
+from fixtures.types import TenantId, TimelineId
+
+
+def duplicate_one_tenant(env: NeonEnv, template_tenant: TenantId, new_tenant: TenantId):
+    remote_storage = env.pageserver_remote_storage
+    assert isinstance(remote_storage, LocalFsStorage)
+
+    src_timelines_dir: Path = remote_storage.tenant_path(template_tenant) / "timelines"
+    assert src_timelines_dir.is_dir(), f"{src_timelines_dir} is not a directory"
+
+    assert isinstance(remote_storage, LocalFsStorage)
+    dst_timelines_dir: Path = remote_storage.tenant_path(new_tenant) / "timelines"
+    dst_timelines_dir.parent.mkdir(parents=False, exist_ok=False)
+    dst_timelines_dir.mkdir(parents=False, exist_ok=False)
+
+    for tl in src_timelines_dir.iterdir():
+        src_tl_dir = src_timelines_dir / tl.name
+        assert src_tl_dir.is_dir(), f"{src_tl_dir} is not a directory"
+        dst_tl_dir = dst_timelines_dir / tl.name
+        dst_tl_dir.mkdir(parents=False, exist_ok=False)
+        for file in tl.iterdir():
+            shutil.copy2(file, dst_tl_dir)
+            if "__" in file.name:
+                Pagectl(env).raw_cli(
+                    [
+                        "layer",
+                        "rewrite-summary",
+                        str(dst_tl_dir / file.name),
+                        "--new-tenant-id",
+                        str(new_tenant),
+                    ]
+                )
+            else:
+                # index_part etc need no patching
+                pass
+    return None
+
+
+def duplicate_tenant(env: NeonEnv, template_tenant: TenantId, ncopies: int) -> List[TenantId]:
+    """Copy `template_tenant` in remote storage `ncopies` times; return the new tenant IDs."""
+    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
+
+    new_tenants: List[TenantId] = [TenantId.generate() for _ in range(0, ncopies)]
+    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
+        # map() only re-raises a worker's exception on retrieval, so drain the iterator
+        list(executor.map(lambda t: duplicate_one_tenant(env, template_tenant, t), new_tenants))
+
+    return new_tenants
+
+
+def local_layer_name_from_remote_name(remote_name: str) -> str:
+    try:
+        return parse_layer_file_name(remote_name).to_str()
+    except InvalidFileName as e:
+        comps = remote_name.rsplit("-", 1)
+        if len(comps) == 1:
+            raise InvalidFileName("no generation suffix found") from e
+        else:
+            assert len(comps) == 2
+            layer_file_name, _generation = comps
+            try:
+                return parse_layer_file_name(layer_file_name).to_str()
+            except InvalidFileName:
+                raise
+
+
+def copy_all_remote_layer_files_to_local_tenant_dir(
+    env: NeonEnv, tenant_timelines: List[Tuple[TenantId, TimelineId]]
+):
+    remote_storage = env.pageserver_remote_storage
+    assert isinstance(remote_storage, LocalFsStorage)
+    work: queue.Queue[Any] = queue.Queue()
+    for tenant, timeline in tenant_timelines:
+        remote_timeline_path = remote_storage.timeline_path(tenant, timeline)
+        local_timeline_path = env.pageserver.timeline_dir(tenant, timeline)
+        local_timeline_path.mkdir(parents=True, exist_ok=True)
+        downloads = {}
+        for remote_layer in remote_timeline_path.glob("*__*"):
+            local_name = local_layer_name_from_remote_name(remote_layer.name)
+            assert local_name not in downloads, "remote storage must have had split brain"
+            downloads[local_name] = remote_layer
+        for local_name, remote_path in downloads.items():
+            work.put((remote_path, local_timeline_path / local_name))
+
+    def copy_layer_worker(queue):
+        while True:
+            item = queue.get()
+            if item is None:
+                return
+            remote_path, local_path = item
+            # not copy2, so it looks like a recent download, in case that's relevant to e.g. eviction
+            shutil.copy(remote_path, local_path, follow_symlinks=False)
+
+    workers = []
+    n_threads = os.cpu_count() or 1
+    for _ in range(0, n_threads):
+        w = threading.Thread(target=copy_layer_worker, args=[work])
+        workers.append(w)
+        w.start()
+        work.put(None)
+    for w in workers:
+        w.join()
--- a/test_runner/fixtures/pageserver/types.py
+++ b/test_runner/fixtures/pageserver/types.py
@@ -31,10 +31,10 @@ class DeltaLayerFileName:
    key_start: Key
    key_end: Key

-    def is_l0(self):
+    def is_l0(self) -> bool:
        return self.key_start == KEY_MIN and self.key_end == KEY_MAX

-    def to_str(self):
+    def to_str(self) -> str:
        ret = f"{self.key_start.as_int():036X}-{self.key_end.as_int():036X}__{self.lsn_start.as_int():016X}-{self.lsn_end.as_int():016X}"
        assert self == parse_layer_file_name(ret)
        return ret
@@ -107,7 +107,7 @@ def parse_layer_file_name(file_name: str) -> LayerFileName:
    except InvalidFileName:
        pass

-    raise ValueError()
+    raise InvalidFileName("neither image nor delta layer")


 def is_future_layer(layer_file_name: LayerFileName, disk_consistent_lsn: Lsn):
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -63,6 +63,14 @@ def wait_for_upload(
    )


+def _tenant_in_expected_state(tenant_info: Dict[str, Any], expected_state: str):
+    if tenant_info["state"]["slug"] == expected_state:
+        return True  # checked first, so expecting "Broken" itself succeeds
+    if tenant_info["state"]["slug"] == "Broken":
+        raise RuntimeError(f"tenant became Broken, not {expected_state}")
+    return False
+
+
 def wait_until_tenant_state(
    pageserver_http: PageserverHttpClient,
    tenant_id: TenantId,
@@ -80,10 +88,8 @@ def wait_until_tenant_state(
            log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")
        else:
            log.debug(f"Tenant {tenant_id} data: {tenant}")
-            if tenant["state"]["slug"] == expected_state:
+            if _tenant_in_expected_state(tenant, expected_state):
                return tenant
-            if tenant["state"]["slug"] == "Broken":
-                raise RuntimeError(f"tenant became Broken, not {expected_state}")

        time.sleep(period)

@@ -92,6 +98,34 @@ def wait_until_tenant_state(
    )


+def wait_until_all_tenants_state(
+    pageserver_http: PageserverHttpClient,
+    expected_state: str,
+    iterations: int,
+    period: float = 1.0,
+    http_error_ok: bool = True,
+):
+    """
+    Like wait_until_tenant_state, but checks all tenants.
+    """
+    for _ in range(iterations):
+        try:
+            tenants = pageserver_http.tenant_list()
+        except Exception as e:
+            if http_error_ok:
+                log.debug(f"Failed to list tenants: {e}")
+            else:
+                raise
+        else:
+            if all(map(lambda tenant: _tenant_in_expected_state(tenant, expected_state), tenants)):
+                return
+        time.sleep(period)
+
+    raise Exception(
+        f"Not all tenants became {expected_state} within {iterations * period} seconds"
+    )
+
+
 def wait_until_timeline_state(
    pageserver_http: PageserverHttpClient,
    tenant_id: Union[TenantId, TenantShardId],
@@ -337,8 +371,24 @@ def tenant_delete_wait_completed(
    pageserver_http: PageserverHttpClient,
    tenant_id: TenantId,
    iterations: int,
+    ignore_errors: bool = False,
 ):
-    pageserver_http.tenant_delete(tenant_id=tenant_id)
+    if not ignore_errors:
+        pageserver_http.tenant_delete(tenant_id=tenant_id)
+    else:
+        interval = 0.5
+
+        def delete_request_sent():
+            try:
+                pageserver_http.tenant_delete(tenant_id=tenant_id)
+            except PageserverApiException as e:
+                log.debug(e)
+                if e.status_code == 404:
+                    return
+            except Exception as e:
+                log.debug(e)
+
+        wait_until(iterations, interval=interval, func=delete_request_sent)
    wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations)


--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -397,3 +397,36 @@ def run_pg_bench_small(pg_bin: "PgBin", connstr: str):
    }
    """
    pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", connstr])
+
+
+def humantime_to_ms(humantime: str) -> float:
+    """
+    Converts Rust humantime's output string to milliseconds.
+
+    humantime_to_ms("1h 1ms 406us") -> 3600001.406
+    """
+
+    unit_multiplier_map = {
+        "ns": 1e-6,
+        "us": 1e-3,
+        "ms": 1,
+        "s": 1e3,
+        "m": 1e3 * 60,
+        "h": 1e3 * 60 * 60,
+    }
+    matcher = re.compile(rf"^(\d+)({'|'.join(unit_multiplier_map.keys())})$")
+    total_ms = 0.0
+
+    if humantime == "0":
+        return total_ms
+
+    for item in humantime.split():
+        if (match := matcher.search(item)) is not None:
+            n, unit = match.groups()
+            total_ms += int(n) * unit_multiplier_map[unit]
+        else:
+            raise ValueError(
+                f"can't parse '{item}' (from string '{humantime}'), known units are {', '.join(unit_multiplier_map.keys())}."
+            )
+
+    return round(total_ms, 3)
--- a/test_runner/performance/pageserver/README.md
+++ b/test_runner/performance/pageserver/README.md
@@ -0,0 +1,16 @@
+How to reproduce benchmark results / run these benchmarks interactively.
+
+1. Get an EC2 instance with Instance Store. Use the same instance type as used for the benchmark run.
+2. Mount the Instance Store => `neon.git/scripts/ps_ec2_setup_instance_store`
+3. Use a pytest command line (see other READMEs further up in the pytest hierarchy).
+
+For tests that take a long time to set up / consume a lot of storage space,
+we use the test suite's repo_dir snapshotting functionality (`from_repo_dir`).
+It supports mounting snapshots using overlayfs, which improves iteration time.
+
+Here's a full command line.
+
+```
+RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=15 BUILD_TYPE=release \
+    ./scripts/pytest test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+```
--- a/test_runner/performance/pageserver/__init__.py
+++ b/test_runner/performance/pageserver/__init__.py
--- a/test_runner/performance/pageserver/interactive/__init__.py
+++ b/test_runner/performance/pageserver/interactive/__init__.py
@@ -0,0 +1,8 @@
+"""
+Tests that aren't really tests or benchmarks.
+
+They're intended for the case where we want to standardize & automate setup,
+but then debug a performance problem interactively.
+It's kind of an abuse of the test framework, but, it's our only tool right
+now to automate a complex test bench setup.
+"""
--- a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
+++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py
@@ -0,0 +1,79 @@
+import os
+import pdb
+
+import fixtures.pageserver.many_tenants as many_tenants
+import pytest
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    last_flush_lsn_upload,
+)
+
+from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
+
+"""
+Usage:
+DEFAULT_PG_VERSION=15 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \
+    ./scripts/pytest --timeout 0 test_runner/performance/pageserver/interactive/test_many_small_tenants.py
+"""
+
+
+@pytest.mark.skipif(
+    os.environ.get("INTERACTIVE", "false") != "true",
+    reason="test is for interactive use only",
+)
+def test_many_small_tenants(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+):
+    _env = setup_env(neon_env_builder, 2)  # vary this to the desired number of tenants
+    _pg_bin = pg_bin
+
+    # Drop into pdb so that we can debug pageserver interactively.
+    # For example, to interactively examine pageserver startup behavior, call
+    #   _env.pageserver.stop(immediate=True)
+    #   _env.pageserver.start()
+    # from the pdb shell.
+    pdb.set_trace()
+
+
+def setup_env(
+    neon_env_builder: NeonEnvBuilder,
+    n_tenants: int,
+) -> NeonEnv:
+    def setup_template(env: NeonEnv):
+        # create our template tenant
+        config = {
+            "gc_period": "0s",
+            "checkpoint_timeout": "10 years",
+            "compaction_period": "20 s",
+            "compaction_threshold": 10,
+            "compaction_target_size": 134217728,
+            "checkpoint_distance": 268435456,
+            "image_creation_threshold": 3,
+        }
+        template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
+        env.pageserver.tenant_detach(template_tenant)
+        env.pageserver.allowed_errors.append(
+            # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+            ".*Dropped remote consistent LSN updates.*",
+        )
+        env.pageserver.tenant_attach(template_tenant, config)
+        ep = env.endpoints.create_start("main", tenant_id=template_tenant)
+        ep.safe_psql("create table foo(b text)")
+        for _ in range(0, 8):
+            ep.safe_psql("insert into foo(b) values ('some text')")
+            last_flush_lsn_upload(env, ep, template_tenant, template_timeline)
+        ep.stop_and_destroy()
+        return (template_tenant, template_timeline, config)
+
+    def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
+        return many_tenants.single_timeline(neon_env_builder, setup_template, n_tenants)
+
+    env = neon_env_builder.build_and_use_snapshot(f"many-small-tenants-{n_tenants}", doit)
+
+    env.start()
+    ensure_pageserver_ready_for_benchmarking(env, n_tenants)
+
+    return env
--- a/test_runner/performance/pageserver/pagebench/__init__.py
+++ b/test_runner/performance/pageserver/pagebench/__init__.py
@@ -0,0 +1,10 @@
+"""
+Pagebench-based performance regression tests.
+
+The defining characteristic of tests in this sub-directory is that they
+are component-level tests, i.e., they exercise pageserver directly using `pagebench`
+instead of benchmarking the full stack.
+
+See https://github.com/neondatabase/neon/issues/5771
+for the context in which this was developed.
+"""
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -0,0 +1,210 @@
+import json
+from pathlib import Path
+from typing import Any, Dict, Tuple
+
+import fixtures.pageserver.many_tenants as many_tenants
+import pytest
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgBin,
+    wait_for_last_flush_lsn,
+)
+from fixtures.utils import get_scale_for_db, humantime_to_ms
+
+from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
+
+
+# For reference, the space usage of the snapshots:
+# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots
+# 137G    /instance_store/test_output/shared-snapshots
+# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots/*
+# 1.8G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-13
+# 1.1G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-6
+# 8.5G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-13
+# 5.1G    /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-6
+# 76G     /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-13
+# 46G     /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6
+@pytest.mark.parametrize("duration", [30])
+@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]])
+@pytest.mark.parametrize("n_tenants", [1, 10, 100])
+@pytest.mark.timeout(
+    10000
+)  # TODO: this value is just "a really high number"; have this per instance type
+def test_pageserver_max_throughput_getpage_at_latest_lsn(
+    neon_env_builder: NeonEnvBuilder,
+    zenbenchmark: NeonBenchmarker,
+    pg_bin: PgBin,
+    n_tenants: int,
+    pgbench_scale: int,
+    duration: int,
+):
+    def record(metric, **kwargs):
+        zenbenchmark.record(
+            metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}", **kwargs
+        )
+
+    params: Dict[str, Tuple[Any, Dict[str, Any]]] = {}
+
+    # params from fixtures
+    params.update(
+        {
+            "n_tenants": (n_tenants, {"unit": ""}),
+            "pgbench_scale": (pgbench_scale, {"unit": ""}),
+            "duration": (duration, {"unit": "s"}),
+        }
+    )
+
+    # configure cache sizes like in prod
+    page_cache_size = 16384
+    max_file_descriptors = 500000
+    neon_env_builder.pageserver_config_override = (
+        f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}"
+    )
+    params.update(
+        {
+            "pageserver_config_override.page_cache_size": (
+                page_cache_size * 8192,
+                {"unit": "byte"},
+            ),
+            "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}),
+        }
+    )
+
+    for param, (value, kwargs) in params.items():
+        record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs)
+    env = setup_pageserver_with_pgbench_tenants(neon_env_builder, pg_bin, n_tenants, pgbench_scale)
+    run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration)
+
+
+def run_benchmark_max_throughput_latest_lsn(
+    env: NeonEnv, pg_bin: PgBin, record, duration_secs: int
+):
+    """
+    Benchmark `env.pageserver` for max throughput @ latest LSN and record results in `zenbenchmark`.
+    """
+
+    ps_http = env.pageserver.http_client()
+    cmd = [
+        str(env.neon_binpath / "pagebench"),
+        "get-page-latest-lsn",
+        "--mgmt-api-endpoint",
+        ps_http.base_url,
+        "--page-service-connstring",
+        env.pageserver.connstr(password=None),
+        "--runtime",
+        f"{duration_secs}s",
+        # don't specify the targets explicitly, let pagebench auto-discover them
+    ]
+    log.info(f"command: {' '.join(cmd)}")
+    basepath = pg_bin.run_capture(cmd, with_command_header=False)
+    results_path = Path(basepath + ".stdout")
+    log.info(f"Benchmark results at: {results_path}")
+
+    with open(results_path, "r") as f:
+        results = json.load(f)
+    log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
+
+    total = results["total"]
+
+    metric = "request_count"
+    record(
+        metric,
+        metric_value=total[metric],
+        unit="",
+        report=MetricReport.HIGHER_IS_BETTER,
+    )
+
+    metric = "latency_mean"
+    record(
+        metric,
+        metric_value=humantime_to_ms(total[metric]),
+        unit="ms",
+        report=MetricReport.LOWER_IS_BETTER,
+    )
+
+    metric = "latency_percentiles"
+    for k, v in total[metric].items():
+        record(
+            f"{metric}.{k}",
+            metric_value=humantime_to_ms(v),
+            unit="ms",
+            report=MetricReport.LOWER_IS_BETTER,
+        )
+
+
+def setup_pageserver_with_pgbench_tenants(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+    n_tenants: int,
+    scale: int,
+) -> NeonEnv:
+    """
+    Utility function to set up a pageserver with a given number of identical tenants.
+    Each tenant is a pgbench tenant, initialized to a certain scale, and treated afterwards
+    with a repeated application of (pgbench simple-update workload, checkpoint, compact).
+    """
+
+    def setup_template(env: NeonEnv):
+        # use a config that makes production of on-disk state timing-insensitive
+        # as we ingest data into the tenant.
+        config = {
+            "gc_period": "0s",  # disable periodic gc
+            "checkpoint_timeout": "10 years",
+            "compaction_period": "0s",  # disable periodic compaction
+            "compaction_threshold": 10,
+            "compaction_target_size": 134217728,
+            "checkpoint_distance": 268435456,
+            "image_creation_threshold": 3,
+        }
+        template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True)
+        env.pageserver.tenant_detach(template_tenant)
+        env.pageserver.allowed_errors.append(
+            # tenant detach causes this because the underlying attach-hook removes the tenant from attachment_service entirely
+            ".*Dropped remote consistent LSN updates.*",
+        )
+        env.pageserver.tenant_attach(template_tenant, config)
+        ps_http = env.pageserver.http_client()
+        with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
+            pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()])
+            wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
+            ps_http.timeline_checkpoint(template_tenant, template_timeline)
+            ps_http.timeline_compact(template_tenant, template_timeline)
+            for _ in range(
+                0, 17
+            ):  # some prime number to avoid potential resonances with the "_threshold" variables from the config
+                # the L0s produced by this appear to have size ~5MiB
+                num_txns = 10_000
+                pg_bin.run_capture(
+                    ["pgbench", "-N", "-c1", "--transactions", f"{num_txns}", ep.connstr()]
+                )
+                wait_for_last_flush_lsn(env, ep, template_tenant, template_timeline)
+                ps_http.timeline_checkpoint(template_tenant, template_timeline)
+                ps_http.timeline_compact(template_tenant, template_timeline)
+        # for reference, the output at scale=6 looked like so (306M total)
+        # ls -sh test_output/shared-snapshots/max_throughput_latest_lsn-2-6/snapshot/pageserver_1/tenants/35c30b88ea16a7a09f82d9c6a115551b/timelines/da902b378eebe83dc8a4e81cd3dc1c59
+        # total 306M
+        # 188M 000000000000000000000000000000000000-030000000000000000000000000000000003__000000000149F060-0000000009E75829
+        # 4.5M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000009E75829-000000000A21E919
+        #  33M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000A21E919-000000000C20CB71
+        #  36M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000C20CB71-000000000E470791
+        #  16M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000E470791-000000000F34AEF1
+        # 8.2M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000F34AEF1-000000000FABA8A9
+        # 6.0M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FABA8A9-000000000FFE0639
+        # 6.1M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000000FFE0639-000000001051D799
+        # 4.7M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000001051D799-0000000010908F19
+        # 4.6M 000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000010908F19-0000000010CD3021
+
+        return (template_tenant, template_timeline, config)
+
+    def doit(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
+        return many_tenants.single_timeline(neon_env_builder, setup_template, n_tenants)
+
+    env = neon_env_builder.build_and_use_snapshot(
+        f"max_throughput_latest_lsn-{n_tenants}-{scale}", doit
+    )
+    env.start()
+    ensure_pageserver_ready_for_benchmarking(env, n_tenants)
+    return env
--- a/test_runner/performance/pageserver/util.py
+++ b/test_runner/performance/pageserver/util.py
@@ -0,0 +1,29 @@
+"""
+Utilities used by all code in this sub-directory
+"""
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.pageserver.utils import wait_until_all_tenants_state
+
+
+def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
+    """
+    Wait for all tenants to become Active, then assert that no historic layer is remote.
+    """
+    ps_http = env.pageserver.http_client()
+
+    log.info("wait for all tenants to become active")
+    wait_until_all_tenants_state(
+        ps_http, "Active", iterations=n_tenants, period=1, http_error_ok=False
+    )
+
+    # ensure all layers are resident for predictable performance
+    tenants = [info["id"] for info in ps_http.tenant_list()]
+    for tenant in tenants:
+        for timeline in ps_http.tenant_status(tenant)["timelines"]:
+            info = ps_http.layer_map_info(tenant, timeline)
+            for layer in info.historic_layers:
+                assert not layer.remote
+
+    log.info("ready")
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -7,11 +7,13 @@ from typing import List, Optional

 import pytest
 import toml
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
    PgBin,
 )
+from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
    timeline_delete_wait_completed,
    wait_for_last_record_lsn,
@@ -269,14 +271,20 @@ def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, r
    timeline_id = env.initial_timeline
    pg_version = env.pg_version

-    # Delete all files from local_fs_remote_storage except initdb.tar.zst,
+    try:
+        pageserver_http.timeline_preserve_initdb_archive(tenant_id, timeline_id)
+    except PageserverApiException as e:
+        # Allow the error as we might be running the old pageserver binary
+        log.info(f"Got allowed error: '{e}'")
+
+    # Delete all files from local_fs_remote_storage except initdb-preserved.tar.zst,
    # the file is required for `timeline_create` with `existing_initdb_timeline_id`.
    #
    # TODO: switch to Path.walk() in Python 3.12
    # for dirpath, _dirnames, filenames in (repo_dir / "local_fs_remote_storage").walk():
    for dirpath, _dirnames, filenames in os.walk(repo_dir / "local_fs_remote_storage"):
        for filename in filenames:
-            if filename != "initdb.tar.zst":
+            if filename != "initdb-preserved.tar.zst" and filename != "initdb.tar.zst":
                (Path(dirpath) / filename).unlink()

    timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id)
--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -248,8 +248,15 @@ def test_ddl_forwarding(ddl: DdlForwardingContext):
    # We don't have compute_ctl, so here, so create neon_superuser here manually
    cur.execute("CREATE ROLE neon_superuser NOLOGIN CREATEDB CREATEROLE")

-    with pytest.raises(psycopg2.InternalError):
-        cur.execute("ALTER ROLE neon_superuser LOGIN")
+    # Contrary to popular belief, being superman does not make you superuser
+    cur.execute("CREATE ROLE superman LOGIN NOSUPERUSER PASSWORD 'jungle_man'")
+
+    with ddl.pg.cursor(user="superman", password="jungle_man") as superman_cur:
+        # Only real SUPERUSERs may ALTER neon_superuser; superman is not one, so this fails
+        with pytest.raises(psycopg2.InternalError):
+            superman_cur.execute("ALTER ROLE neon_superuser LOGIN")
+
+    cur.execute("ALTER ROLE neon_superuser LOGIN")

    with pytest.raises(psycopg2.InternalError):
        cur.execute("CREATE DATABASE trololobus WITH OWNER neon_superuser")
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -2,7 +2,7 @@ import enum
 import time
 from collections import Counter
 from dataclasses import dataclass
-from typing import Any, Dict, Tuple
+from typing import Any, Dict, Iterable, Tuple

 import pytest
 import toml
@@ -121,17 +121,7 @@ class EvictionEnv:
        }

    def count_layers_per_tenant(self, pageserver: NeonPageserver) -> Dict[TenantId, int]:
-        ret: Counter[TenantId] = Counter()
-
-        for tenant_id, timeline_id in self.timelines:
-            timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id)
-            assert timeline_dir.exists()
-            for file in timeline_dir.iterdir():
-                if "__" not in file.name:
-                    continue
-                ret[tenant_id] += 1
-
-        return dict(ret)
+        return count_layers_per_tenant(pageserver, self.timelines)

    def warm_up_tenant(self, tenant_id: TenantId):
        """
@@ -199,6 +189,22 @@ class EvictionEnv:
        wait_until(10, 1, statvfs_called)


+def count_layers_per_tenant(
+    pageserver: NeonPageserver, timelines: Iterable[Tuple[TenantId, TimelineId]]
+) -> Dict[TenantId, int]:
+    ret: Counter[TenantId] = Counter()
+
+    for tenant_id, timeline_id in timelines:
+        timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id)
+        assert timeline_dir.exists()
+        for file in timeline_dir.iterdir():
+            if "__" not in file.name:
+                continue
+            ret[tenant_id] += 1
+
+    return dict(ret)
+
+
 def human_bytes(amt: float) -> str:
    suffixes = ["", "Ki", "Mi", "Gi"]

@@ -243,21 +249,7 @@ def _eviction_env(

    timelines = []
    for scale in pgbench_scales:
-        tenant_id, timeline_id = env.neon_cli.create_tenant(
-            conf={
-                "gc_period": "0s",
-                "compaction_period": "0s",
-                "checkpoint_distance": f"{layer_size}",
-                "image_creation_threshold": "100",
-                "compaction_target_size": f"{layer_size}",
-            }
-        )
-
-        with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
-            pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()])
-            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
-
-        timelines.append((tenant_id, timeline_id))
+        timelines.append(pgbench_init_tenant(layer_size, scale, env, pg_bin))

    # stop the safekeepers to avoid on-demand downloads caused by
    # initial logical size calculation triggered by walreceiver connection status
@@ -266,25 +258,13 @@ def _eviction_env(

    # after stopping the safekeepers, we know that no new WAL will be coming in
    for tenant_id, timeline_id in timelines:
-        pageserver_http = env.get_tenant_pageserver(tenant_id).http_client()
-
-        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
-        wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
-        tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id)
-        assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"]
-        assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"]
-        pgbench_init_lsns[tenant_id] = Lsn(tl_info["last_record_lsn"])
-
-        layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
-        log.info(f"{layers}")
-        assert (
-            len(layers.historic_layers) >= 10
-        ), "evictions happen at layer granularity, but we often assert at byte-granularity"
+        pgbench_init_lsns[tenant_id] = finish_tenant_creation(env, tenant_id, timeline_id, 10)

    eviction_env = EvictionEnv(
        timelines=timelines,
        neon_env=env,
-        pageserver_http=pageserver_http,
+        # this last tenant http client works for num_pageservers=1
+        pageserver_http=env.get_tenant_pageserver(timelines[-1][0]).http_client(),
        layer_size=layer_size,
        pg_bin=pg_bin,
        pgbench_init_lsns=pgbench_init_lsns,
@@ -293,6 +273,49 @@ def _eviction_env(
    return eviction_env


+def pgbench_init_tenant(
+    layer_size: int, scale: int, env: NeonEnv, pg_bin: PgBin
+) -> Tuple[TenantId, TimelineId]:
+    tenant_id, timeline_id = env.neon_cli.create_tenant(
+        conf={
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            "checkpoint_distance": f"{layer_size}",
+            "image_creation_threshold": "100",
+            "compaction_target_size": f"{layer_size}",
+        }
+    )
+
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
+        pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()])
+        wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+
+    return (tenant_id, timeline_id)
+
+
+def finish_tenant_creation(
+    env: NeonEnv,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    min_expected_layers: int,
+) -> Lsn:
+    pageserver_http = env.get_tenant_pageserver(tenant_id).http_client()
+    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+    wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
+    tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id)
+    assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"]
+    assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"]
+    pgbench_init_lsn = Lsn(tl_info["last_record_lsn"])
+
+    layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+    # log.info(f"{layers}")
+    assert (
+        len(layers.historic_layers) >= min_expected_layers
+    ), "evictions happen at layer granularity, but we often assert at byte-granularity"
+
+    return pgbench_init_lsn
+
+
@pytest.fixture
 def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv:
    return _eviction_env(request, neon_env_builder, pg_bin, num_pageservers=1)
@@ -598,9 +621,82 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
            assert abs_diff < 0.05


+@pytest.mark.parametrize(
+    "order",
+    [
+        EvictionOrder.ABSOLUTE_ORDER,
+        EvictionOrder.RELATIVE_ORDER_EQUAL,
+        EvictionOrder.RELATIVE_ORDER_SPARE,
+    ],
+)
+def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, order: EvictionOrder):
+    """
+    Create three small tenants first and then a single larger tenant, in that order.
+    Assert that with relative order modes, the disk usage based eviction is
+    more fair towards the smaller tenants.
+    """
+    env = neon_env_builder.init_configs()
+    env.start()
+    env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*")
+
+    # initial_tenant and initial_timeline do not exist
+
+    # create the tenants in the same fashion as EvictionEnv does
+    layer_size = 5 * 1024**2
+    timelines = []
+    for scale in [1, 1, 1, 4]:
+        timelines.append((pgbench_init_tenant(layer_size, scale, env, pg_bin), scale))
+
+    env.neon_cli.safekeeper_stop()
+
+    for (tenant_id, timeline_id), scale in timelines:
+        min_expected_layers = 4 if scale == 1 else 10
+        finish_tenant_creation(env, tenant_id, timeline_id, min_expected_layers)
+
+    tenant_layers = count_layers_per_tenant(env.pageserver, map(lambda x: x[0], timelines))
+    (total_on_disk, _, _) = poor_mans_du(env, map(lambda x: x[0], timelines), env.pageserver, False)
+
+    # ask the pageserver to evict 10 percent of what is currently on disk
+    response = env.pageserver.http_client().disk_usage_eviction_run(
+        {"evict_bytes": total_on_disk // 10, "eviction_order": order.config()}
+    )
+    log.info(f"{response}")
+
+    after_tenant_layers = count_layers_per_tenant(env.pageserver, map(lambda x: x[0], timelines))
+
+    ratios = []
+    for i, ((tenant_id, _timeline_id), _scale) in enumerate(timelines):
+        # we expect the oldest to suffer most; a fully evicted tenant is absent from the counts
+        originally, after = tenant_layers[tenant_id], after_tenant_layers.get(tenant_id, 0)
+        log.info(f"{i + 1}th tenant went from {originally} -> {after}")
+        ratio = after / originally
+        ratios.append(ratio)
+
+    assert (
+        len(ratios) == 4
+    ), "rest of the assertions expect 3 + 1 timelines, ratios, scales, all in order"
+    log.info(f"{ratios}")
+
+    if order == EvictionOrder.ABSOLUTE_ORDER:
+        # first tenant loses most
+        assert ratios[0] <= ratios[1], "first should lose the most"
+        assert ratios[1] < ratios[2], "second should lose some"
+        assert ratios[1] < 1.0
+        assert ratios[2] <= ratios[3], "third might not lose"
+        assert ratios[3] == 1.0, "tenant created last does not lose"
+    elif order == EvictionOrder.RELATIVE_ORDER_EQUAL:
+        assert all(x < 1.0 for x in ratios), "all tenants lose layers"
+    elif order == EvictionOrder.RELATIVE_ORDER_SPARE:
+        # with different layer sizes and pg versions, there are different combinations
+        assert len([x for x in ratios if x < 1.0]) >= 2, "require 2..4 tenants to lose layers"
+        assert ratios[3] < 1.0, "largest tenant always loses layers"
+    else:
+        raise RuntimeError(f"unimplemented {order}")
+
+
 def poor_mans_du(
    env: NeonEnv,
-    timelines: list[Tuple[TenantId, TimelineId]],
+    timelines: Iterable[Tuple[TenantId, TimelineId]],
    pageserver: NeonPageserver,
    verbose: bool = False,
 ) -> Tuple[int, int, int]:
--- a/test_runner/regress/test_migrations.py
+++ b/test_runner/regress/test_migrations.py
@@ -0,0 +1,37 @@
+import time
+
+from fixtures.neon_fixtures import NeonEnv
+
+
+def test_migrations(neon_simple_env: NeonEnv):
+    env = neon_simple_env
+    env.neon_cli.create_branch("test_migrations", "empty")
+
+    endpoint = env.endpoints.create("test_migrations")
+    log_path = endpoint.endpoint_path() / "compute.log"
+
+    endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"])
+    endpoint.start()
+
+    time.sleep(1)  # Sleep to let migrations run
+
+    with endpoint.cursor() as cur:
+        cur.execute("SELECT id FROM neon_migration.migration_id")
+        migration_id = cur.fetchall()
+        assert migration_id[0][0] == 2
+
+    with open(log_path, "r") as log_file:
+        logs = log_file.read()
+        assert "INFO handle_migrations: Ran 2 migrations" in logs
+
+    endpoint.stop()
+    endpoint.start()
+    time.sleep(1)  # Sleep to let migrations run
+    with endpoint.cursor() as cur:
+        cur.execute("SELECT id FROM neon_migration.migration_id")
+        migration_id = cur.fetchall()
+        assert migration_id[0][0] == 2
+
+    with open(log_path, "r") as log_file:
+        logs = log_file.read()
+        assert "INFO handle_migrations: Ran 0 migrations" in logs
--- a/test_runner/regress/test_neon_superuser.py
+++ b/test_runner/regress/test_neon_superuser.py
@@ -0,0 +1,34 @@
+import time
+
+from fixtures.neon_fixtures import NeonEnv
+from fixtures.pg_version import PgVersion
+
+
+def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
+    env = neon_simple_env
+    env.neon_cli.create_branch("test_neon_superuser", "empty")
+    endpoint = env.endpoints.create("test_neon_superuser")
+    endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"])
+    endpoint.start()
+
+    time.sleep(1)  # Sleep to let migrations run
+
+    with endpoint.cursor() as cur:
+        cur.execute(
+            "CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser"
+        )
+        cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers")
+        cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser")
+
+    with endpoint.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur:
+        cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'member')")
+        assert cur.fetchall()[0][0]
+        cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'usage')")
+        assert cur.fetchall()[0][0]
+
+        if pg_version == PgVersion.V16:
+            cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'set')")
+            assert cur.fetchall()[0][0]
+
+        cur.execute("CREATE PUBLICATION pub FOR ALL TABLES")
+        cur.execute("CREATE ROLE definitely_not_a_superuser WITH PASSWORD 'nope'")
--- a/test_runner/regress/test_pageserver_reconnect.py
+++ b/test_runner/regress/test_pageserver_reconnect.py
@@ -0,0 +1,42 @@
+import threading
+import time
+from contextlib import closing
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv, PgBin
+
+
+# Test updating neon.pageserver_connstring setting on the fly.
+#
+# This merely changes some whitespace in the connection string, so
+# this doesn't prove that the new string actually takes effect. But at
+# least the code gets exercised.
+def test_pageserver_reconnect(neon_simple_env: NeonEnv, pg_bin: PgBin):
+    env = neon_simple_env
+    env.neon_cli.create_branch("test_pageserver_restarts")
+    endpoint = env.endpoints.create_start("test_pageserver_restarts")
+    n_reconnects = 1000
+    timeout = 0.01
+    scale = 10
+
+    def run_pgbench(connstr: str):
+        log.info(f"Start a pgbench workload on pg {connstr}")
+        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
+        pg_bin.run_capture(["pgbench", f"-T{int(n_reconnects*timeout)}", connstr])
+
+    thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
+    thread.start()
+
+    with closing(endpoint.connect()) as con:
+        with con.cursor() as c:
+            c.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
+            connstring = c.fetchall()[0][0]
+            for i in range(n_reconnects):
+                time.sleep(timeout)
+                c.execute(
+                    "alter system set neon.pageserver_connstring=%s",
+                    (connstring + (" " * (i % 2)),),
+                )
+                c.execute("select pg_reload_conf()")
+
+    thread.join()
--- a/Show More
+++ b/Show More