Merge pull request #7507 from neondatabase/rc/proxy/2024-04-25

Proxy release 2024-04-25
proxy: Fix cancellations (#7510 )
2026-04-01 22:50:37 +00:00 · 2024-04-25 13:50:05 +02:00 · 2024-04-25 13:42:25 +02:00 · 2024-04-25 13:20:21 +02:00 · 2024-04-24 18:48:25 +02:00 · 2024-04-24 15:09:23 +00:00
77 changed files with 1996 additions and 1864 deletions
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -2,7 +2,6 @@
 # This is only present for local builds, as it will be overridden
 # by the RUSTDOCFLAGS env var in CI.
 rustdocflags = ["-Arustdoc::private_intra_doc_links"]
-rustflags = ["--cfg", "tokio_unstable"]

 [alias]
 build_testing = ["build", "--features", "testing"]
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -214,7 +214,6 @@ jobs:
      BUILD_TYPE: ${{ matrix.build_type }}
      GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
      BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
-      RUSTFLAGS: "--cfg=tokio_unstable"

    steps:
      - name: Fix git ownership
@@ -736,7 +735,7 @@ jobs:
        run: |
          mkdir -p .docker-custom
          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v3
+      - uses: docker/setup-buildx-action@v2

      - uses: docker/login-action@v3
        with:
@@ -793,7 +792,7 @@ jobs:
        run: |
          mkdir -p .docker-custom
          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v3
+      - uses: docker/setup-buildx-action@v2
        with:
          # Disable parallelism for docker buildkit.
          # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
@@ -866,7 +865,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.23.2
+      VM_BUILDER_VERSION: v0.28.1

    steps:
      - name: Checkout
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -599,7 +599,7 @@ dependencies = [
 "once_cell",
 "pin-project-lite",
 "pin-utils",
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "tokio",
 "tracing",
 ]
@@ -1240,43 +1240,6 @@ dependencies = [
 "crossbeam-utils",
 ]

-[[package]]
-name = "console-api"
-version = "0.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fd326812b3fd01da5bb1af7d340d0d555fd3d4b641e7f1dfcf5962a902952787"
-dependencies = [
- "futures-core",
- "prost 0.12.4",
- "prost-types 0.12.4",
- "tonic 0.10.2",
- "tracing-core",
-]
-
-[[package]]
-name = "console-subscriber"
-version = "0.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7481d4c57092cd1c19dd541b92bdce883de840df30aa5d03fd48a3935c01842e"
-dependencies = [
- "console-api",
- "crossbeam-channel",
- "crossbeam-utils",
- "futures-task",
- "hdrhistogram",
- "humantime",
- "prost-types 0.12.4",
- "serde",
- "serde_json",
- "thread_local",
- "tokio",
- "tokio-stream",
- "tonic 0.10.2",
- "tracing",
- "tracing-core",
- "tracing-subscriber",
-]
-
 [[package]]
 name = "const-oid"
 version = "0.9.5"
@@ -2556,7 +2519,7 @@ dependencies = [
 "http 0.2.9",
 "hyper 0.14.26",
 "log",
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "rustls-native-certs 0.6.2",
 "tokio",
 "tokio-rustls 0.24.0",
@@ -3445,11 +3408,11 @@ dependencies = [
 "opentelemetry-semantic-conventions",
 "opentelemetry_api",
 "opentelemetry_sdk",
- "prost 0.11.9",
+ "prost",
 "reqwest",
 "thiserror",
 "tokio",
- "tonic 0.9.2",
+ "tonic",
 ]

 [[package]]
@@ -3460,8 +3423,8 @@ checksum = "b1e3f814aa9f8c905d0ee4bde026afd3b2577a97c10e1699912e3e44f0c4cbeb"
 dependencies = [
 "opentelemetry_api",
 "opentelemetry_sdk",
- "prost 0.11.9",
- "tonic 0.9.2",
+ "prost",
+ "tonic",
 ]

 [[package]]
@@ -3695,6 +3658,7 @@ dependencies = [
 "tokio-util",
 "toml_edit",
 "tracing",
+ "twox-hash",
 "url",
 "utils",
 "walkdir",
@@ -4046,6 +4010,17 @@ dependencies = [
 "tokio-postgres",
 ]

+[[package]]
+name = "postgres-native-tls"
+version = "0.5.0"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
+dependencies = [
+ "native-tls",
+ "tokio",
+ "tokio-native-tls",
+ "tokio-postgres",
+]
+
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
@@ -4085,7 +4060,7 @@ dependencies = [
 "futures",
 "once_cell",
 "pq_proto",
- "rustls 0.22.2",
+ "rustls 0.22.4",
 "rustls-pemfile 2.1.1",
 "serde",
 "thiserror",
@@ -4260,17 +4235,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd"
 dependencies = [
 "bytes",
- "prost-derive 0.11.9",
-]
-
-[[package]]
-name = "prost"
-version = "0.12.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d0f5d036824e4761737860779c906171497f6d55681139d8312388f8fe398922"
-dependencies = [
- "bytes",
- "prost-derive 0.12.4",
+ "prost-derive",
 ]

 [[package]]
@@ -4287,8 +4252,8 @@ dependencies = [
 "multimap",
 "petgraph",
 "prettyplease 0.1.25",
- "prost 0.11.9",
- "prost-types 0.11.9",
+ "prost",
+ "prost-types",
 "regex",
 "syn 1.0.109",
 "tempfile",
@@ -4308,35 +4273,13 @@ dependencies = [
 "syn 1.0.109",
 ]

-[[package]]
-name = "prost-derive"
-version = "0.12.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "19de2de2a00075bf566bee3bd4db014b11587e84184d3f7a791bc17f1a8e9e48"
-dependencies = [
- "anyhow",
- "itertools",
- "proc-macro2",
- "quote",
- "syn 2.0.52",
-]
-
 [[package]]
 name = "prost-types"
 version = "0.11.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "213622a1460818959ac1181aaeb2dc9c7f63df720db7d788b3e24eacd1983e13"
 dependencies = [
- "prost 0.11.9",
-]
-
-[[package]]
-name = "prost-types"
-version = "0.12.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3235c33eb02c1f1e212abdbe34c78b264b038fb58ca612664343271e36e55ffe"
-dependencies = [
- "prost 0.12.4",
+ "prost",
 ]

 [[package]]
@@ -4358,7 +4301,6 @@ dependencies = [
 "camino-tempfile",
 "chrono",
 "clap",
- "console-subscriber",
 "consumption_metrics",
 "dashmap",
 "env_logger",
@@ -4383,6 +4325,7 @@ dependencies = [
 "md5",
 "measured",
 "metrics",
+ "native-tls",
 "once_cell",
 "opentelemetry",
 "parking_lot 0.12.1",
@@ -4390,6 +4333,7 @@ dependencies = [
 "parquet_derive",
 "pbkdf2",
 "pin-project-lite",
+ "postgres-native-tls",
 "postgres-protocol",
 "postgres_backend",
 "pq_proto",
@@ -4407,8 +4351,7 @@ dependencies = [
 "routerify",
 "rstest",
 "rustc-hash",
- "rustls 0.22.2",
- "rustls-native-certs 0.7.0",
+ "rustls 0.22.4",
 "rustls-pemfile 2.1.1",
 "scopeguard",
 "serde",
@@ -4600,7 +4543,7 @@ dependencies = [
 "itoa",
 "percent-encoding",
 "pin-project-lite",
- "rustls 0.22.2",
+ "rustls 0.22.4",
 "rustls-native-certs 0.7.0",
 "rustls-pemfile 2.1.1",
 "rustls-pki-types",
@@ -4754,7 +4697,7 @@ dependencies = [
 "once_cell",
 "percent-encoding",
 "pin-project-lite",
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "rustls-pemfile 1.0.2",
 "serde",
 "serde_json",
@@ -5014,9 +4957,9 @@ dependencies = [

 [[package]]
 name = "rustls"
-version = "0.21.9"
+version = "0.21.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "629648aced5775d558af50b2b4c7b02983a04b312126d45eeead26e7caa498b9"
+checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4"
 dependencies = [
 "log",
 "ring 0.17.6",
@@ -5026,9 +4969,9 @@ dependencies = [

 [[package]]
 name = "rustls"
-version = "0.22.2"
+version = "0.22.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41"
+checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432"
 dependencies = [
 "log",
 "ring 0.17.6",
@@ -5340,7 +5283,7 @@ checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b"
 dependencies = [
 "httpdate",
 "reqwest",
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "sentry-backtrace",
 "sentry-contexts",
 "sentry-core",
@@ -5775,10 +5718,10 @@ dependencies = [
 "metrics",
 "once_cell",
 "parking_lot 0.12.1",
- "prost 0.11.9",
+ "prost",
 "tokio",
 "tokio-stream",
- "tonic 0.9.2",
+ "tonic",
 "tonic-build",
 "tracing",
 "utils",
@@ -5888,8 +5831,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
 [[package]]
 name = "svg_fmt"
 version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f83ba502a3265efb76efb89b0a2f7782ad6f2675015d4ce37e4b547dda42b499"
+source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#b9501105e746629004bc6d0473639320939dbe10"

 [[package]]
 name = "syn"
@@ -6170,7 +6112,6 @@ dependencies = [
 "signal-hook-registry",
 "socket2 0.5.5",
 "tokio-macros",
- "tracing",
 "windows-sys 0.48.0",
 ]

@@ -6252,7 +6193,7 @@ checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677"
 dependencies = [
 "futures",
 "ring 0.17.6",
- "rustls 0.22.2",
+ "rustls 0.22.4",
 "tokio",
 "tokio-postgres",
 "tokio-rustls 0.25.0",
@@ -6265,7 +6206,7 @@ version = "0.24.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5"
 dependencies = [
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "tokio",
 ]

@@ -6275,7 +6216,7 @@ version = "0.25.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f"
 dependencies = [
- "rustls 0.22.2",
+ "rustls 0.22.4",
 "rustls-pki-types",
 "tokio",
 ]
@@ -6401,7 +6342,7 @@ dependencies = [
 "hyper-timeout",
 "percent-encoding",
 "pin-project",
- "prost 0.11.9",
+ "prost",
 "rustls-native-certs 0.6.2",
 "rustls-pemfile 1.0.2",
 "tokio",
@@ -6413,33 +6354,6 @@ dependencies = [
 "tracing",
 ]

-[[package]]
-name = "tonic"
-version = "0.10.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e"
-dependencies = [
- "async-stream",
- "async-trait",
- "axum",
- "base64 0.21.1",
- "bytes",
- "h2 0.3.26",
- "http 0.2.9",
- "http-body 0.4.5",
- "hyper 0.14.26",
- "hyper-timeout",
- "percent-encoding",
- "pin-project",
- "prost 0.12.4",
- "tokio",
- "tokio-stream",
- "tower",
- "tower-layer",
- "tower-service",
- "tracing",
-]
-
 [[package]]
 name = "tonic-build"
 version = "0.9.2"
@@ -6763,7 +6677,7 @@ dependencies = [
 "base64 0.21.1",
 "log",
 "once_cell",
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "rustls-webpki 0.100.2",
 "url",
 "webpki-roots 0.23.1",
@@ -7420,7 +7334,6 @@ dependencies = [
 "futures-util",
 "getrandom 0.2.11",
 "hashbrown 0.14.0",
- "hdrhistogram",
 "hex",
 "hmac",
 "hyper 0.14.26",
@@ -7435,13 +7348,13 @@ dependencies = [
 "num-traits",
 "once_cell",
 "parquet",
- "prost 0.11.9",
+ "prost",
 "rand 0.8.5",
 "regex",
 "regex-automata 0.4.3",
 "regex-syntax 0.8.2",
 "reqwest",
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "scopeguard",
 "serde",
 "serde_json",
@@ -7454,11 +7367,10 @@ dependencies = [
 "time-macros",
 "tokio",
 "tokio-rustls 0.24.0",
- "tokio-stream",
 "tokio-util",
 "toml_datetime",
 "toml_edit",
- "tonic 0.9.2",
+ "tonic",
 "tower",
 "tracing",
 "tracing-core",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -157,7 +157,8 @@ socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
 "subtle"  = "2.5.0"
-svg_fmt = "0.4.1"
+# https://github.com/nical/rust_debug/pull/4
+svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" }
 sync_wrapper = "0.1.2"
 tar = "0.4"
 task-local-extensions = "0.1.4"
--- a/2
+++ b/2
@@ -47,7 +47,7 @@ COPY --chown=nonroot . .
 # Show build caching stats to check if it was used in the end.
 # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
 RUN set -e \
-    && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment --cfg=tokio_unstable" cargo build  \
+    && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build  \
      --bin pg_sni_router  \
      --bin pageserver  \
      --bin pagectl  \
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -256,7 +256,16 @@ fn update_rusage_metrics() {
    DISK_IO_BYTES
        .with_label_values(&["write"])
        .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
-    MAXRSS_KB.set(rusage_stats.ru_maxrss);
+
+    // On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669
+    #[cfg(target_os = "macos")]
+    {
+        MAXRSS_KB.set(rusage_stats.ru_maxrss / 1024);
+    }
+    #[cfg(not(target_os = "macos"))]
+    {
+        MAXRSS_KB.set(rusage_stats.ru_maxrss);
+    }
 }

 fn get_rusage_stats() -> libc::rusage {
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -1,8 +1,10 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
+use bytes::BufMut;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
+use std::ops::RangeInclusive;
 use std::{fmt, ops::Range};

 use crate::reltag::{BlockNumber, RelTag, SlruKind};
@@ -21,9 +23,81 @@ pub struct Key {
    pub field6: u32,
 }

+/// The storage key size.
 pub const KEY_SIZE: usize = 18;

+/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized.
+/// See [`Key::to_i128`] for more information on the encoding.
+pub const METADATA_KEY_SIZE: usize = 16;
+
+/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x80 is a metadata key.
+pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x80;
+
+/// The (reserved) key prefix of relation sizes.
+pub const RELATION_SIZE_PREFIX: u8 = 0x81;
+
+/// The key prefix of AUX file keys.
+pub const AUX_KEY_PREFIX: u8 = 0x82;
+
+/// Check if the key falls in the range of metadata keys.
+pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
+    key[0] >= METADATA_KEY_BEGIN_PREFIX
+}
+
 impl Key {
+    /// Check if the key falls in the range of metadata keys.
+    pub const fn is_metadata_key(&self) -> bool {
+        self.field1 >= METADATA_KEY_BEGIN_PREFIX
+    }
+
+    /// Encode a metadata key to a storage key.
+    pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
+        assert!(is_metadata_key_slice(key), "key not in metadata key range");
+        Key {
+            field1: key[0],
+            field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
+            field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
+            field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
+            field5: key[11],
+            field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
+        }
+    }
+
+    /// Encode a metadata key to a storage key.
+    pub fn from_metadata_key(key: &[u8]) -> Self {
+        Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
+    }
+
+    /// Extract a metadata key to a writer. The result should always be 16 bytes.
+    pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
+        writer.put_u8(self.field1);
+        assert!(self.field2 <= 0xFFFF);
+        writer.put_u16(self.field2 as u16);
+        writer.put_u32(self.field3);
+        writer.put_u32(self.field4);
+        writer.put_u8(self.field5);
+        writer.put_u32(self.field6);
+    }
+
+    /// Get the range of metadata keys.
+    pub fn metadata_key_range() -> RangeInclusive<Self> {
+        Key {
+            field1: METADATA_KEY_BEGIN_PREFIX,
+            field2: 0,
+            field3: 0,
+            field4: 0,
+            field5: 0,
+            field6: 0,
+        }..=Key {
+            field1: u8::MAX,
+            field2: u16::MAX as u32,
+            field3: u32::MAX,
+            field4: u32::MAX,
+            field5: u8::MAX,
+            field6: u32::MAX,
+        }
+    }
+
    /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
    /// As long as Neon does not support tablespace (because of lack of access to local file system),
    /// we can assume that only some predefined namespace OIDs are used which can fit in u16
@@ -48,11 +122,11 @@ impl Key {
        }
    }

-    pub fn next(&self) -> Key {
+    pub const fn next(&self) -> Key {
        self.add(1)
    }

-    pub fn add(&self, x: u32) -> Key {
+    pub const fn add(&self, x: u32) -> Key {
        let mut key = *self;

        let r = key.field6.overflowing_add(x);
@@ -81,6 +155,8 @@ impl Key {
        key
    }

+    /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
+    /// Use [`Key::from_metadata_key`] instead.
    pub fn from_slice(b: &[u8]) -> Self {
        Key {
            field1: b[0],
@@ -92,6 +168,8 @@ impl Key {
        }
    }

+    /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
+    /// Use [`Key::extract_metadata_key_to_writer`] instead.
    pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
        buf[0] = self.field1;
        BE::write_u32(&mut buf[1..5], self.field2);
@@ -475,12 +553,14 @@ pub const AUX_FILES_KEY: Key = Key {
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.

+pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
+
 // AUX_FILES currently stores only data for logical replication (slots etc), and
 // we don't preserve these on a branch because safekeepers can't follow timeline
 // switch (and generally it likely should be optional), so ignore these.
 #[inline(always)]
 pub fn is_inherited_key(key: Key) -> bool {
-    key != AUX_FILES_KEY
+    !NON_INHERITED_RANGE.contains(&key)
 }

 #[inline(always)]
@@ -556,11 +636,14 @@ impl std::str::FromStr for Key {
 mod tests {
    use std::str::FromStr;

+    use crate::key::is_metadata_key_slice;
    use crate::key::Key;

    use rand::Rng;
    use rand::SeedableRng;

+    use super::AUX_KEY_PREFIX;
+
    #[test]
    fn display_fromstr_bijection() {
        let mut rng = rand::rngs::StdRng::seed_from_u64(42);
@@ -576,4 +659,16 @@ mod tests {

        assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
    }
+
+    #[test]
+    fn test_metadata_keys() {
+        let mut metadata_key = vec![AUX_KEY_PREFIX];
+        metadata_key.extend_from_slice(&[0xFF; 15]);
+        let encoded_key = Key::from_metadata_key(&metadata_key);
+        let mut output_key = Vec::new();
+        encoded_key.extract_metadata_key_to_writer(&mut output_key);
+        assert_eq!(metadata_key, output_key);
+        assert!(encoded_key.is_metadata_key());
+        assert!(is_metadata_key_slice(&metadata_key));
+    }
 }
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -94,12 +94,13 @@ impl KeySpace {

    /// Remove all keys in `other` from `self`.
    /// This can involve splitting or removing of existing ranges.
-    pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
+    /// Returns the removed keyspace
+    pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace {
        let (self_start, self_end) = match (self.start(), self.end()) {
            (Some(start), Some(end)) => (start, end),
            _ => {
                // self is empty
-                return;
+                return KeySpace::default();
            }
        };

@@ -112,30 +113,37 @@ impl KeySpace {
            .skip_while(|range| self_start >= range.end)
            .take_while(|range| self_end > range.start);

+        let mut removed_accum = KeySpaceRandomAccum::new();
        for range in other_ranges {
            while let Some(overlap_at) = self.overlaps_at(range) {
                let overlapped = self.ranges[overlap_at].clone();

                if overlapped.start < range.start && overlapped.end <= range.end {
                    // Higher part of the range is completely overlapped.
+                    removed_accum.add_range(range.start..self.ranges[overlap_at].end);
                    self.ranges[overlap_at].end = range.start;
                }
                if overlapped.start >= range.start && overlapped.end > range.end {
                    // Lower part of the range is completely overlapped.
+                    removed_accum.add_range(self.ranges[overlap_at].start..range.end);
                    self.ranges[overlap_at].start = range.end;
                }
                if overlapped.start < range.start && overlapped.end > range.end {
                    // Middle part of the range is overlapped.
+                    removed_accum.add_range(range.clone());
                    self.ranges[overlap_at].end = range.start;
                    self.ranges
                        .insert(overlap_at + 1, range.end..overlapped.end);
                }
                if overlapped.start >= range.start && overlapped.end <= range.end {
                    // Whole range is overlapped
+                    removed_accum.add_range(self.ranges[overlap_at].clone());
                    self.ranges.remove(overlap_at);
                }
            }
        }
+
+        removed_accum.to_keyspace()
    }

    pub fn start(&self) -> Option<Key> {
@@ -154,6 +162,10 @@ impl KeySpace {
            .sum()
    }

+    pub fn is_empty(&self) -> bool {
+        self.total_size() == 0
+    }
+
    fn overlaps_at(&self, range: &Range<Key>) -> Option<usize> {
        match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
            Ok(0) => None,
@@ -553,7 +565,16 @@ mod tests {
                Key::from_i128(11)..Key::from_i128(13),
            ],
        };
-        key_space1.remove_overlapping_with(&key_space2);
+        let removed = key_space1.remove_overlapping_with(&key_space2);
+        let removed_expected = KeySpace {
+            ranges: vec![
+                Key::from_i128(2)..Key::from_i128(3),
+                Key::from_i128(6)..Key::from_i128(7),
+                Key::from_i128(11)..Key::from_i128(12),
+            ],
+        };
+        assert_eq!(removed, removed_expected);
+
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -583,7 +604,17 @@ mod tests {
                Key::from_i128(14)..Key::from_i128(17),
            ],
        };
-        key_space1.remove_overlapping_with(&key_space2);
+
+        let removed = key_space1.remove_overlapping_with(&key_space2);
+        let removed_expected = KeySpace {
+            ranges: vec![
+                Key::from_i128(3)..Key::from_i128(5),
+                Key::from_i128(8)..Key::from_i128(10),
+                Key::from_i128(14)..Key::from_i128(15),
+            ],
+        };
+        assert_eq!(removed, removed_expected);
+
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -610,7 +641,11 @@ mod tests {
                Key::from_i128(15)..Key::from_i128(17),
            ],
        };
-        key_space1.remove_overlapping_with(&key_space2);
+
+        let removed = key_space1.remove_overlapping_with(&key_space2);
+        let removed_expected = KeySpace::default();
+        assert_eq!(removed, removed_expected);
+
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -637,7 +672,17 @@ mod tests {
        let key_space2 = KeySpace {
            ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
        };
-        key_space1.remove_overlapping_with(&key_space2);
+
+        let removed = key_space1.remove_overlapping_with(&key_space2);
+        let removed_expected = KeySpace {
+            ranges: vec![
+                Key::from_i128(9)..Key::from_i128(10),
+                Key::from_i128(12)..Key::from_i128(15),
+                Key::from_i128(17)..Key::from_i128(19),
+            ],
+        };
+        assert_eq!(removed, removed_expected);
+
        assert_eq!(
            key_space1.ranges,
            vec![
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -429,6 +429,7 @@ pub struct StatusResponse {
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
+    #[serde(skip_serializing_if = "Option::is_none")]
    pub tenant_id: Option<TenantShardId>,
    #[serde(flatten)]
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -5,6 +5,7 @@ use crate::{
    models::ShardParameters,
 };
 use hex::FromHex;
+use postgres_ffi::relfile_utils::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};
 use utils::id::TenantId;

@@ -537,6 +538,24 @@ impl ShardIdentity {
        }
    }

+    /// Special case for issue `<https://github.com/neondatabase/neon/issues/7451>`
+    ///
+    /// When we fail to read a forknum block, this function tells us whether we may ignore the error
+    /// as a symptom of that issue.
+    pub fn is_key_buggy_forknum(&self, key: &Key) -> bool {
+        if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM {
+            return false;
+        }
+
+        let mut hash = murmurhash32(key.field4);
+        hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0));
+        let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8);
+
+        // The key may be affected by issue #7454: it is an initfork and it would not
+        // have mapped to shard 0 until we fixed that issue.
+        mapped_shard != ShardNumber(0)
+    }
+
    /// Return true if the key should be discarded if found in this shard's
    /// data store, e.g. during compaction after a split.
    ///
@@ -649,7 +668,13 @@ fn key_is_shard0(key: &Key) -> bool {
    // relation pages are distributed to shards other than shard zero. Everything else gets
    // stored on shard 0.  This guarantees that shard 0 can independently serve basebackup
    // requests, and any request other than those for particular blocks in relations.
-    !is_rel_block_key(key)
+    //
+    // The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table
+    // type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0
+    // because they must be included in basebackups.
+    let is_initfork = key.field5 == INIT_FORKNUM;
+
+    !is_rel_block_key(key) || is_initfork
 }

 /// Provide the same result as the function in postgres `hashfn.h` with the same name
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -118,7 +118,9 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo};
 // Likewise for these, although the assumption that these don't change is a little more iffy.
 pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
 pub use v14::bindings::{PageHeaderData, XLogRecord};
-pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
+pub use v14::xlog_utils::{
+    XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
+};

 pub use v14::bindings::{CheckPoint, ControlFileData};

--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -4,7 +4,9 @@ use log::*;
 use postgres::types::PgLsn;
 use postgres::Client;
 use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
-use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
+use postgres_ffi::{
+    XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
+};
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::{Duration, Instant};
@@ -262,11 +264,21 @@ fn craft_internal<C: postgres::GenericClient>(
        intermediate_lsns.insert(0, initial_lsn);
    }

-    // Some records may be not flushed, e.g. non-transactional logical messages.
+    // Some records may be not flushed, e.g. non-transactional logical messages. Flush now.
    //
-    // Note: this is broken if pg_current_wal_insert_lsn is at page boundary
-    // because pg_current_wal_insert_lsn skips page headers.
-    client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
+    // If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn
+    // returns the position just after the page header on the next page. That's where the next
+    // record will be inserted. But the page header hasn't actually been written to the WAL
+    // yet, and if you try to flush it, you get a "request to flush past end of generated WAL"
+    // error. Because of that, if the insert location is just after a page header, back off to
+    // previous page boundary.
+    let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?);
+    if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 {
+        lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
+    } else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 {
+        lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
+    }
+    client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?;
    Ok(intermediate_lsns)
 }

@@ -320,38 +332,49 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {

        client.execute("CREATE table t(x int)", &[])?;

-        // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.
-        // We will use logical message as the padding. We start with detecting how much WAL
-        // it takes for one logical message, considering all alignments and headers.
-        let base_wal_advance = {
+        // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.  We
+        // will use carefully-sized logical messages to advance WAL insert location such
+        // that there is just enough space on the page for the XLOG_SWITCH record.
+        loop {
+            // We start with measuring how much WAL it takes for one logical message,
+            // considering all alignments and headers.
            let before_lsn = client.pg_current_wal_insert_lsn()?;
-            // Small non-empty message bigger than few bytes is more likely than an empty
-            // message to have the same format as the big padding message.
            client.execute(
                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
                &[],
            )?;
-            // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD.
-            (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize
-                + XLOG_SIZE_OF_XLOG_RECORD
-        };
-        let mut remaining_lsn =
-            XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ;
-        if remaining_lsn < base_wal_advance {
-            remaining_lsn += XLOG_BLCKSZ;
+            let after_lsn = client.pg_current_wal_insert_lsn()?;
+
+            // Did the record cross a page boundary? If it did, start over. Crossing a
+            // page boundary adds to the apparent size of the record because of the page
+            // header, which throws off the calculation.
+            if u64::from(before_lsn) / XLOG_BLCKSZ as u64
+                != u64::from(after_lsn) / XLOG_BLCKSZ as u64
+            {
+                continue;
+            }
+            // base_size is the size of a logical message without the payload
+            let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10;
+
+            // Is there enough space on the page for another logical message and an
+            // XLOG_SWITCH? If not, start over.
+            let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
+            if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 {
+                continue;
+            }
+
+            // We will write another logical message, such that after the logical message
+            // record, there will be space for exactly one XLOG_SWITCH. How large should
+            // the logical message's payload be? An XLOG_SWITCH record has no data => its
+            // size is exactly XLOG_SIZE_OF_XLOG_RECORD.
+            let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64;
+
+            client.execute(
+                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
+                &[&(repeats as i32)],
+            )?;
+            break;
        }
-        let repeats = 10 + remaining_lsn - base_wal_advance;
-        info!(
-            "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}",
-            client.pg_current_wal_insert_lsn()?,
-            remaining_lsn,
-            base_wal_advance,
-            repeats
-        );
-        client.execute(
-            "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
-            &[&(repeats as i32)],
-        )?;
        info!(
            "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
            client.pg_current_wal_insert_lsn()?,
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -21,11 +21,13 @@ use std::{
    fmt::Debug,
    num::{NonZeroU32, NonZeroUsize},
    pin::Pin,
+    str::FromStr,
    sync::Arc,
    time::{Duration, SystemTime},
 };

 use anyhow::{bail, Context};
+use aws_sdk_s3::types::StorageClass;
 use camino::{Utf8Path, Utf8PathBuf};

 use bytes::Bytes;
@@ -134,6 +136,11 @@ impl RemotePath {
    pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
        self.0.strip_prefix(&p.0)
    }
+
+    pub fn add_trailing_slash(&self) -> Self {
+        // Unwrap safety inputs are guararnteed to be valid UTF-8
+        Self(format!("{}/", self.0).try_into().unwrap())
+    }
 }

 /// We don't need callers to be able to pass arbitrary delimiters: just control
@@ -157,47 +164,21 @@ pub struct Listing {
 /// providing basic CRUD operations for storage files.
 #[allow(async_fn_in_trait)]
 pub trait RemoteStorage: Send + Sync + 'static {
-    /// Lists all top level subdirectories for a given prefix
-    /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
-    /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
-    /// so this method doesnt need to.
-    async fn list_prefixes(
-        &self,
-        prefix: Option<&RemotePath>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        let result = self
-            .list(prefix, ListingMode::WithDelimiter, None, cancel)
-            .await?
-            .prefixes;
-        Ok(result)
-    }
-    /// Lists all files in directory "recursively"
-    /// (not really recursively, because AWS has a flat namespace)
-    /// Note: This is subtely different than list_prefixes,
-    /// because it is for listing files instead of listing
-    /// names sharing common prefixes.
-    /// For example,
-    /// list_files("foo/bar") = ["foo/bar/cat123.txt",
-    /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
-    /// whereas,
-    /// list_prefixes("foo/bar/") = ["cat", "dog"]
-    /// See `test_real_s3.rs` for more details.
+    /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
+    /// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
+    ///
+    /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
+    /// from the absolute root of the bucket.
+    ///
+    /// `mode` configures whether to use a delimiter.  Without a delimiter all keys
+    /// within the prefix are listed in the `keys` of the result.  With a delimiter, any "directories" at the top level of
+    /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
+    /// returned in `keys` ().
+    ///
+    /// `max_keys` controls the maximum number of keys that will be returned.  If this is None, this function
+    /// will iteratively call listobjects until it runs out of keys.  Note that this is not safe to use on
+    /// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure.
    ///
-    /// max_keys limits max number of keys returned; None means unlimited.
-    async fn list_files(
-        &self,
-        prefix: Option<&RemotePath>,
-        max_keys: Option<NonZeroU32>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        let result = self
-            .list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
-            .await?
-            .keys;
-        Ok(result)
-    }
-
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
@@ -336,41 +317,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        }
    }

-    // A function for listing all the files in a "directory"
-    // Example:
-    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
-    //
-    // max_keys limits max number of keys returned; None means unlimited.
-    pub async fn list_files(
-        &self,
-        folder: Option<&RemotePath>,
-        max_keys: Option<NonZeroU32>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        match self {
-            Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
-            Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
-            Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
-            Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
-        }
-    }
-
-    // lists common *prefixes*, if any of files
-    // Example:
-    // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
-    pub async fn list_prefixes(
-        &self,
-        prefix: Option<&RemotePath>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        match self {
-            Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
-            Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
-            Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
-            Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
-        }
-    }
-
    /// See [`RemoteStorage::upload`]
    pub async fn upload(
        &self,
@@ -619,6 +565,7 @@ pub struct S3Config {
    /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
    pub concurrency_limit: NonZeroUsize,
    pub max_keys_per_list_response: Option<i32>,
+    pub upload_storage_class: Option<StorageClass>,
 }

 impl Debug for S3Config {
@@ -747,6 +694,18 @@ impl RemoteStorageConfig {
                    endpoint,
                    concurrency_limit,
                    max_keys_per_list_response,
+                    upload_storage_class: toml
+                        .get("upload_storage_class")
+                        .map(|prefix_in_bucket| -> anyhow::Result<_> {
+                            let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?;
+                            let storage_class = StorageClass::from_str(&s).expect("infallible");
+                            #[allow(deprecated)]
+                            if matches!(storage_class, StorageClass::Unknown(_)) {
+                                bail!("Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", StorageClass::values());
+                            }
+                            Ok(storage_class)
+                        })
+                        .transpose()?,
                })
            }
            (_, _, _, Some(_), None) => {
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -5,11 +5,9 @@
 //! volume is mounted to the local FS.

 use std::{
-    borrow::Cow,
-    future::Future,
+    collections::HashSet,
    io::ErrorKind,
    num::NonZeroU32,
-    pin::Pin,
    time::{Duration, SystemTime, UNIX_EPOCH},
 };

@@ -22,11 +20,11 @@ use tokio::{
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
 use tokio_util::{io::ReaderStream, sync::CancellationToken};
-use tracing::*;
-use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
+use utils::crashsafe::path_with_suffix_extension;

 use crate::{
    Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
+    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 use super::{RemoteStorage, StorageMetadata};
@@ -93,7 +91,47 @@ impl LocalFs {

    #[cfg(test)]
    async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
-        Ok(get_all_files(&self.storage_root, true)
+        use std::{future::Future, pin::Pin};
+        fn get_all_files<'a, P>(
+            directory_path: P,
+        ) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
+        where
+            P: AsRef<Utf8Path> + Send + Sync + 'a,
+        {
+            Box::pin(async move {
+                let directory_path = directory_path.as_ref();
+                if directory_path.exists() {
+                    if directory_path.is_dir() {
+                        let mut paths = Vec::new();
+                        let mut dir_contents = fs::read_dir(directory_path).await?;
+                        while let Some(dir_entry) = dir_contents.next_entry().await? {
+                            let file_type = dir_entry.file_type().await?;
+                            let entry_path =
+                                Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
+                                    anyhow::Error::msg(format!(
+                                        "non-Unicode path: {}",
+                                        pb.to_string_lossy()
+                                    ))
+                                })?;
+                            if file_type.is_symlink() {
+                                tracing::debug!("{entry_path:?} is a symlink, skipping")
+                            } else if file_type.is_dir() {
+                                paths.extend(get_all_files(&entry_path).await?.into_iter())
+                            } else {
+                                paths.push(entry_path);
+                            }
+                        }
+                        Ok(paths)
+                    } else {
+                        bail!("Path {directory_path:?} is not a directory")
+                    }
+                } else {
+                    Ok(Vec::new())
+                }
+            })
+        }
+
+        Ok(get_all_files(&self.storage_root)
            .await?
            .into_iter()
            .map(|path| {
@@ -120,6 +158,14 @@ impl LocalFs {
        // S3 object list prefixes can be arbitrary strings, but when reading
        // the local filesystem we need a directory to start calling read_dir on.
        let mut initial_dir = full_path.clone();
+
+        // If there's no trailing slash, we have to start looking from one above: even if
+        // `initial_dir` is a directory, we should still list any prefixes in the parent
+        // that start with the same string.
+        if !full_path.to_string().ends_with('/') {
+            initial_dir.pop();
+        }
+
        loop {
            // Did we make it to the root?
            if initial_dir.parent().is_none() {
@@ -295,61 +341,66 @@ impl RemoteStorage for LocalFs {
        let op = async {
            let mut result = Listing::default();

-            if let ListingMode::NoDelimiter = mode {
-                let keys = self
-                    .list_recursive(prefix)
-                    .await
-                    .map_err(DownloadError::Other)?;
-
-                result.keys = keys
-                    .into_iter()
-                    .filter(|k| {
-                        let path = k.with_base(&self.storage_root);
-                        !path.is_dir()
-                    })
-                    .collect();
-
-                if let Some(max_keys) = max_keys {
-                    result.keys.truncate(max_keys.get() as usize);
-                }
-
-                return Ok(result);
-            }
-
-            let path = match prefix {
-                Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
-                None => Cow::Borrowed(&self.storage_root),
-            };
-
-            let prefixes_to_filter = get_all_files(path.as_ref(), false)
+            // Filter out directories: in S3 directories don't exist, only the keys within them do.
+            let keys = self
+                .list_recursive(prefix)
                .await
                .map_err(DownloadError::Other)?;
+            let keys = keys
+                .into_iter()
+                .filter(|k| {
+                    let path = k.with_base(&self.storage_root);
+                    !path.is_dir()
+                })
+                .collect();

-            // filter out empty directories to mirror s3 behavior.
-            for prefix in prefixes_to_filter {
-                if prefix.is_dir()
-                    && is_directory_empty(&prefix)
-                        .await
-                        .map_err(DownloadError::Other)?
-                {
-                    continue;
-                }
-
-                let stripped = prefix
-                    .strip_prefix(&self.storage_root)
-                    .context("Failed to strip prefix")
-                    .and_then(RemotePath::new)
-                    .expect(
-                        "We list files for storage root, hence should be able to remote the prefix",
-                    );
-
-                if prefix.is_dir() {
-                    result.prefixes.push(stripped);
-                } else {
-                    result.keys.push(stripped);
+            if let ListingMode::NoDelimiter = mode {
+                result.keys = keys;
+            } else {
+                let mut prefixes = HashSet::new();
+                for key in keys {
+                    // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`.
+                    let relative_key = if let Some(prefix) = prefix {
+                        let mut prefix = prefix.clone();
+                        // We only strip the dirname of the prefix, so that when we strip it from the start of keys we
+                        // end up with full file/dir names.
+                        let prefix_full_local_path = prefix.with_base(&self.storage_root);
+                        let has_slash = prefix.0.to_string().ends_with('/');
+                        let strip_prefix = if prefix_full_local_path.is_dir() && has_slash {
+                            prefix
+                        } else {
+                            prefix.0.pop();
+                            prefix
+                        };
+
+                        RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap()
+                    } else {
+                        key
+                    };
+
+                    let relative_key = format!("{}", relative_key);
+                    if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                        let first_part = relative_key
+                            .split(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                            .next()
+                            .unwrap()
+                            .to_owned();
+                        prefixes.insert(first_part);
+                    } else {
+                        result
+                            .keys
+                            .push(RemotePath::from_string(&relative_key).unwrap());
+                    }
                }
+                result.prefixes = prefixes
+                    .into_iter()
+                    .map(|s| RemotePath::from_string(&s).unwrap())
+                    .collect();
            }

+            if let Some(max_keys) = max_keys {
+                result.keys.truncate(max_keys.get() as usize);
+            }
            Ok(result)
        };

@@ -560,50 +611,6 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
    path_with_suffix_extension(original_path, "metadata")
 }

-fn get_all_files<'a, P>(
-    directory_path: P,
-    recursive: bool,
-) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
-where
-    P: AsRef<Utf8Path> + Send + Sync + 'a,
-{
-    Box::pin(async move {
-        let directory_path = directory_path.as_ref();
-        if directory_path.exists() {
-            if directory_path.is_dir() {
-                let mut paths = Vec::new();
-                let mut dir_contents = fs::read_dir(directory_path).await?;
-                while let Some(dir_entry) = dir_contents.next_entry().await? {
-                    let file_type = dir_entry.file_type().await?;
-                    let entry_path =
-                        Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
-                            anyhow::Error::msg(format!(
-                                "non-Unicode path: {}",
-                                pb.to_string_lossy()
-                            ))
-                        })?;
-                    if file_type.is_symlink() {
-                        debug!("{entry_path:?} is a symlink, skipping")
-                    } else if file_type.is_dir() {
-                        if recursive {
-                            paths.extend(get_all_files(&entry_path, true).await?.into_iter())
-                        } else {
-                            paths.push(entry_path)
-                        }
-                    } else {
-                        paths.push(entry_path);
-                    }
-                }
-                Ok(paths)
-            } else {
-                bail!("Path {directory_path:?} is not a directory")
-            }
-        } else {
-            Ok(Vec::new())
-        }
-    })
-}
-
 async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
    let target_dir = match target_file_path.parent() {
        Some(parent_dir) => parent_dir,
@@ -923,13 +930,18 @@ mod fs_tests {
        // No delimiter: should recursively list everything
        let (storage, cancel) = create_storage()?;
        let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
+        let child_sibling =
+            upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?;
        let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;

        let listing = storage
            .list(None, ListingMode::NoDelimiter, None, &cancel)
            .await?;
        assert!(listing.prefixes.is_empty());
-        assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
+        assert_eq!(
+            listing.keys.into_iter().collect::<HashSet<_>>(),
+            HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()])
+        );

        // Delimiter: should only go one deep
        let listing = storage
@@ -942,7 +954,25 @@ mod fs_tests {
        );
        assert!(listing.keys.is_empty());

-        // Delimiter & prefix
+        // Delimiter & prefix with a trailing slash
+        let listing = storage
+            .list(
+                Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()),
+                ListingMode::WithDelimiter,
+                None,
+                &cancel,
+            )
+            .await?;
+        assert_eq!(
+            listing.keys,
+            [RemotePath::from_string("uncle").unwrap()].to_vec()
+        );
+        assert_eq!(
+            listing.prefixes,
+            [RemotePath::from_string("parent").unwrap()].to_vec()
+        );
+
+        // Delimiter and prefix without a trailing slash
        let listing = storage
            .list(
                Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
@@ -951,12 +981,66 @@ mod fs_tests {
                &cancel,
            )
            .await?;
+        assert_eq!(listing.keys, [].to_vec());
        assert_eq!(
            listing.prefixes,
-            [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
-                .to_vec()
+            [RemotePath::from_string("grandparent").unwrap()].to_vec()
+        );
+
+        // Delimiter and prefix that's partway through a path component
+        let listing = storage
+            .list(
+                Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()),
+                ListingMode::WithDelimiter,
+                None,
+                &cancel,
+            )
+            .await?;
+        assert_eq!(listing.keys, [].to_vec());
+        assert_eq!(
+            listing.prefixes,
+            [RemotePath::from_string("grandparent").unwrap()].to_vec()
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn list_part_component() -> anyhow::Result<()> {
+        // No delimiter: should recursively list everything
+        let (storage, cancel) = create_storage()?;
+
+        // Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing
+        // of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as
+        // a freeform prefix.
+        let _child_a =
+            upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?;
+        let _child_b =
+            upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?;
+
+        // Delimiter and prefix that's partway through a path component
+        let listing = storage
+            .list(
+                Some(
+                    &RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(),
+                ),
+                ListingMode::WithDelimiter,
+                None,
+                &cancel,
+            )
+            .await?;
+        assert_eq!(listing.keys, [].to_vec());
+
+        let mut found_prefixes = listing.prefixes.clone();
+        found_prefixes.sort();
+        assert_eq!(
+            found_prefixes,
+            [
+                RemotePath::from_string("tenant").unwrap(),
+                RemotePath::from_string("tenant-01").unwrap(),
+            ]
+            .to_vec()
        );
-        assert_eq!(listing.keys, [uncle.clone()].to_vec());

        Ok(())
    }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -30,7 +30,7 @@ use aws_sdk_s3::{
    config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
    error::SdkError,
    operation::get_object::GetObjectError,
-    types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
+    types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
    Client,
 };
 use aws_smithy_async::rt::sleep::TokioSleep;
@@ -62,6 +62,7 @@ pub struct S3Bucket {
    bucket_name: String,
    prefix_in_bucket: Option<String>,
    max_keys_per_list_response: Option<i32>,
+    upload_storage_class: Option<StorageClass>,
    concurrency_limiter: ConcurrencyLimiter,
    // Per-request timeout. Accessible for tests.
    pub timeout: Duration,
@@ -154,6 +155,7 @@ impl S3Bucket {
            max_keys_per_list_response: aws_config.max_keys_per_list_response,
            prefix_in_bucket,
            concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
+            upload_storage_class: aws_config.upload_storage_class.clone(),
            timeout,
        })
    }
@@ -178,10 +180,7 @@ impl S3Bucket {

    pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
-        let path_string = path
-            .get_path()
-            .as_str()
-            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
+        let path_string = path.get_path().as_str();
        match &self.prefix_in_bucket {
            Some(prefix) => prefix.clone() + "/" + path_string,
            None => path_string.to_string(),
@@ -471,16 +470,11 @@ impl RemoteStorage for S3Bucket {
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
            .map(|p| self.relative_path_to_s3_object(p))
-            .or_else(|| self.prefix_in_bucket.clone())
-            .map(|mut p| {
-                // required to end with a separator
-                // otherwise request will return only the entry of a prefix
-                if matches!(mode, ListingMode::WithDelimiter)
-                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
-                {
-                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
-                }
-                p
+            .or_else(|| {
+                self.prefix_in_bucket.clone().map(|mut s| {
+                    s.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+                    s
+                })
            });

        let _permit = self.permit(kind, cancel).await?;
@@ -549,11 +543,15 @@ impl RemoteStorage for S3Bucket {
                }
            }

-            result.prefixes.extend(
-                prefixes
-                    .iter()
-                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
-            );
+            // S3 gives us prefixes like "foo/", we return them like "foo"
+            result.prefixes.extend(prefixes.iter().filter_map(|o| {
+                Some(
+                    self.s3_object_to_relative_path(
+                        o.prefix()?
+                            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
+                    ),
+                )
+            }));

            continuation_token = match response.next_continuation_token {
                Some(new_token) => Some(new_token),
@@ -586,6 +584,7 @@ impl RemoteStorage for S3Bucket {
            .bucket(self.bucket_name.clone())
            .key(self.relative_path_to_s3_object(to))
            .set_metadata(metadata.map(|m| m.0))
+            .set_storage_class(self.upload_storage_class.clone())
            .content_length(from_size_bytes.try_into()?)
            .body(bytes_stream)
            .send();
@@ -637,6 +636,7 @@ impl RemoteStorage for S3Bucket {
            .copy_object()
            .bucket(self.bucket_name.clone())
            .key(self.relative_path_to_s3_object(to))
+            .set_storage_class(self.upload_storage_class.clone())
            .copy_source(copy_source)
            .send();

@@ -894,6 +894,7 @@ impl RemoteStorage for S3Bucket {
                                    .copy_object()
                                    .bucket(self.bucket_name.clone())
                                    .key(key)
+                                    .set_storage_class(self.upload_storage_class.clone())
                                    .copy_source(&source_id)
                                    .send();

@@ -1050,22 +1051,22 @@ mod tests {
            Some("/test/prefix/"),
        ];
        let expected_outputs = [
-            vec!["", "some/path", "some/path"],
-            vec!["/", "/some/path", "/some/path"],
+            vec!["", "some/path", "some/path/"],
+            vec!["/", "/some/path", "/some/path/"],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path",
+                "test/prefix/some/path/",
            ],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path",
+                "test/prefix/some/path/",
            ],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path",
+                "test/prefix/some/path/",
            ],
        ];

@@ -1077,6 +1078,7 @@ mod tests {
                endpoint: None,
                concurrency_limit: NonZeroUsize::new(100).unwrap(),
                max_keys_per_list_response: Some(5),
+                upload_storage_class: None,
            };
            let storage =
                S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -107,27 +107,6 @@ impl UnreliableWrapper {
 type VoidStorage = crate::LocalFs;

 impl RemoteStorage for UnreliableWrapper {
-    async fn list_prefixes(
-        &self,
-        prefix: Option<&RemotePath>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
-            .map_err(DownloadError::Other)?;
-        self.inner.list_prefixes(prefix, cancel).await
-    }
-
-    async fn list_files(
-        &self,
-        folder: Option<&RemotePath>,
-        max_keys: Option<NonZeroU32>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
-            .map_err(DownloadError::Other)?;
-        self.inner.list_files(folder, max_keys, cancel).await
-    }
-
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -1,5 +1,6 @@
 use anyhow::Context;
 use camino::Utf8Path;
+use remote_storage::ListingMode;
 use remote_storage::RemotePath;
 use std::sync::Arc;
 use std::{collections::HashSet, num::NonZeroU32};
@@ -54,9 +55,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
        .context("common_prefix construction")?;
    let root_remote_prefixes = test_client
-        .list_prefixes(None, &cancel)
-        .await
-        .context("client list root prefixes failure")?
+        .list(None, ListingMode::WithDelimiter, None, &cancel)
+        .await?
+        .prefixes
        .into_iter()
        .collect::<HashSet<_>>();
    assert_eq!(
@@ -65,9 +66,14 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
    );

    let nested_remote_prefixes = test_client
-        .list_prefixes(Some(&base_prefix), &cancel)
-        .await
-        .context("client list nested prefixes failure")?
+        .list(
+            Some(&base_prefix.add_trailing_slash()),
+            ListingMode::WithDelimiter,
+            None,
+            &cancel,
+        )
+        .await?
+        .prefixes
        .into_iter()
        .collect::<HashSet<_>>();
    let remote_only_prefixes = nested_remote_prefixes
@@ -90,11 +96,13 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
 ///
 /// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
 /// Then performs the following queries:
-///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
-///    2. `list_files("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
+///    1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
+///    2. `list("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
 #[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
 #[tokio::test]
-async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
+async fn list_no_delimiter_works(
+    ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
+) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
        MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
@@ -107,29 +115,36 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
    let base_prefix =
        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
    let root_files = test_client
-        .list_files(None, None, &cancel)
+        .list(None, ListingMode::NoDelimiter, None, &cancel)
        .await
        .context("client list root files failure")?
+        .keys
        .into_iter()
        .collect::<HashSet<_>>();
    assert_eq!(
        root_files,
        ctx.remote_blobs.clone(),
-        "remote storage list_files on root mismatches with the uploads."
+        "remote storage list on root mismatches with the uploads."
    );

    // Test that max_keys limit works. In total there are about 21 files (see
    // upload_simple_remote_data call in test_real_s3.rs).
    let limited_root_files = test_client
-        .list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
+        .list(
+            None,
+            ListingMode::NoDelimiter,
+            Some(NonZeroU32::new(2).unwrap()),
+            &cancel,
+        )
        .await
        .context("client list root files failure")?;
-    assert_eq!(limited_root_files.len(), 2);
+    assert_eq!(limited_root_files.keys.len(), 2);

    let nested_remote_files = test_client
-        .list_files(Some(&base_prefix), None, &cancel)
+        .list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel)
        .await
        .context("client list nested files failure")?
+        .keys
        .into_iter()
        .collect::<HashSet<_>>();
    let trim_remote_blobs: HashSet<_> = ctx
@@ -141,7 +156,7 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
        .collect();
    assert_eq!(
        nested_remote_files, trim_remote_blobs,
-        "remote storage list_files on subdirrectory mismatches with the uploads."
+        "remote storage list on subdirrectory mismatches with the uploads."
    );
    Ok(())
 }
@@ -199,7 +214,11 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(

    ctx.client.delete_objects(&[path1, path2], &cancel).await?;

-    let prefixes = ctx.client.list_prefixes(None, &cancel).await?;
+    let prefixes = ctx
+        .client
+        .list(None, ListingMode::WithDelimiter, None, &cancel)
+        .await?
+        .prefixes;

    assert_eq!(prefixes.len(), 1);

--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -132,10 +132,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    }
 }

-// NOTE: the setups for the list_prefixes test and the list_files test are very similar
-// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
-// whereas the list_files function is concerned with listing files.
-// See `RemoteStorage::list_files` documentation for more details
 enum MaybeEnabledStorageWithSimpleTestBlobs {
    Enabled(AzureWithSimpleTestBlobs),
    Disabled,
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -12,8 +12,8 @@ use anyhow::Context;
 use camino::Utf8Path;
 use futures_util::StreamExt;
 use remote_storage::{
-    DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
-    S3Config,
+    DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig,
+    RemoteStorageKind, S3Config,
 };
 use test_context::test_context;
 use test_context::AsyncTestContext;
@@ -75,11 +75,14 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
        client: &Arc<GenericRemoteStorage>,
        cancel: &CancellationToken,
    ) -> anyhow::Result<HashSet<RemotePath>> {
-        Ok(retry(|| client.list_files(None, None, cancel))
-            .await
-            .context("list root files failure")?
-            .into_iter()
-            .collect::<HashSet<_>>())
+        Ok(
+            retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel))
+                .await
+                .context("list root files failure")?
+                .keys
+                .into_iter()
+                .collect::<HashSet<_>>(),
+        )
    }

    let cancel = CancellationToken::new();
@@ -294,10 +297,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    }
 }

-// NOTE: the setups for the list_prefixes test and the list_files test are very similar
-// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
-// whereas the list_files function is concerned with listing files.
-// See `RemoteStorage::list_files` documentation for more details
 enum MaybeEnabledStorageWithSimpleTestBlobs {
    Enabled(S3WithSimpleTestBlobs),
    Disabled,
@@ -381,6 +380,7 @@ fn create_s3_client(
            endpoint: None,
            concurrency_limit: NonZeroUsize::new(100).unwrap(),
            max_keys_per_list_response,
+            upload_storage_class: None,
        }),
        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
    };
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -70,6 +70,7 @@ tokio-stream.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = [ "serde" ] }
 tracing.workspace = true
+twox-hash.workspace = true
 url.workspace = true
 walkdir.workspace = true
 metrics.workspace = true
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -279,7 +279,7 @@ impl Client {
        lazy: bool,
    ) -> Result<()> {
        let req_body = TenantLocationConfigRequest {
-            tenant_id: Some(tenant_shard_id),
+            tenant_id: None,
            config,
        };

--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -9,18 +9,45 @@
 //! Coordinates in both axis are compressed for better readability.
 //! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
 //!
-//! Example use:
+//! The plain text API was chosen so that we can easily work with filenames from various
+//! sources; see the Usage section below for examples.
+//!
+//! # Usage
+//!
+//! ## Producing the SVG
+//!
 //! ```bash
-//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
-//! $   grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
-//! $ firefox out.svg
+//!
+//! # local timeline dir
+//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
+//!     grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
+//!
+//! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer`
+//! (jq -r '.historic_layers[] | .layer_file_name' | cargo  run -p pagectl draw-timeline) < layer-map.json > out.svg
+//!
+//! # From an `index_part.json` in S3
+//! (jq -r '.layer_metadata | keys[]' | cargo  run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg
+//!
 //! ```
 //!
-//! This API was chosen so that we can easily work with filenames extracted from ssh,
-//! or from pageserver log files.
+//! ## Viewing
 //!
-//! TODO Consider shipping this as a grafana panel plugin:
-//!      <https://grafana.com/tutorials/build-a-panel-plugin/>
+//! **Inkscape** is better than the built-in viewers in browsers.
+//!
+//! After selecting a layer file rectangle, use "Open XML Editor" (Ctrl|Cmd + Shift + X)
+//! to see the layer file name in the comment field.
+//!
+//! ```bash
+//!
+//! # Linux
+//! inkscape out.svg
+//!
+//! # macOS
+//! /Applications/Inkscape.app/Contents/MacOS/inkscape out.svg
+//!
+//! ```
+//!
+
 use anyhow::Result;
 use pageserver::repository::Key;
 use pageserver::METADATA_FILE_NAME;
@@ -65,7 +92,12 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {

 pub fn main() -> Result<()> {
    // Parse layer filenames from stdin
-    let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
+    struct Layer {
+        filename: String,
+        key_range: Range<Key>,
+        lsn_range: Range<Lsn>,
+    }
+    let mut files: Vec<Layer> = vec![];
    let stdin = io::stdin();
    for line in stdin.lock().lines() {
        let line = line.unwrap();
@@ -76,14 +108,23 @@ pub fn main() -> Result<()> {
            // Don't try and parse "metadata" like a key-lsn range
            continue;
        }
-        let range = parse_filename(filename);
-        ranges.push(range);
+        let (key_range, lsn_range) = parse_filename(filename);
+        files.push(Layer {
+            filename: filename.to_owned(),
+            key_range,
+            lsn_range,
+        });
    }

    // Collect all coordinates
    let mut keys: Vec<Key> = vec![];
    let mut lsns: Vec<Lsn> = vec![];
-    for (keyr, lsnr) in &ranges {
+    for Layer {
+        key_range: keyr,
+        lsn_range: lsnr,
+        ..
+    } in &files
+    {
        keys.push(keyr.start);
        keys.push(keyr.end);
        lsns.push(lsnr.start);
@@ -107,7 +148,12 @@ pub fn main() -> Result<()> {
            h: stretch * lsn_map.len() as f32
        }
    );
-    for (keyr, lsnr) in &ranges {
+    for Layer {
+        filename,
+        key_range: keyr,
+        lsn_range: lsnr,
+    } in &files
+    {
        let key_start = *key_map.get(&keyr.start).unwrap();
        let key_end = *key_map.get(&keyr.end).unwrap();
        let key_diff = key_end - key_start;
@@ -151,6 +197,7 @@ pub fn main() -> Result<()> {
            .fill(fill)
            .stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
            .border_radius(0.4)
+            .comment(filename)
        );
    }
    println!("{}", EndSvg);
--- a/pageserver/src/aux_file.rs
+++ b/pageserver/src/aux_file.rs
@@ -0,0 +1,112 @@
+use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
+use tracing::warn;
+
+/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash].
+fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
+    let mut key = [0; METADATA_KEY_SIZE];
+    let hash = twox_hash::xxh3::hash128(data).to_be_bytes();
+    key[0] = AUX_KEY_PREFIX;
+    key[1] = dir_level1;
+    key[2] = dir_level2;
+    key[3..16].copy_from_slice(&hash[0..13]);
+    Key::from_metadata_key_fixed_size(&key)
+}
+
+const AUX_DIR_PG_LOGICAL: u8 = 0x01;
+const AUX_DIR_PG_REPLSLOT: u8 = 0x02;
+const AUX_DIR_PG_UNKNOWN: u8 = 0xFF;
+
+/// Encode the aux file into a fixed-size key.
+///
+/// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type.
+/// We have one-to-one mapping for each of the aux file that we support. We hash the remaining part of the path
+/// (usually a single file name, or several components) into 13-byte hash. The way we determine the 2-byte prefix
+/// is roughly based on the first two components of the path, one unique number for one component.
+///
+/// * pg_logical/mappings -> 0x0101
+/// * pg_logical/snapshots -> 0x0102
+/// * pg_logical/replorigin_checkpoint -> 0x0103
+/// * pg_logical/others -> 0x01FF
+/// * pg_replslot/ -> 0x0201
+/// * others -> 0xFFFF
+///
+/// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`.
+/// The new file type must have never been written to the storage before. Otherwise, there could be data
+/// corruptions as the new file belongs to a new prefix but it might have been stored under the `others` prefix.
+pub fn encode_aux_file_key(path: &str) -> Key {
+    if let Some(fname) = path.strip_prefix("pg_logical/mappings/") {
+        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes())
+    } else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") {
+        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes())
+    } else if path == "pg_logical/replorigin_checkpoint" {
+        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"")
+    } else if let Some(fname) = path.strip_prefix("pg_logical/") {
+        if cfg!(debug_assertions) {
+            warn!(
+                "unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning",
+                path
+            );
+        }
+        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes())
+    } else if let Some(fname) = path.strip_prefix("pg_replslot/") {
+        aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes())
+    } else {
+        if cfg!(debug_assertions) {
+            warn!(
+                "unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning",
+                path
+            );
+        }
+        aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_hash_portable() {
+        // AUX file encoding requires the hash to be portable across all platforms. This test case checks
+        // if the algorithm produces the same hash across different environments.
+        assert_eq!(
+            305317690835051308206966631765527126151,
+            twox_hash::xxh3::hash128("test1".as_bytes())
+        );
+        assert_eq!(
+            85104974691013376326742244813280798847,
+            twox_hash::xxh3::hash128("test/test2".as_bytes())
+        );
+        assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes()));
+    }
+
+    #[test]
+    fn test_encoding_portable() {
+        // To correct retrieve AUX files, the generated keys for the same file must be the same for all versions
+        // of the page server.
+        assert_eq!(
+            "8200000101E5B20C5F8DD5AA3289D6D9EAFA",
+            encode_aux_file_key("pg_logical/mappings/test1").to_string()
+        );
+        assert_eq!(
+            "820000010239AAC544893139B26F501B97E6",
+            encode_aux_file_key("pg_logical/snapshots/test2").to_string()
+        );
+        assert_eq!(
+            "820000010300000000000000000000000000",
+            encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string()
+        );
+        assert_eq!(
+            "82000001FF8635AF2134B7266EC5B4189FD6",
+            encode_aux_file_key("pg_logical/unsupported").to_string()
+        );
+        assert_eq!(
+            "8200000201772D0E5D71DE14DA86142A1619",
+            encode_aux_file_key("pg_replslot/test3").to_string()
+        );
+        assert_eq!(
+            "820000FFFF1866EBEB53B807B26A2416F317",
+            encode_aux_file_key("other_file_not_supported").to_string()
+        );
+    }
+}
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -13,7 +13,7 @@
 use anyhow::{anyhow, bail, ensure, Context};
 use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
-use pageserver_api::key::{key_to_slru_block, Key};
+use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key};
 use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::SystemTime;
@@ -297,7 +297,20 @@ where
                if rel.forknum == INIT_FORKNUM {
                    // I doubt we need _init fork itself, but having it at least
                    // serves as a marker relation is unlogged.
-                    self.add_rel(rel, rel).await?;
+                    if let Err(_e) = self.add_rel(rel, rel).await {
+                        if self
+                            .timeline
+                            .get_shard_identity()
+                            .is_key_buggy_forknum(&rel_block_to_key(rel, 0x0))
+                        {
+                            // Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation
+                            // whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup.  This allows
+                            // postgres to start up.  The relation won't work, but it will be possible to DROP TABLE on it and
+                            // recreate.
+                            tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation");
+                            continue;
+                        }
+                    };
                    self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
                    continue;
                }
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -1557,6 +1557,7 @@ broker_endpoint = '{broker_endpoint}'
                        endpoint: Some(endpoint.clone()),
                        concurrency_limit: s3_concurrency_limit,
                        max_keys_per_list_response: None,
+                        upload_storage_class: None,
                    }),
                    timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
                },
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -160,6 +160,9 @@ impl From<PageReconstructError> for ApiError {
    fn from(pre: PageReconstructError) -> ApiError {
        match pre {
            PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
+            PageReconstructError::MissingKey(e) => {
+                ApiError::InternalServerError(anyhow::anyhow!("{e}"))
+            }
            PageReconstructError::Cancelled => {
                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
            }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -12,6 +12,7 @@ pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub use pageserver_api::keyspace;
+pub mod aux_file;
 pub mod metrics;
 pub mod page_cache;
 pub mod page_service;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -86,41 +86,58 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
-        "pageserver_read_num_fs_layers",
-        "Number of persistent layers accessed for processing a read request, including those in the cache",
-        vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
+        "pageserver_layers_visited_per_read_global",
+        "Number of layers visited to reconstruct one key",
+        vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_layers_visited_per_vectored_read_global",
+        "Average number of layers visited to reconstruct one key",
+        vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
    )
    .expect("failed to define a metric")
 });

 // Metrics collected on operations on the storage repository.
+#[derive(
+    Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr,
+)]
+pub(crate) enum GetKind {
+    Singular,
+    Vectored,
+}

 pub(crate) struct ReconstructTimeMetrics {
-    ok: Histogram,
-    err: Histogram,
+    singular: Histogram,
+    vectored: Histogram,
 }

 pub(crate) static RECONSTRUCT_TIME: Lazy<ReconstructTimeMetrics> = Lazy::new(|| {
    let inner = register_histogram_vec!(
        "pageserver_getpage_reconstruct_seconds",
        "Time spent in reconstruct_value (reconstruct a page from deltas)",
-        &["result"],
+        &["get_kind"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric");
+
    ReconstructTimeMetrics {
-        ok: inner.get_metric_with_label_values(&["ok"]).unwrap(),
-        err: inner.get_metric_with_label_values(&["err"]).unwrap(),
+        singular: inner.with_label_values(&[GetKind::Singular.into()]),
+        vectored: inner.with_label_values(&[GetKind::Vectored.into()]),
    }
 });

 impl ReconstructTimeMetrics {
-    pub(crate) fn for_result<T, E>(&self, result: &Result<T, E>) -> &Histogram {
-        match result {
-            Ok(_) => &self.ok,
-            Err(_) => &self.err,
+    pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram {
+        match get_kind {
+            GetKind::Singular => &self.singular,
+            GetKind::Vectored => &self.vectored,
        }
    }
 }
@@ -133,13 +150,33 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::n
    .expect("failed to define a metric")
 });

-pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
+pub(crate) struct ReconstructDataTimeMetrics {
+    singular: Histogram,
+    vectored: Histogram,
+}
+
+impl ReconstructDataTimeMetrics {
+    pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram {
+        match get_kind {
+            GetKind::Singular => &self.singular,
+            GetKind::Vectored => &self.vectored,
+        }
+    }
+}
+
+pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<ReconstructDataTimeMetrics> = Lazy::new(|| {
+    let inner = register_histogram_vec!(
        "pageserver_getpage_get_reconstruct_data_seconds",
        "Time spent in get_reconstruct_value_data",
+        &["get_kind"],
        CRITICAL_OP_BUCKETS.into(),
    )
-    .expect("failed to define a metric")
+    .expect("failed to define a metric");
+
+    ReconstructDataTimeMetrics {
+        singular: inner.with_label_values(&[GetKind::Singular.into()]),
+        vectored: inner.with_label_values(&[GetKind::Vectored.into()]),
+    }
 });

 pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
@@ -2771,7 +2808,8 @@ pub fn preinitialize_metrics() {

    // histograms
    [
-        &READ_NUM_FS_LAYERS,
+        &READ_NUM_LAYERS_VISITED,
+        &VEC_READ_NUM_LAYERS_VISITED,
        &WAIT_LSN_TIME,
        &WAL_REDO_TIME,
        &WAL_REDO_RECORDS_HISTOGRAM,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -874,6 +874,11 @@ impl PageServerHandler {
            // walsender completes the authentication and starts streaming the
            // WAL.
            if lsn <= last_record_lsn {
+                // It might be better to use max(lsn, latest_gc_cutoff_lsn) instead
+                // last_record_lsn. That would give the same result, since we know
+                // that there haven't been modifications since 'lsn'. Using an older
+                // LSN might be faster, because that could allow skipping recent
+                // layers when finding the page.
                lsn = last_record_lsn;
            } else {
                timeline
@@ -1201,6 +1206,10 @@ impl PageServerHandler {
        ))
    }

+    /// Note on "fullbackup":
+    /// Full basebackups should only be used for debugging purposes.
+    /// Originally, it was introduced to enable breaking storage format changes,
+    /// but that is not applicable anymore.
    #[allow(clippy::too_many_arguments)]
    #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))]
    async fn handle_basebackup_request<IO>(
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -252,16 +252,8 @@ impl Timeline {
        let mut buf = version.get(self, key, ctx).await?;
        let nblocks = buf.get_u32_le();

-        if latest {
-            // Update relation size cache only if "latest" flag is set.
-            // This flag is set by compute when it is working with most recent version of relation.
-            // Typically master compute node always set latest=true.
-            // Please notice, that even if compute node "by mistake" specifies old LSN but set
-            // latest=true, then it can not cause cache corruption, because with latest=true
-            // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be
-            // associated with most recent value of LSN.
-            self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
-        }
+        self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
+
        Ok(nblocks)
    }

@@ -456,6 +448,11 @@ impl Timeline {
        // include physical changes from later commits that will be marked
        // as aborted, and will need to be vacuumed away.
        let commit_lsn = Lsn((low - 1) * 8);
+        // This maxing operation is for the edge case that the search above did
+        // set found_smaller to true but it never increased the lsn. Then, low
+        // is still the old min_lsn the subtraction above could possibly give a value
+        // below the anchestor_lsn.
+        let commit_lsn = commit_lsn.max(min_lsn);
        match (found_smaller, found_larger) {
            (false, false) => {
                // This can happen if no commit records have been processed yet, e.g.
@@ -817,7 +814,7 @@ impl Timeline {
    /// Get cached size of relation if it not updated after specified LSN
    pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
        let rel_size_cache = self.rel_size_cache.read().unwrap();
-        if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
+        if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
            if lsn >= *cached_lsn {
                return Some(*nblocks);
            }
@@ -828,7 +825,16 @@ impl Timeline {
    /// Update cached relation size if there is no more recent update
    pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        match rel_size_cache.entry(tag) {
+
+        if lsn < rel_size_cache.complete_as_of {
+            // Do not cache old values. It's safe to cache the size on read, as long as
+            // the read was at an LSN since we started the WAL ingestion. Reasoning: we
+            // never evict values from the cache, so if the relation size changed after
+            // 'lsn', the new value is already in the cache.
+            return;
+        }
+
+        match rel_size_cache.map.entry(tag) {
            hash_map::Entry::Occupied(mut entry) => {
                let cached_lsn = entry.get_mut();
                if lsn >= cached_lsn.0 {
@@ -844,13 +850,13 @@ impl Timeline {
    /// Store cached relation size
    pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        rel_size_cache.insert(tag, (lsn, nblocks));
+        rel_size_cache.map.insert(tag, (lsn, nblocks));
    }

    /// Remove cached relation size
    pub fn remove_cached_rel_size(&self, tag: &RelTag) {
        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        rel_size_cache.remove(tag);
+        rel_size_cache.map.remove(tag);
    }
 }

@@ -1401,7 +1407,7 @@ impl<'a> DatadirModification<'a> {
        let n_files;
        let mut aux_files = self.tline.aux_files.lock().await;
        if let Some(mut dir) = aux_files.dir.take() {
-            // We already updated aux files in `self`: emit a delta and update our latest value
+            // We already updated aux files in `self`: emit a delta and update our latest value.
            dir.upsert(file_path.clone(), content.clone());
            n_files = dir.files.len();
            if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
@@ -1446,10 +1452,14 @@ impl<'a> DatadirModification<'a> {
                    // reset the map.
                    return Err(e.into());
                }
-                // FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so
-                // we are assuming that all _other_ possible errors represents a missing key.  If some
-                // other error occurs, we may incorrectly reset the map of aux files.
-                Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => {
+                // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but
+                // the original code assumes all other errors are missing keys. Therefore, we keep the code path
+                // the same for now, though in theory, we should only match the `MissingKey` variant.
+                Err(
+                    PageReconstructError::Other(_)
+                    | PageReconstructError::WalRedo(_)
+                    | PageReconstructError::MissingKey { .. },
+                ) => {
                    // Key is missing, we must insert an image as the basis for subsequent deltas.

                    let mut dir = AuxFilesDirectory {
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -559,9 +559,10 @@ impl Tenant {
            // By doing what we do here, the index part upload is retried.
            // If control plane retries timeline creation in the meantime, the mgmt API handler
            // for timeline creation will coalesce on the upload we queue here.
+            // FIXME: this branch should be dead code as we no longer write local metadata.
            let rtc = timeline.remote_client.as_ref().unwrap();
            rtc.init_upload_queue_for_empty_remote(&metadata)?;
-            rtc.schedule_index_upload_for_metadata_update(&metadata)?;
+            rtc.schedule_index_upload_for_full_metadata_update(&metadata)?;
        }

        timeline
@@ -2869,20 +2870,23 @@ impl Tenant {
                }
            }

-            if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) {
-                let branchpoints: Vec<Lsn> = all_branchpoints
-                    .range((
-                        Included((timeline_id, Lsn(0))),
-                        Included((timeline_id, Lsn(u64::MAX))),
-                    ))
-                    .map(|&x| x.1)
-                    .collect();
-                timeline
-                    .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
-                    .await?;
+            let cutoff = timeline
+                .get_last_record_lsn()
+                .checked_sub(horizon)
+                .unwrap_or(Lsn(0));

-                gc_timelines.push(timeline);
-            }
+            let branchpoints: Vec<Lsn> = all_branchpoints
+                .range((
+                    Included((timeline_id, Lsn(0))),
+                    Included((timeline_id, Lsn(u64::MAX))),
+                ))
+                .map(|&x| x.1)
+                .collect();
+            timeline
+                .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
+                .await?;
+
+            gc_timelines.push(timeline);
        }
        drop(gc_cs);
        Ok(gc_timelines)
@@ -3027,7 +3031,7 @@ impl Tenant {
        // See also https://github.com/neondatabase/neon/issues/3865
        if let Some(remote_client) = new_timeline.remote_client.as_ref() {
            remote_client
-                .schedule_index_upload_for_metadata_update(&metadata)
+                .schedule_index_upload_for_full_metadata_update(&metadata)
                .context("branch initial metadata upload")?;
        }

@@ -3858,6 +3862,7 @@ mod tests {
    use crate::DEFAULT_PG_VERSION;
    use bytes::BytesMut;
    use hex_literal::hex;
+    use pageserver_api::key::NON_INHERITED_RANGE;
    use pageserver_api::keyspace::KeySpace;
    use rand::{thread_rng, Rng};
    use tests::timeline::{GetVectoredError, ShutdownMode};
@@ -4657,6 +4662,62 @@ mod tests {
        Ok(())
    }

+    #[tokio::test]
+    async fn test_get_vectored_aux_files() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_get_vectored_aux_files")?;
+
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+        let tline = tline.raw_timeline().unwrap();
+
+        let mut modification = tline.begin_modification(Lsn(0x1000));
+        modification.put_file("foo/bar1", b"content1", &ctx).await?;
+        modification.set_lsn(Lsn(0x1008))?;
+        modification.put_file("foo/bar2", b"content2", &ctx).await?;
+        modification.commit(&ctx).await?;
+
+        let child_timeline_id = TimelineId::generate();
+        tenant
+            .branch_timeline_test(
+                tline,
+                child_timeline_id,
+                Some(tline.get_last_record_lsn()),
+                &ctx,
+            )
+            .await?;
+
+        let child_timeline = tenant
+            .get_timeline(child_timeline_id, true)
+            .expect("Should have the branched timeline");
+
+        let aux_keyspace = KeySpace {
+            ranges: vec![NON_INHERITED_RANGE],
+        };
+        let read_lsn = child_timeline.get_last_record_lsn();
+
+        let vectored_res = child_timeline
+            .get_vectored_impl(aux_keyspace.clone(), read_lsn, &ctx)
+            .await;
+
+        child_timeline
+            .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
+            .await;
+
+        let images = vectored_res?;
+        let mut key = NON_INHERITED_RANGE.start;
+        while key < NON_INHERITED_RANGE.end {
+            assert!(matches!(
+                images[&key],
+                Err(PageReconstructError::MissingKey(_))
+            ));
+            key = key.next();
+        }
+
+        Ok(())
+    }
+
    // Test that vectored get handles layer gaps correctly
    // by advancing into the next ancestor timeline if required.
    //
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -235,6 +235,12 @@ impl TimelineMetadata {
        let bytes = instance.to_bytes().unwrap();
        Self::from_bytes(&bytes).unwrap()
    }
+
+    pub(crate) fn apply(&mut self, update: &MetadataUpdate) {
+        self.body.disk_consistent_lsn = update.disk_consistent_lsn;
+        self.body.prev_record_lsn = update.prev_record_lsn;
+        self.body.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn;
+    }
 }

 impl<'de> Deserialize<'de> for TimelineMetadata {
@@ -259,6 +265,27 @@ impl Serialize for TimelineMetadata {
    }
 }

+/// Parts of the metadata which are regularly modified.
+pub(crate) struct MetadataUpdate {
+    disk_consistent_lsn: Lsn,
+    prev_record_lsn: Option<Lsn>,
+    latest_gc_cutoff_lsn: Lsn,
+}
+
+impl MetadataUpdate {
+    pub(crate) fn new(
+        disk_consistent_lsn: Lsn,
+        prev_record_lsn: Option<Lsn>,
+        latest_gc_cutoff_lsn: Lsn,
+    ) -> Self {
+        Self {
+            disk_consistent_lsn,
+            prev_record_lsn,
+            latest_gc_cutoff_lsn,
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -202,7 +202,9 @@ use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
 use std::time::Duration;

-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
+use remote_storage::{
+    DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel,
+};
 use std::ops::DerefMut;
 use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
@@ -236,6 +238,7 @@ use utils::id::{TenantId, TimelineId};

 use self::index::IndexPart;

+use super::metadata::MetadataUpdate;
 use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
 use super::upload_queue::SetDeletedFlagProgress;
 use super::Generation;
@@ -536,9 +539,10 @@ impl RemoteTimelineClient {
    // Upload operations.
    //

-    ///
    /// Launch an index-file upload operation in the background, with
-    /// updated metadata.
+    /// fully updated metadata.
+    ///
+    /// This should only be used to upload initial metadata to remote storage.
    ///
    /// The upload will be added to the queue immediately, but it
    /// won't be performed until all previously scheduled layer file
@@ -550,7 +554,7 @@ impl RemoteTimelineClient {
    /// If there were any changes to the list of files, i.e. if any
    /// layer file uploads were scheduled, since the last index file
    /// upload, those will be included too.
-    pub fn schedule_index_upload_for_metadata_update(
+    pub fn schedule_index_upload_for_full_metadata_update(
        self: &Arc<Self>,
        metadata: &TimelineMetadata,
    ) -> anyhow::Result<()> {
@@ -566,6 +570,27 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// Launch an index-file upload operation in the background, with only parts of the metadata
+    /// updated.
+    ///
+    /// This is the regular way of updating metadata on layer flushes or Gc.
+    ///
+    /// Using this lighter update mechanism allows for reparenting and detaching without changes to
+    /// `index_part.json`, while being more clear on what values update regularly.
+    pub(crate) fn schedule_index_upload_for_metadata_update(
+        self: &Arc<Self>,
+        update: &MetadataUpdate,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+
+        upload_queue.latest_metadata.apply(update);
+
+        self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
+
+        Ok(())
+    }
+
    ///
    /// Launch an index-file upload operation in the background, if necessary.
    ///
@@ -1122,7 +1147,7 @@ impl RemoteTimelineClient {
        // and retry will arrive to different pageserver there wont be any traces of it on remote storage
        let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);

-        // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
+        // Execute all pending deletions, so that when we proceed to do a listing below, we aren't
        // taking the burden of listing all the layers that we already know we should delete.
        self.flush_deletion_queue().await?;

@@ -1131,14 +1156,20 @@ impl RemoteTimelineClient {
        let remaining = download_retry(
            || async {
                self.storage_impl
-                    .list_files(Some(&timeline_storage_path), None, &cancel)
+                    .list(
+                        Some(&timeline_storage_path),
+                        ListingMode::NoDelimiter,
+                        None,
+                        &cancel,
+                    )
                    .await
            },
            "list remaining files",
            &cancel,
        )
        .await
-        .context("list files remaining files")?;
+        .context("list files remaining files")?
+        .keys;

        // We will delete the current index_part object last, since it acts as a deletion
        // marker via its deleted_at attribute
@@ -2024,7 +2055,7 @@ mod tests {
        // Schedule upload of index. Check that it is queued
        let metadata = dummy_metadata(Lsn(0x20));
        client
-            .schedule_index_upload_for_metadata_update(&metadata)
+            .schedule_index_upload_for_full_metadata_update(&metadata)
            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -258,7 +258,7 @@ pub async fn list_remote_timelines(
    tenant_shard_id: TenantShardId,
    cancel: CancellationToken,
 ) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
-    let remote_path = remote_timelines_path(&tenant_shard_id);
+    let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash();

    fail::fail_point!("storage-sync-list-remote-timelines", |_| {
        anyhow::bail!("storage-sync-list-remote-timelines");
@@ -417,11 +417,16 @@ pub(super) async fn download_index_part(
    let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());

    let indices = download_retry(
-        || async { storage.list_files(Some(&index_prefix), None, cancel).await },
+        || async {
+            storage
+                .list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel)
+                .await
+        },
        "list index_part files",
        cancel,
    )
-    .await?;
+    .await?
+    .keys;

    // General case logic for which index to use: the latest index whose generation
    // is <= our own.  See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -118,6 +118,7 @@ pub(crate) struct ValuesReconstructState {
    pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,

    keys_done: KeySpaceRandomAccum,
+    layers_visited: u32,
 }

 impl ValuesReconstructState {
@@ -125,6 +126,7 @@ impl ValuesReconstructState {
        Self {
            keys: HashMap::new(),
            keys_done: KeySpaceRandomAccum::new(),
+            layers_visited: 0,
        }
    }

@@ -138,6 +140,14 @@ impl ValuesReconstructState {
        }
    }

+    pub(crate) fn on_layer_visited(&mut self) {
+        self.layers_visited += 1;
+    }
+
+    pub(crate) fn get_layers_visited(&self) -> u32 {
+        self.layers_visited
+    }
+
    /// Update the state collected for a given key.
    /// Returns true if this was the last value needed for the key and false otherwise.
    ///
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -728,6 +728,9 @@ impl DeltaLayerInner {
            // production code path
            expected_summary.index_start_blk = actual_summary.index_start_blk;
            expected_summary.index_root_blk = actual_summary.index_root_blk;
+            // mask out the timeline_id, but still require the layers to be from the same tenant
+            expected_summary.timeline_id = actual_summary.timeline_id;
+
            if actual_summary != expected_summary {
                bail!(
                    "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -396,6 +396,8 @@ impl ImageLayerInner {
            // production code path
            expected_summary.index_start_blk = actual_summary.index_start_blk;
            expected_summary.index_root_blk = actual_summary.index_root_blk;
+            // mask out the timeline_id, but still require the layers to be from the same tenant
+            expected_summary.timeline_id = actual_summary.timeline_id;

            if actual_summary != expected_summary {
                bail!(
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -26,7 +26,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
 // while being able to use std::fmt::Write's methods
 use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
 use std::cmp::Ordering;
-use std::fmt::Write as _;
+use std::fmt::Write;
 use std::ops::Range;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::atomic::{AtomicU64, AtomicUsize};
@@ -54,6 +54,12 @@ pub struct InMemoryLayer {
    /// Writes are only allowed when this is `None`.
    end_lsn: OnceLock<Lsn>,

+    /// Used for traversal path. Cached representation of the in-memory layer before frozen.
+    local_path_str: Arc<str>,
+
+    /// Used for traversal path. Cached representation of the in-memory layer after frozen.
+    frozen_local_path_str: OnceLock<Arc<str>>,
+
    opened_at: Instant,

    /// The above fields never change, except for `end_lsn`, which is only set once.
@@ -241,6 +247,12 @@ impl InMemoryLayer {
        self.start_lsn..self.end_lsn_or_max()
    }

+    pub(crate) fn local_path_str(&self) -> &Arc<str> {
+        self.frozen_local_path_str
+            .get()
+            .unwrap_or(&self.local_path_str)
+    }
+
    /// debugging function to print out the contents of the layer
    ///
    /// this is likely completly unused
@@ -430,10 +442,24 @@ impl InMemoryLayer {
    }
 }

+fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
+    write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
+}
+
+fn inmem_layer_log_display(
+    mut f: impl Write,
+    timeline: TimelineId,
+    start_lsn: Lsn,
+    end_lsn: Lsn,
+) -> std::fmt::Result {
+    write!(f, "timeline {} in-memory ", timeline)?;
+    inmem_layer_display(f, start_lsn, end_lsn)
+}
+
 impl std::fmt::Display for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let end_lsn = self.end_lsn_or_max();
-        write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
+        inmem_layer_display(f, self.start_lsn, end_lsn)
    }
 }

@@ -458,6 +484,12 @@ impl InMemoryLayer {

        Ok(InMemoryLayer {
            file_id: key,
+            local_path_str: {
+                let mut buf = String::new();
+                inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap();
+                buf.into()
+            },
+            frozen_local_path_str: OnceLock::new(),
            conf,
            timeline_id,
            tenant_shard_id,
@@ -552,6 +584,15 @@ impl InMemoryLayer {
        );
        self.end_lsn.set(end_lsn).expect("end_lsn set only once");

+        self.frozen_local_path_str
+            .set({
+                let mut buf = String::new();
+                inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn)
+                    .unwrap();
+                buf.into()
+            })
+            .expect("frozen_local_path_str set only once");
+
        for vec_map in inner.index.values() {
            for (lsn, _pos) in vec_map.as_slice() {
                assert!(*lsn < end_lsn);
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -395,6 +395,10 @@ impl Layer {
        &self.0.path
    }

+    pub(crate) fn local_path_str(&self) -> &Arc<str> {
+        &self.0.path_str
+    }
+
    pub(crate) fn metadata(&self) -> LayerFileMetadata {
        self.0.metadata()
    }
@@ -517,6 +521,9 @@ struct LayerInner {
    /// Full path to the file; unclear if this should exist anymore.
    path: Utf8PathBuf,

+    /// String representation of the full path, used for traversal id.
+    path_str: Arc<str>,
+
    desc: PersistentLayerDesc,

    /// Timeline access is needed for remote timeline client and metrics.
@@ -722,6 +729,7 @@ impl LayerInner {

        LayerInner {
            conf,
+            path_str: path.to_string().into(),
            path,
            desc,
            timeline: Arc::downgrade(timeline),
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -818,11 +818,13 @@ async fn eviction_cancellation_on_drop() {
    }
 }

+/// A test case to remind you the cost of these structures. You can bump the size limit
+/// below if it is really necessary to add more fields to the structures.
 #[test]
 fn layer_size() {
    assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
    assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
-    assert_eq!(std::mem::size_of::<LayerInner>(), 2328);
+    assert_eq!(std::mem::size_of::<LayerInner>(), 2344);
    // it also has the utf8 path
 }

--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -62,7 +62,7 @@ impl BackgroundLoopKind {
 pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
    loop_kind: BackgroundLoopKind,
    _ctx: &RequestContext,
-) -> impl Drop {
+) -> tokio::sync::SemaphorePermit<'static> {
    let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE
        .with_label_values(&[loop_kind.as_static_str()])
        .guard();
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -16,14 +16,14 @@ use enumset::EnumSet;
 use fail::fail_point;
 use once_cell::sync::Lazy;
 use pageserver_api::{
-    key::AUX_FILES_KEY,
+    key::{AUX_FILES_KEY, NON_INHERITED_RANGE},
    keyspace::KeySpaceAccum,
    models::{
        CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
        EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
    },
    reltag::BlockNumber,
-    shard::{ShardIdentity, TenantShardId},
+    shard::{ShardIdentity, ShardNumber, TenantShardId},
 };
 use rand::Rng;
 use serde_with::serde_as;
@@ -86,7 +86,7 @@ use crate::{
 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::metrics::{
-    TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
+    GetKind, TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
 };
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
 use crate::tenant::config::TenantConfOpt;
@@ -182,6 +182,16 @@ pub(crate) struct AuxFilesState {
    pub(crate) n_deltas: usize,
 }

+/// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL
+/// ingestion considerably, because WAL ingestion needs to check on most records if the record
+/// implicitly extends the relation.  At startup, `complete_as_of` is initialized to the current end
+/// of the timeline (disk_consistent_lsn).  It's used on reads of relation sizes to check if the
+/// value can be used to also update the cache, see [`Timeline::update_cached_rel_size`].
+pub(crate) struct RelSizeCache {
+    pub(crate) complete_as_of: Lsn,
+    pub(crate) map: HashMap<RelTag, (Lsn, BlockNumber)>,
+}
+
 pub struct Timeline {
    conf: &'static PageServerConf,
    tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
@@ -324,7 +334,7 @@ pub struct Timeline {
    pub walreceiver: Mutex<Option<WalReceiver>>,

    /// Relation size cache
-    pub rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
+    pub(crate) rel_size_cache: RwLock<RelSizeCache>,

    download_all_remote_layers_task_info: RwLock<Option<DownloadRemoteLayersTaskInfo>>,

@@ -428,6 +438,62 @@ pub(crate) enum PageReconstructError {
    /// An error happened replaying WAL records
    #[error(transparent)]
    WalRedo(anyhow::Error),
+
+    #[error("{0}")]
+    MissingKey(MissingKeyError),
+}
+
+#[derive(Debug)]
+pub struct MissingKeyError {
+    stuck_at_lsn: bool,
+    key: Key,
+    shard: ShardNumber,
+    cont_lsn: Lsn,
+    request_lsn: Lsn,
+    ancestor_lsn: Option<Lsn>,
+    traversal_path: Vec<TraversalPathItem>,
+    backtrace: Option<std::backtrace::Backtrace>,
+}
+
+impl std::fmt::Display for MissingKeyError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if self.stuck_at_lsn {
+            // Records are found in this timeline but no image layer or initial delta record was found.
+            write!(
+                f,
+                "could not find layer with more data for key {} (shard {:?}) at LSN {}, request LSN {}",
+                self.key, self.shard, self.cont_lsn, self.request_lsn
+            )?;
+            if let Some(ref ancestor_lsn) = self.ancestor_lsn {
+                write!(f, ", ancestor {}", ancestor_lsn)?;
+            }
+        } else {
+            // No records in this timeline.
+            write!(
+                f,
+                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
+                self.key, self.shard, self.cont_lsn, self.request_lsn
+            )?;
+        }
+
+        if !self.traversal_path.is_empty() {
+            writeln!(f)?;
+        }
+
+        for (r, c, l) in &self.traversal_path {
+            writeln!(
+                f,
+                "layer traversal: result {:?}, cont_lsn {}, layer: {}",
+                r, c, l,
+            )?;
+        }
+
+        if let Some(ref backtrace) = self.backtrace {
+            write!(f, "\n{}", backtrace)?;
+        }
+
+        Ok(())
+    }
 }

 impl PageReconstructError {
@@ -439,6 +505,7 @@ impl PageReconstructError {
            AncestorLsnTimeout(_) => false,
            Cancelled | AncestorStopping(_) => true,
            WalRedo(_) => false,
+            MissingKey { .. } => false,
        }
    }
 }
@@ -730,7 +797,9 @@ impl Timeline {
            img: cached_page_img,
        };

-        let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer();
+        let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
+            .for_get_kind(GetKind::Singular)
+            .start_timer();
        let path = self
            .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
            .await?;
@@ -740,7 +809,7 @@ impl Timeline {
        let res = self.reconstruct_value(key, lsn, reconstruct_state).await;
        let elapsed = start.elapsed();
        crate::metrics::RECONSTRUCT_TIME
-            .for_result(&res)
+            .for_get_kind(GetKind::Singular)
            .observe(elapsed.as_secs_f64());

        if cfg!(feature = "testing") && res.is_err() {
@@ -753,7 +822,7 @@ impl Timeline {
                writeln!(
                    msg,
                    "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}",
-                    layer(),
+                    layer,
                )
                .expect("string grows")
            });
@@ -872,8 +941,16 @@ impl Timeline {
                    Err(Cancelled | AncestorStopping(_)) => {
                        return Err(GetVectoredError::Cancelled)
                    }
-                    Err(Other(err)) if err.to_string().contains("could not find data for key") => {
-                        return Err(GetVectoredError::MissingKey(key))
+                    // we only capture stuck_at_lsn=false now until we figure out https://github.com/neondatabase/neon/issues/7380
+                    Err(MissingKey(MissingKeyError {
+                        stuck_at_lsn: false,
+                        ..
+                    })) if !NON_INHERITED_RANGE.contains(&key) => {
+                        // The vectored read path handles non inherited keys specially.
+                        // If such a a key cannot be reconstructed from the current timeline,
+                        // the vectored read path returns a key level error as opposed to a top
+                        // level error.
+                        return Err(GetVectoredError::MissingKey(key));
                    }
                    _ => {
                        values.insert(key, block);
@@ -894,10 +971,24 @@ impl Timeline {
    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
        let mut reconstruct_state = ValuesReconstructState::new();

+        let get_kind = if keyspace.total_size() == 1 {
+            GetKind::Singular
+        } else {
+            GetKind::Vectored
+        };
+
+        let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
+            .for_get_kind(get_kind)
+            .start_timer();
        self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx)
            .await?;
+        get_data_timer.stop_and_record();

+        let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME
+            .for_get_kind(get_kind)
+            .start_timer();
        let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
+        let layers_visited = reconstruct_state.get_layers_visited();
        for (key, res) in reconstruct_state.keys {
            match res {
                Err(err) => {
@@ -911,6 +1002,13 @@ impl Timeline {
                }
            }
        }
+        reconstruct_timer.stop_and_record();
+
+        // Note that this is an approximation. Tracking the exact number of layers visited
+        // per key requires virtually unbounded memory usage and is inefficient
+        // (i.e. segment tree tracking each range queried from a layer)
+        crate::metrics::VEC_READ_NUM_LAYERS_VISITED
+            .observe(layers_visited as f64 / results.len() as f64);

        Ok(results)
    }
@@ -1892,7 +1990,10 @@ impl Timeline {
                last_image_layer_creation_check_at: AtomicLsn::new(0),

                last_received_wal: Mutex::new(None),
-                rel_size_cache: RwLock::new(HashMap::new()),
+                rel_size_cache: RwLock::new(RelSizeCache {
+                    complete_as_of: disk_consistent_lsn,
+                    map: HashMap::new(),
+                }),

                download_all_remote_layers_task_info: RwLock::new(None),

@@ -2692,7 +2793,7 @@ impl Timeline {
    }
 }

-type TraversalId = String;
+type TraversalId = Arc<str>;

 trait TraversalLayerExt {
    fn traversal_id(&self) -> TraversalId;
@@ -2700,13 +2801,13 @@ trait TraversalLayerExt {

 impl TraversalLayerExt for Layer {
    fn traversal_id(&self) -> TraversalId {
-        self.local_path().to_string()
+        Arc::clone(self.local_path_str())
    }
 }

 impl TraversalLayerExt for Arc<InMemoryLayer> {
    fn traversal_id(&self) -> TraversalId {
-        format!("timeline {} in-memory {self}", self.get_timeline_id())
+        Arc::clone(self.local_path_str())
    }
 }

@@ -2735,7 +2836,7 @@ impl Timeline {
        let mut timeline = self;

        let mut read_count = scopeguard::guard(0, |cnt| {
-            crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64)
+            crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64)
        });

        // For debugging purposes, collect the path of layers that we traversed
@@ -2775,32 +2876,35 @@ impl Timeline {
                        if prev <= cont_lsn {
                            // Didn't make any progress in last iteration. Error out to avoid
                            // getting stuck in the loop.
-                            return Err(layer_traversal_error(format!(
-                                "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}",
+                            return Err(PageReconstructError::MissingKey(MissingKeyError {
+                                stuck_at_lsn: true,
                                key,
-                                Lsn(cont_lsn.0 - 1),
+                                shard: self.shard_identity.get_shard_number(&key),
+                                cont_lsn: Lsn(cont_lsn.0 - 1),
                                request_lsn,
-                                timeline.ancestor_lsn
-                            ), traversal_path));
+                                ancestor_lsn: Some(timeline.ancestor_lsn),
+                                traversal_path,
+                                backtrace: None,
+                            }));
                        }
                    }
                    prev_lsn = Some(cont_lsn);
                }
                ValueReconstructResult::Missing => {
-                    return Err(layer_traversal_error(
-                        if cfg!(test) {
-                            format!(
-                                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}",
-                                key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
-                            )
-                        } else {
-                            format!(
-                                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
-                                key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn
-                            )
-                        },
+                    return Err(PageReconstructError::MissingKey(MissingKeyError {
+                        stuck_at_lsn: false,
+                        key,
+                        shard: self.shard_identity.get_shard_number(&key),
+                        cont_lsn,
+                        request_lsn,
+                        ancestor_lsn: None,
                        traversal_path,
-                    ));
+                        backtrace: if cfg!(test) {
+                            Some(std::backtrace::Backtrace::force_capture())
+                        } else {
+                            None
+                        },
+                    }));
                }
            }

@@ -2847,12 +2951,8 @@ impl Timeline {
                        Err(e) => return Err(PageReconstructError::from(e)),
                    };
                    cont_lsn = lsn_floor;
-                    // metrics: open_layer does not count as fs access, so we are not updating `read_count`
-                    traversal_path.push((
-                        result,
-                        cont_lsn,
-                        Box::new(move || open_layer.traversal_id()),
-                    ));
+                    *read_count += 1;
+                    traversal_path.push((result, cont_lsn, open_layer.traversal_id()));
                    continue 'outer;
                }
            }
@@ -2878,12 +2978,8 @@ impl Timeline {
                        Err(e) => return Err(PageReconstructError::from(e)),
                    };
                    cont_lsn = lsn_floor;
-                    // metrics: open_layer does not count as fs access, so we are not updating `read_count`
-                    traversal_path.push((
-                        result,
-                        cont_lsn,
-                        Box::new(move || frozen_layer.traversal_id()),
-                    ));
+                    *read_count += 1;
+                    traversal_path.push((result, cont_lsn, frozen_layer.traversal_id()));
                    continue 'outer;
                }
            }
@@ -2904,14 +3000,7 @@ impl Timeline {
                };
                cont_lsn = lsn_floor;
                *read_count += 1;
-                traversal_path.push((
-                    result,
-                    cont_lsn,
-                    Box::new({
-                        let layer = layer.to_owned();
-                        move || layer.traversal_id()
-                    }),
-                ));
+                traversal_path.push((result, cont_lsn, layer.traversal_id()));
                continue 'outer;
            } else if timeline.ancestor_timeline.is_some() {
                // Nothing on this timeline. Traverse to parent
@@ -2964,6 +3053,41 @@ impl Timeline {
            .await?;

            keyspace.remove_overlapping_with(&completed);
+
+            // Do not descend into the ancestor timeline for aux files.
+            // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
+            // stalling compaction.
+            // TODO(chi): this will need to be updated for aux files v2 storage
+            if keyspace.overlaps(&NON_INHERITED_RANGE) {
+                let removed = keyspace.remove_overlapping_with(&KeySpace {
+                    ranges: vec![NON_INHERITED_RANGE],
+                });
+
+                for range in removed.ranges {
+                    let mut key = range.start;
+                    while key < range.end {
+                        reconstruct_state.on_key_error(
+                            key,
+                            PageReconstructError::MissingKey(MissingKeyError {
+                                stuck_at_lsn: false,
+                                key,
+                                shard: self.shard_identity.get_shard_number(&key),
+                                cont_lsn,
+                                request_lsn,
+                                ancestor_lsn: None,
+                                traversal_path: Vec::default(),
+                                backtrace: if cfg!(test) {
+                                    Some(std::backtrace::Backtrace::force_capture())
+                                } else {
+                                    None
+                                },
+                            }),
+                        );
+                        key = key.next();
+                    }
+                }
+            }
+
            if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() {
                break;
            }
@@ -3019,55 +3143,61 @@ impl Timeline {
            unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
            completed_keyspace.merge(&keys_done_last_step);

-            let guard = timeline.layers.read().await;
-            let layers = guard.layer_map();
+            // Do not descent any further if the last layer we visited
+            // completed all keys in the keyspace it inspected. This is not
+            // required for correctness, but avoids visiting extra layers
+            // which turns out to be a perf bottleneck in some cases.
+            if !unmapped_keyspace.is_empty() {
+                let guard = timeline.layers.read().await;
+                let layers = guard.layer_map();

-            let in_memory_layer = layers.find_in_memory_layer(|l| {
-                let start_lsn = l.get_lsn_range().start;
-                cont_lsn > start_lsn
-            });
+                let in_memory_layer = layers.find_in_memory_layer(|l| {
+                    let start_lsn = l.get_lsn_range().start;
+                    cont_lsn > start_lsn
+                });

-            match in_memory_layer {
-                Some(l) => {
-                    let lsn_range = l.get_lsn_range().start..cont_lsn;
-                    fringe.update(
-                        ReadableLayer::InMemoryLayer(l),
-                        unmapped_keyspace.clone(),
-                        lsn_range,
-                    );
-                }
-                None => {
-                    for range in unmapped_keyspace.ranges.iter() {
-                        let results = layers.range_search(range.clone(), cont_lsn);
+                match in_memory_layer {
+                    Some(l) => {
+                        let lsn_range = l.get_lsn_range().start..cont_lsn;
+                        fringe.update(
+                            ReadableLayer::InMemoryLayer(l),
+                            unmapped_keyspace.clone(),
+                            lsn_range,
+                        );
+                    }
+                    None => {
+                        for range in unmapped_keyspace.ranges.iter() {
+                            let results = layers.range_search(range.clone(), cont_lsn);

-                        results
-                            .found
-                            .into_iter()
-                            .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
-                                (
-                                    ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
-                                    keyspace_accum.to_keyspace(),
-                                    lsn_floor..cont_lsn,
-                                )
-                            })
-                            .for_each(|(layer, keyspace, lsn_range)| {
-                                fringe.update(layer, keyspace, lsn_range)
-                            });
+                            results
+                                .found
+                                .into_iter()
+                                .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
+                                    (
+                                        ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
+                                        keyspace_accum.to_keyspace(),
+                                        lsn_floor..cont_lsn,
+                                    )
+                                })
+                                .for_each(|(layer, keyspace, lsn_range)| {
+                                    fringe.update(layer, keyspace, lsn_range)
+                                });
+                        }
                    }
                }
-            }

-            // It's safe to drop the layer map lock after planning the next round of reads.
-            // The fringe keeps readable handles for the layers which are safe to read even
-            // if layers were compacted or flushed.
-            //
-            // The more interesting consideration is: "Why is the read algorithm still correct
-            // if the layer map changes while it is operating?". Doing a vectored read on a
-            // timeline boils down to pushing an imaginary lsn boundary downwards for each range
-            // covered by the read. The layer map tells us how to move the lsn downwards for a
-            // range at *a particular point in time*. It is fine for the answer to be different
-            // at two different time points.
-            drop(guard);
+                // It's safe to drop the layer map lock after planning the next round of reads.
+                // The fringe keeps readable handles for the layers which are safe to read even
+                // if layers were compacted or flushed.
+                //
+                // The more interesting consideration is: "Why is the read algorithm still correct
+                // if the layer map changes while it is operating?". Doing a vectored read on a
+                // timeline boils down to pushing an imaginary lsn boundary downwards for each range
+                // covered by the read. The layer map tells us how to move the lsn downwards for a
+                // range at *a particular point in time*. It is fine for the answer to be different
+                // at two different time points.
+                drop(guard);
+            }

            if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
                let next_cont_lsn = lsn_range.start;
@@ -3082,6 +3212,8 @@ impl Timeline {

                unmapped_keyspace = keyspace_to_read;
                cont_lsn = next_cont_lsn;
+
+                reconstruct_state.on_layer_visited();
            } else {
                break;
            }
@@ -3525,7 +3657,7 @@ impl Timeline {
        &self,
        disk_consistent_lsn: Lsn,
        layers_to_upload: impl IntoIterator<Item = ResidentLayer>,
-    ) -> anyhow::Result<TimelineMetadata> {
+    ) -> anyhow::Result<()> {
        // We can only save a valid 'prev_record_lsn' value on disk if we
        // flushed *all* in-memory changes to disk. We only track
        // 'prev_record_lsn' in memory for the latest processed record, so we
@@ -3542,19 +3674,10 @@ impl Timeline {
            None
        };

-        let ancestor_timeline_id = self
-            .ancestor_timeline
-            .as_ref()
-            .map(|ancestor| ancestor.timeline_id);
-
-        let metadata = TimelineMetadata::new(
+        let update = crate::tenant::metadata::MetadataUpdate::new(
            disk_consistent_lsn,
            ondisk_prev_record_lsn,
-            ancestor_timeline_id,
-            self.ancestor_lsn,
            *self.latest_gc_cutoff_lsn.read(),
-            self.initdb_lsn,
-            self.pg_version,
        );

        fail_point!("checkpoint-before-saving-metadata", |x| bail!(
@@ -3566,10 +3689,10 @@ impl Timeline {
            for layer in layers_to_upload {
                remote_client.schedule_layer_file_upload(layer)?;
            }
-            remote_client.schedule_index_upload_for_metadata_update(&metadata)?;
+            remote_client.schedule_index_upload_for_metadata_update(&update)?;
        }

-        Ok(metadata)
+        Ok(())
    }

    pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> {
@@ -4143,9 +4266,8 @@ impl Timeline {
                *self.get_latest_gc_cutoff_lsn()
            }
        } else {
-            // No time-based retention was configured. Set time-based cutoff to
-            // same as LSN based.
-            cutoff_horizon
+            // No time-based retention was configured. Interpret this as "keep no history".
+            self.get_last_record_lsn()
        };

        // Grab the lock and update the values
@@ -4665,35 +4787,7 @@ impl Timeline {
    }
 }

-type TraversalPathItem = (
-    ValueReconstructResult,
-    Lsn,
-    Box<dyn Send + FnOnce() -> TraversalId>,
-);
-
-/// Helper function for get_reconstruct_data() to add the path of layers traversed
-/// to an error, as anyhow context information.
-fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageReconstructError {
-    // We want the original 'msg' to be the outermost context. The outermost context
-    // is the most high-level information, which also gets propagated to the client.
-    let mut msg_iter = path
-        .into_iter()
-        .map(|(r, c, l)| {
-            format!(
-                "layer traversal: result {:?}, cont_lsn {}, layer: {}",
-                r,
-                c,
-                l(),
-            )
-        })
-        .chain(std::iter::once(msg));
-    // Construct initial message from the first traversed layer
-    let err = anyhow!(msg_iter.next().unwrap());
-
-    // Append all subsequent traversals, and the error message 'msg', as contexts.
-    let msg = msg_iter.fold(err, |err, msg| err.context(msg));
-    PageReconstructError::from(msg)
-}
+type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);

 struct TimelineWriterState {
    open_layer: Arc<InMemoryLayer>,
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -188,24 +188,10 @@ impl Timeline {
    ) -> ControlFlow<()> {
        let now = SystemTime::now();

-        let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
-            BackgroundLoopKind::Eviction,
-            ctx,
-        );
+        let permit = self.acquire_imitation_permit(cancel, ctx).await?;

-        let _permit = tokio::select! {
-            permit = acquire_permit => permit,
-            _ = cancel.cancelled() => return ControlFlow::Break(()),
-            _ = self.cancel.cancelled() => return ControlFlow::Break(()),
-        };
-
-        match self
-            .imitate_layer_accesses(tenant, p, cancel, gate, ctx)
-            .await
-        {
-            ControlFlow::Break(()) => return ControlFlow::Break(()),
-            ControlFlow::Continue(()) => (),
-        }
+        self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx)
+            .await?;

        #[derive(Debug, Default)]
        struct EvictionStats {
@@ -330,19 +316,27 @@ impl Timeline {
        gate: &GateGuard,
        ctx: &RequestContext,
    ) -> ControlFlow<()> {
+        let permit = self.acquire_imitation_permit(cancel, ctx).await?;
+
+        self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx)
+            .await
+    }
+
+    async fn acquire_imitation_permit(
+        &self,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> {
        let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
            BackgroundLoopKind::Eviction,
            ctx,
        );

-        let _permit = tokio::select! {
-            permit = acquire_permit => permit,
-            _ = cancel.cancelled() => return ControlFlow::Break(()),
-            _ = self.cancel.cancelled() => return ControlFlow::Break(()),
-        };
-
-        self.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
-            .await
+        tokio::select! {
+            permit = acquire_permit => ControlFlow::Continue(permit),
+            _ = cancel.cancelled() => ControlFlow::Break(()),
+            _ = self.cancel.cancelled() => ControlFlow::Break(()),
+        }
    }

    /// If we evict layers but keep cached values derived from those layers, then
@@ -376,6 +370,7 @@ impl Timeline {
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
        gate: &GateGuard,
+        permit: tokio::sync::SemaphorePermit<'static>,
        ctx: &RequestContext,
    ) -> ControlFlow<()> {
        if !self.tenant_shard_id.is_shard_zero() {
@@ -408,7 +403,28 @@ impl Timeline {
        // Make one of the tenant's timelines draw the short straw and run the calculation.
        // The others wait until the calculation is done so that they take into account the
        // imitated accesses that the winner made.
-        let mut state = tenant.eviction_task_tenant_state.lock().await;
+        let (mut state, _permit) = {
+            if let Ok(locked) = tenant.eviction_task_tenant_state.try_lock() {
+                (locked, permit)
+            } else {
+                // we might need to wait for a long time here in case of pathological synthetic
+                // size calculation performance
+                drop(permit);
+                let locked = tokio::select! {
+                    locked = tenant.eviction_task_tenant_state.lock() => locked,
+                    _ = self.cancel.cancelled() => {
+                        return ControlFlow::Break(())
+                    },
+                    _ = cancel.cancelled() => {
+                        return ControlFlow::Break(())
+                    }
+                };
+                // then reacquire -- this will be bad if there is a lot of traffic, but because we
+                // released the permit, the overall latency will be much better.
+                let permit = self.acquire_imitation_permit(cancel, ctx).await?;
+                (locked, permit)
+            }
+        };
        match state.last_layer_access_imitation {
            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
            _ => {
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -67,7 +67,6 @@ routerify.workspace = true
 rustc-hash.workspace = true
 rustls-pemfile.workspace = true
 rustls.workspace = true
-rustls-native-certs = "0.7.0"
 scopeguard.workspace = true
 serde.workspace = true
 serde_json.workspace = true
@@ -82,10 +81,9 @@ thiserror.workspace = true
 tikv-jemallocator.workspace = true
 tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
 tokio-postgres.workspace = true
-tokio-postgres-rustls.workspace = true
 tokio-rustls.workspace = true
 tokio-util.workspace = true
-tokio = { workspace = true, features = ["signal", "tracing"] }
+tokio = { workspace = true, features = ["signal"] }
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
 tracing-utils.workspace = true
@@ -96,11 +94,11 @@ utils.workspace = true
 uuid.workspace = true
 webpki-roots.workspace = true
 x509-parser.workspace = true
+native-tls.workspace = true
+postgres-native-tls.workspace = true
 postgres-protocol.workspace = true
 redis.workspace = true

-console-subscriber = "0.2.0"
-
 workspace_hack.workspace = true

 [dev-dependencies]
@@ -108,5 +106,6 @@ camino-tempfile.workspace = true
 fallible-iterator.workspace = true
 rcgen.workspace = true
 rstest.workspace = true
+tokio-postgres-rustls.workspace = true
 walkdir.workspace = true
 rand_distr = "0.4"
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -121,5 +121,6 @@ pub(super) async fn authenticate(
    Ok(NodeInfo {
        config,
        aux: db_info.aux,
+        allow_self_signed_compute: false, // caller may override
    })
 }
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -35,8 +35,6 @@ use proxy::config::{self, ProxyConfig};
 use proxy::serverless;
 use std::net::SocketAddr;
 use std::pin::pin;
-use std::sync::atomic::AtomicUsize;
-use std::sync::atomic::Ordering;
 use std::sync::Arc;
 use tokio::net::TcpListener;
 use tokio::sync::Mutex;
@@ -122,6 +120,9 @@ struct ProxyCliArgs {
    /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
    #[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)]
    wake_compute_lock: String,
+    /// Allow self-signed certificates for compute nodes (for testing)
+    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    allow_self_signed_compute: bool,
    #[clap(flatten)]
    sql_over_http: SqlOverHttpArgs,
    /// timeout for scram authentication protocol
@@ -199,6 +200,12 @@ struct ProxyCliArgs {
    /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression.
    #[clap(long, default_value = "4194304")]
    metric_backup_collection_chunk_size: usize,
+    /// Whether to retry the connection to the compute node
+    #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
+    connect_to_compute_retry: String,
+    /// Whether to retry the wake_compute request
+    #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)]
+    wake_compute_retry: String,
 }

 #[derive(clap::Args, Clone, Copy, Debug)]
@@ -235,21 +242,8 @@ struct SqlOverHttpArgs {
    sql_over_http_pool_shards: usize,
 }

-fn main() -> anyhow::Result<()> {
-    let rt = tokio::runtime::Builder::new_multi_thread()
-        .enable_all()
-        .thread_name_fn(|| {
-            static ATOMIC_ID: AtomicUsize = AtomicUsize::new(0);
-            let id = ATOMIC_ID.fetch_add(1, Ordering::SeqCst);
-            format!("worker-{}", id)
-        })
-        .build()
-        .unwrap();
-
-    rt.block_on(main2())
-}
-
-async fn main2() -> anyhow::Result<()> {
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
    let _logging_guard = proxy::logging::init().await?;
    let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
@@ -345,7 +339,7 @@ async fn main2() -> anyhow::Result<()> {

    let cancel_map = CancelMap::default();

-    let redis_publisher = match &regional_redis_client {
+    let redis_publisher = match &redis_notifications_client {
        Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
            redis_publisher.clone(),
            args.region.clone(),
@@ -364,16 +358,12 @@ async fn main2() -> anyhow::Result<()> {
    // client facing tasks. these will exit on error or on cancellation
    // cancellation returns Ok(())
    let mut client_tasks = JoinSet::new();
-    client_tasks
-        .build_task()
-        .name("tcp main")
-        .spawn(proxy::proxy::task_main(
-            config,
-            proxy_listener,
-            cancellation_token.clone(),
-            cancellation_handler.clone(),
-        ))
-        .unwrap();
+    client_tasks.spawn(proxy::proxy::task_main(
+        config,
+        proxy_listener,
+        cancellation_token.clone(),
+        cancellation_handler.clone(),
+    ));

    // TODO: rename the argument to something like serverless.
    // It now covers more than just websockets, it also covers SQL over HTTP.
@@ -382,98 +372,61 @@ async fn main2() -> anyhow::Result<()> {
        info!("Starting wss on {serverless_address}");
        let serverless_listener = TcpListener::bind(serverless_address).await?;

-        client_tasks
-            .build_task()
-            .name("serverless main")
-            .spawn(serverless::task_main(
-                config,
-                serverless_listener,
-                cancellation_token.clone(),
-                cancellation_handler.clone(),
-            ))
-            .unwrap();
+        client_tasks.spawn(serverless::task_main(
+            config,
+            serverless_listener,
+            cancellation_token.clone(),
+            cancellation_handler.clone(),
+        ));
    }

-    client_tasks
-        .build_task()
-        .name("parquet worker")
-        .spawn(proxy::context::parquet::worker(
-            cancellation_token.clone(),
-            args.parquet_upload,
-        ))
-        .unwrap();
+    client_tasks.spawn(proxy::context::parquet::worker(
+        cancellation_token.clone(),
+        args.parquet_upload,
+    ));

    // maintenance tasks. these never return unless there's an error
    let mut maintenance_tasks = JoinSet::new();
-    maintenance_tasks
-        .build_task()
-        .name("signal handler")
-        .spawn(proxy::handle_signals(cancellation_token.clone()))
-        .unwrap();
-    maintenance_tasks
-        .build_task()
-        .name("health server")
-        .spawn(http::health_server::task_main(
-            http_listener,
-            AppMetrics {
-                jemalloc,
-                neon_metrics,
-                proxy: proxy::metrics::Metrics::get(),
-            },
-        ))
-        .unwrap();
-    maintenance_tasks
-        .build_task()
-        .name("mangement main")
-        .spawn(console::mgmt::task_main(mgmt_listener))
-        .unwrap();
+    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone()));
+    maintenance_tasks.spawn(http::health_server::task_main(
+        http_listener,
+        AppMetrics {
+            jemalloc,
+            neon_metrics,
+            proxy: proxy::metrics::Metrics::get(),
+        },
+    ));
+    maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));

    if let Some(metrics_config) = &config.metric_collection {
        // TODO: Add gc regardles of the metric collection being enabled.
-        maintenance_tasks
-            .build_task()
-            .name("")
-            .spawn(usage_metrics::task_main(metrics_config))
-            .unwrap();
-        client_tasks
-            .build_task()
-            .name("")
-            .spawn(usage_metrics::task_backup(
-                &metrics_config.backup_metric_collection_config,
-                cancellation_token,
-            ))
-            .unwrap();
+        maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
+        client_tasks.spawn(usage_metrics::task_backup(
+            &metrics_config.backup_metric_collection_config,
+            cancellation_token.clone(),
+        ));
    }

    if let auth::BackendType::Console(api, _) = &config.auth_backend {
        if let proxy::console::provider::ConsoleBackend::Console(api) = &**api {
            if let Some(redis_notifications_client) = redis_notifications_client {
                let cache = api.caches.project_info.clone();
-                maintenance_tasks
-                    .build_task()
-                    .name("redis notifications")
-                    .spawn(notifications::task_main(
-                        redis_notifications_client,
-                        cache.clone(),
-                        cancel_map.clone(),
-                        args.region.clone(),
-                    ))
-                    .unwrap();
-                maintenance_tasks
-                    .build_task()
-                    .name("proj info cache gc")
-                    .spawn(async move { cache.clone().gc_worker().await })
-                    .unwrap();
+                maintenance_tasks.spawn(notifications::task_main(
+                    redis_notifications_client,
+                    cache.clone(),
+                    cancel_map.clone(),
+                    args.region.clone(),
+                ));
+                maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
            }
            if let Some(regional_redis_client) = regional_redis_client {
                let cache = api.caches.endpoints_cache.clone();
                let con = regional_redis_client;
                let span = tracing::info_span!("endpoints_cache");
-                maintenance_tasks
-                    .build_task()
-                    .name("redis endpoints cache read")
-                    .spawn(async move { cache.do_read(con).await }.instrument(span))
-                    .unwrap();
+                maintenance_tasks.spawn(
+                    async move { cache.do_read(con, cancellation_token.clone()).await }
+                        .instrument(span),
+                );
            }
        }
    }
@@ -514,6 +467,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        _ => bail!("either both or neither tls-key and tls-cert must be specified"),
    };

+    if args.allow_self_signed_compute {
+        warn!("allowing self-signed compute certificates");
+    }
    let backup_metric_collection_config = config::MetricBackupCollectionConfig {
        interval: args.metric_backup_collection_interval,
        remote_storage_config: remote_storage_from_toml(
@@ -578,10 +534,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
                )
                .unwrap(),
            ));
-            tokio::task::Builder::new()
-                .name("wake compute lock gc")
-                .spawn(locks.garbage_collect_worker())
-                .unwrap();
+            tokio::spawn(locks.garbage_collect_worker());

            let url = args.auth_endpoint.parse()?;
            let endpoint = http::Endpoint::new(url, http::new_client());
@@ -631,6 +584,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        tls_config,
        auth_backend,
        metric_collection,
+        allow_self_signed_compute: args.allow_self_signed_compute,
        http_config,
        authentication_config,
        require_client_ip: args.require_client_ip,
@@ -639,6 +593,10 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        handshake_timeout: args.handshake_timeout,
        region: args.region.clone(),
        aws_region: args.aws_region.clone(),
+        wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
+        connect_to_compute_retry_config: config::RetryConfig::parse(
+            &args.connect_to_compute_retry,
+        )?,
    }));

    Ok(config)
--- a/proxy/src/cache/endpoints.rs
+++ b/proxy/src/cache/endpoints.rs
@@ -4,6 +4,7 @@ use std::{
        atomic::{AtomicBool, Ordering},
        Arc,
    },
+    time::Duration,
 };

 use dashmap::DashSet;
@@ -13,6 +14,7 @@ use redis::{
 };
 use serde::Deserialize;
 use tokio::sync::Mutex;
+use tokio_util::sync::CancellationToken;
 use tracing::info;

 use crate::{
@@ -111,16 +113,22 @@ impl EndpointsCache {
    pub async fn do_read(
        &self,
        mut con: ConnectionWithCredentialsProvider,
+        cancellation_token: CancellationToken,
    ) -> anyhow::Result<Infallible> {
        let mut last_id = "0-0".to_string();
        loop {
-            self.ready.store(false, Ordering::Release);
            if let Err(e) = con.connect().await {
                tracing::error!("error connecting to redis: {:?}", e);
-                continue;
+                self.ready.store(false, Ordering::Release);
            }
            if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await {
                tracing::error!("error reading from redis: {:?}", e);
+                self.ready.store(false, Ordering::Release);
+            }
+            if cancellation_token.is_cancelled() {
+                info!("cancellation token is cancelled, exiting");
+                tokio::time::sleep(Duration::from_secs(60 * 60 * 24 * 7)).await;
+                // 1 week.
            }
            tokio::time::sleep(self.config.retry_interval).await;
        }
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -10,12 +10,7 @@ use crate::{
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
-use std::{
-    io,
-    net::SocketAddr,
-    sync::{Arc, OnceLock},
-    time::Duration,
-};
+use std::{io, net::SocketAddr, time::Duration};
 use thiserror::Error;
 use tokio::net::TcpStream;
 use tokio_postgres::tls::MakeTlsConnect;
@@ -33,6 +28,9 @@ pub enum ConnectionError {
    #[error("{COULD_NOT_CONNECT}: {0}")]
    CouldNotConnect(#[from] io::Error),

+    #[error("{COULD_NOT_CONNECT}: {0}")]
+    TlsError(#[from] native_tls::Error),
+
    #[error("{COULD_NOT_CONNECT}: {0}")]
    WakeComputeError(#[from] WakeComputeError),
 }
@@ -71,6 +69,7 @@ impl ReportableError for ConnectionError {
            }
            ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute,
            ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute,
+            ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute,
            ConnectionError::WakeComputeError(e) => e.get_error_kind(),
        }
    }
@@ -240,7 +239,7 @@ pub struct PostgresConnection {
    /// Socket connected to a compute node.
    pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream<
        tokio::net::TcpStream,
-        tokio_postgres_rustls::RustlsStream<tokio::net::TcpStream>,
+        postgres_native_tls::TlsStream<tokio::net::TcpStream>,
    >,
    /// PostgreSQL connection parameters.
    pub params: std::collections::HashMap<String, String>,
@@ -252,39 +251,22 @@ pub struct PostgresConnection {
    _guage: NumDbConnectionsGuard<'static>,
 }

-static ROOT_STORE: OnceLock<Arc<rustls::RootCertStore>> = OnceLock::new();
-
 impl ConnCfg {
    /// Connect to a corresponding compute node.
    pub async fn connect(
        &self,
        ctx: &mut RequestMonitoring,
+        allow_self_signed_compute: bool,
        aux: MetricsAuxInfo,
        timeout: Duration,
    ) -> Result<PostgresConnection, ConnectionError> {
        let (socket_addr, stream, host) = self.connect_raw(timeout).await?;

-        let root_store = ROOT_STORE.get_or_init(|| {
-            let mut roots = rustls::RootCertStore::empty();
-
-            let certs = match rustls_native_certs::load_native_certs() {
-                Ok(certs) => certs,
-                Err(e) => {
-                    error!("could not load native ssl certs: {e:?}");
-                    return Arc::new(roots);
-                }
-            };
-
-            let (added, ignored) = roots.add_parsable_certificates(certs);
-            info!(added, ignored, "loaded native ssl certifications");
-
-            Arc::new(roots)
-        });
-
-        let client_config = rustls::ClientConfig::builder()
-            .with_root_certificates(root_store.clone())
-            .with_no_client_auth();
-        let mut mk_tls = tokio_postgres_rustls::MakeRustlsConnect::new(client_config);
+        let tls_connector = native_tls::TlsConnector::builder()
+            .danger_accept_invalid_certs(allow_self_signed_compute)
+            .build()
+            .unwrap();
+        let mut mk_tls = postgres_native_tls::MakeTlsConnector::new(tls_connector);
        let tls = MakeTlsConnect::<tokio::net::TcpStream>::make_tls_connect(&mut mk_tls, host)?;

        // connect_raw() will not use TLS if sslmode is "disable"
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -24,6 +24,7 @@ pub struct ProxyConfig {
    pub tls_config: Option<TlsConfig>,
    pub auth_backend: auth::BackendType<'static, (), ()>,
    pub metric_collection: Option<MetricCollectionConfig>,
+    pub allow_self_signed_compute: bool,
    pub http_config: HttpConfig,
    pub authentication_config: AuthenticationConfig,
    pub require_client_ip: bool,
@@ -32,6 +33,8 @@ pub struct ProxyConfig {
    pub region: String,
    pub handshake_timeout: Duration,
    pub aws_region: String,
+    pub wake_compute_retry_config: RetryConfig,
+    pub connect_to_compute_retry_config: RetryConfig,
 }

 #[derive(Debug)]
@@ -516,6 +519,59 @@ impl FromStr for ProjectInfoCacheOptions {
    }
 }

+/// This is a config for connect to compute and wake compute.
+#[derive(Clone, Copy, Debug)]
+pub struct RetryConfig {
+    /// Number of times we should retry.
+    pub max_retries: u32,
+    /// Retry duration is base_delay * backoff_factor ^ n, where n starts at 0
+    pub base_delay: tokio::time::Duration,
+    /// Exponential base for retry wait duration
+    pub backoff_factor: f64,
+}
+
+impl RetryConfig {
+    /// Default options for RetryConfig.
+
+    /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s.
+    pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str =
+        "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6";
+    /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s.
+    /// Cplane has timeout of 60s on each request. 8m7s in total.
+    pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str =
+        "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6";
+
+    /// Parse retry options passed via cmdline.
+    /// Example: [`Self::CONNECT_TO_COMPUTE_DEFAULT_VALUES`].
+    pub fn parse(options: &str) -> anyhow::Result<Self> {
+        let mut num_retries = None;
+        let mut base_retry_wait_duration = None;
+        let mut retry_wait_exponent_base = None;
+
+        for option in options.split(',') {
+            let (key, value) = option
+                .split_once('=')
+                .with_context(|| format!("bad key-value pair: {option}"))?;
+
+            match key {
+                "num_retries" => num_retries = Some(value.parse()?),
+                "base_retry_wait_duration" => {
+                    base_retry_wait_duration = Some(humantime::parse_duration(value)?)
+                }
+                "retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?),
+                unknown => bail!("unknown key: {unknown}"),
+            }
+        }
+
+        Ok(Self {
+            max_retries: num_retries.context("missing `num_retries`")?,
+            base_delay: base_retry_wait_duration.context("missing `base_retry_wait_duration`")?,
+            backoff_factor: retry_wait_exponent_base
+                .context("missing `retry_wait_exponent_base`")?,
+        })
+    }
+}
+
 /// Helper for cmdline cache options parsing.
 pub struct WakeComputeLockOptions {
    /// The number of shards the lock map should have
--- a/proxy/src/console/mgmt.rs
+++ b/proxy/src/console/mgmt.rs
@@ -40,31 +40,28 @@ pub async fn task_main(listener: TcpListener) -> anyhow::Result<Infallible> {

        let span = info_span!("mgmt", peer = %peer_addr);

-        tokio::task::Builder::new()
-            .name("mgmt handler")
-            .spawn(
-                async move {
-                    info!("serving a new console management API connection");
+        tokio::task::spawn(
+            async move {
+                info!("serving a new console management API connection");

-                    // these might be long running connections, have a separate logging for cancelling
-                    // on shutdown and other ways of stopping.
-                    let cancelled = scopeguard::guard(tracing::Span::current(), |span| {
-                        let _e = span.entered();
-                        info!("console management API task cancelled");
-                    });
+                // these might be long running connections, have a separate logging for cancelling
+                // on shutdown and other ways of stopping.
+                let cancelled = scopeguard::guard(tracing::Span::current(), |span| {
+                    let _e = span.entered();
+                    info!("console management API task cancelled");
+                });

-                    if let Err(e) = handle_connection(socket).await {
-                        error!("serving failed with an error: {e}");
-                    } else {
-                        info!("serving completed");
-                    }
-
-                    // we can no longer get dropped
-                    scopeguard::ScopeGuard::into_inner(cancelled);
+                if let Err(e) = handle_connection(socket).await {
+                    error!("serving failed with an error: {e}");
+                } else {
+                    info!("serving completed");
                }
-                .instrument(span),
-            )
-            .unwrap();
+
+                // we can no longer get dropped
+                scopeguard::ScopeGuard::into_inner(cancelled);
+            }
+            .instrument(span),
+        );
    }
 }

--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -293,6 +293,9 @@ pub struct NodeInfo {

    /// Labels for proxy's metrics.
    pub aux: MetricsAuxInfo,
+
+    /// Whether we should accept self-signed certificates (for testing)
+    pub allow_self_signed_compute: bool,
 }

 impl NodeInfo {
@@ -301,9 +304,17 @@ impl NodeInfo {
        ctx: &mut RequestMonitoring,
        timeout: Duration,
    ) -> Result<compute::PostgresConnection, compute::ConnectionError> {
-        self.config.connect(ctx, self.aux.clone(), timeout).await
+        self.config
+            .connect(
+                ctx,
+                self.allow_self_signed_compute,
+                self.aux.clone(),
+                timeout,
+            )
+            .await
    }
    pub fn reuse_settings(&mut self, other: Self) {
+        self.allow_self_signed_compute = other.allow_self_signed_compute;
        self.config.reuse_password(other.config);
    }

--- a/proxy/src/console/provider/mock.rs
+++ b/proxy/src/console/provider/mock.rs
@@ -63,10 +63,7 @@ impl Api {
            let (client, connection) =
                tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?;

-            tokio::task::Builder::new()
-                .name("mock conn")
-                .spawn(connection)
-                .unwrap();
+            tokio::spawn(connection);
            let secret = match get_execute_postgres_query(
                &client,
                "select rolpassword from pg_catalog.pg_authid where rolname = $1",
@@ -129,6 +126,7 @@ impl Api {
                branch_id: (&BranchId::from("branch")).into(),
                cold_start_info: crate::console::messages::ColdStartInfo::Warm,
            },
+            allow_self_signed_compute: false,
        };

        Ok(node)
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -175,6 +175,7 @@ impl Api {
            let node = NodeInfo {
                config,
                aux: body.aux,
+                allow_self_signed_compute: false,
            };

            Ok(node)
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -141,15 +141,12 @@ pub async fn worker(
    LOG_CHAN.set(tx.downgrade()).unwrap();

    // setup row stream that will close on cancellation
-    tokio::task::Builder::new()
-        .name("drop parquet conn")
-        .spawn(async move {
-            cancellation_token.cancelled().await;
-            // dropping this sender will cause the channel to close only once
-            // all the remaining inflight requests have been completed.
-            drop(tx);
-        })
-        .unwrap();
+    tokio::spawn(async move {
+        cancellation_token.cancelled().await;
+        // dropping this sender will cause the channel to close only once
+        // all the remaining inflight requests have been completed.
+        drop(tx);
+    });
    let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx));
    let rx = rx.map(RequestData::from);

@@ -416,6 +413,7 @@ mod tests {
                    )
                    .unwrap(),
                    max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
+                    upload_storage_class: None,
                }),
                timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
            })
--- a/proxy/src/http/health_server.rs
+++ b/proxy/src/http/health_server.rs
@@ -75,10 +75,6 @@ async fn prometheus_metrics_handler(

    let span = info_span!("blocking");
    let body = tokio::task::spawn_blocking(move || {
-        // there are situations where we lose scraped metrics under load, try to gather some clues
-        // since all nodes are queried this, keep the message count low.
-        let spawned_at = std::time::Instant::now();
-
        let _span = span.entered();

        let mut state = state.lock().unwrap();
@@ -88,19 +84,11 @@ async fn prometheus_metrics_handler(
            .collect_group_into(&mut *encoder)
            .unwrap_or_else(|infallible| match infallible {});

-        let encoded_at = std::time::Instant::now();
-
        let body = encoder.finish();

-        let spawned_in = spawned_at - started_at;
-        let encoded_in = encoded_at - spawned_at;
-        let total = encoded_at - started_at;
-
        tracing::info!(
            bytes = body.len(),
-            total_ms = total.as_millis(),
-            spawning_ms = spawned_in.as_millis(),
-            encoding_ms = encoded_in.as_millis(),
+            elapsed_ms = started_at.elapsed().as_millis(),
            "responded /metrics"
        );

--- a/proxy/src/logging.rs
+++ b/proxy/src/logging.rs
@@ -26,12 +26,7 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
        .await
        .map(OpenTelemetryLayer::new);

-    // spawn the console server in the background,
-    // returning a `Layer`:
-    let console_layer = console_subscriber::spawn();
-
    tracing_subscriber::registry()
-        .with(console_layer)
        .with(env_filter)
        .with(otlp_layer)
        .with(fmt_layer)
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -119,6 +119,10 @@ pub struct ProxyMetrics {

    /// Number of invalid endpoints (per protocol, per rejected).
    pub invalid_endpoints_total: CounterVec<InvalidEndpointsSet>,
+
+    /// Number of retries (per outcome, per retry_type).
+    #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))]
+    pub retries_metric: HistogramVec<RetriesMetricSet, 9>,
 }

 #[derive(MetricGroup)]
@@ -480,3 +484,16 @@ pub struct InvalidEndpointsGroup {
    pub rejected: Bool,
    pub outcome: ConnectOutcome,
 }
+
+#[derive(LabelGroup)]
+#[label(set = RetriesMetricSet)]
+pub struct RetriesMetricGroup {
+    pub outcome: ConnectOutcome,
+    pub retry_type: RetryType,
+}
+
+#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
+pub enum RetryType {
+    WakeCompute,
+    ConnectToCompute,
+}
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -87,7 +87,7 @@ pub async fn task_main(

        tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection");

-        tokio::task::Builder::new().name("tcp client connection").spawn(connections.track_future(async move {
+        connections.spawn(async move {
            let mut socket = WithClientIp::new(socket);
            let mut peer_addr = peer_addr.ip();
            match socket.wait_for_addr().await {
@@ -152,7 +152,7 @@ pub async fn task_main(
                    }
                }
            }
-        })).unwrap();
+        });
    }

    connections.close();
@@ -178,6 +178,13 @@ impl ClientMode {
        }
    }

+    pub fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool {
+        match self {
+            ClientMode::Tcp => config.allow_self_signed_compute,
+            ClientMode::Websockets { .. } => false,
+        }
+    }
+
    fn hostname<'a, S>(&'a self, s: &'a Stream<S>) -> Option<&'a str> {
        match self {
            ClientMode::Tcp => s.sni_hostname(),
@@ -296,9 +303,16 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
        }
    };

-    let mut node = connect_to_compute(ctx, &TcpMechanism { params: &params }, &user_info)
-        .or_else(|e| stream.throw_error(e))
-        .await?;
+    let mut node = connect_to_compute(
+        ctx,
+        &TcpMechanism { params: &params },
+        &user_info,
+        mode.allow_self_signed_compute(config),
+        config.wake_compute_retry_config,
+        config.connect_to_compute_retry_config,
+    )
+    .or_else(|e| stream.throw_error(e))
+    .await?;

    let session = cancellation_handler.get_session();
    prepare_client_connection(&node, &session, &mut stream).await?;
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -1,10 +1,11 @@
 use crate::{
    auth::backend::ComputeCredentialKeys,
    compute::{self, PostgresConnection},
+    config::RetryConfig,
    console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo},
    context::RequestMonitoring,
    error::ReportableError,
-    metrics::{ConnectionFailureKind, Metrics},
+    metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType},
    proxy::{
        retry::{retry_after, ShouldRetry},
        wake_compute::wake_compute,
@@ -92,17 +93,24 @@ pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
    ctx: &mut RequestMonitoring,
    mechanism: &M,
    user_info: &B,
+    allow_self_signed_compute: bool,
+    wake_compute_retry_config: RetryConfig,
+    connect_to_compute_retry_config: RetryConfig,
 ) -> Result<M::Connection, M::Error>
 where
    M::ConnectError: ShouldRetry + std::fmt::Debug,
    M::Error: From<WakeComputeError>,
 {
    let mut num_retries = 0;
-    let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
+    let mut node_info =
+        wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?;
    if let Some(keys) = user_info.get_keys() {
        node_info.set_keys(keys);
    }
+    node_info.allow_self_signed_compute = allow_self_signed_compute;
+    // let mut node_info = credentials.get_node_info(ctx, user_info).await?;
    mechanism.update_connect_config(&mut node_info.config);
+    let retry_type = RetryType::ConnectToCompute;

    // try once
    let err = match mechanism
@@ -111,6 +119,13 @@ where
    {
        Ok(res) => {
            ctx.latency_timer.success();
+            Metrics::get().proxy.retries_metric.observe(
+                RetriesMetricGroup {
+                    outcome: ConnectOutcome::Success,
+                    retry_type,
+                },
+                num_retries.into(),
+            );
            return Ok(res);
        }
        Err(e) => e,
@@ -121,7 +136,7 @@ where
    let node_info = if !node_info.cached() {
        // If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry.
        // Do not need to retrieve a new node_info, just return the old one.
-        if !err.should_retry(num_retries) {
+        if !err.should_retry(num_retries, connect_to_compute_retry_config) {
            return Err(err.into());
        }
        node_info
@@ -129,7 +144,8 @@ where
        // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
        info!("compute node's state has likely changed; requesting a wake-up");
        let old_node_info = invalidate_cache(node_info);
-        let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
+        let mut node_info =
+            wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?;
        node_info.reuse_settings(old_node_info);

        mechanism.update_connect_config(&mut node_info.config);
@@ -148,19 +164,34 @@ where
        {
            Ok(res) => {
                ctx.latency_timer.success();
+                Metrics::get().proxy.retries_metric.observe(
+                    RetriesMetricGroup {
+                        outcome: ConnectOutcome::Success,
+                        retry_type,
+                    },
+                    num_retries.into(),
+                );
+                info!(?num_retries, "connected to compute node after");
                return Ok(res);
            }
            Err(e) => {
-                let retriable = e.should_retry(num_retries);
+                let retriable = e.should_retry(num_retries, connect_to_compute_retry_config);
                if !retriable {
                    error!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
+                    Metrics::get().proxy.retries_metric.observe(
+                        RetriesMetricGroup {
+                            outcome: ConnectOutcome::Failed,
+                            retry_type,
+                        },
+                        num_retries.into(),
+                    );
                    return Err(e.into());
                }
                warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
            }
        }

-        let wait_duration = retry_after(num_retries);
+        let wait_duration = retry_after(num_retries, connect_to_compute_retry_config);
        num_retries += 1;

        time::sleep(wait_duration).await;
--- a/proxy/src/proxy/retry.rs
+++ b/proxy/src/proxy/retry.rs
@@ -1,18 +1,12 @@
-use crate::compute;
+use crate::{compute, config::RetryConfig};
 use std::{error::Error, io};
 use tokio::time;

-/// Number of times we should retry the `/proxy_wake_compute` http request.
-/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0
-pub const NUM_RETRIES_CONNECT: u32 = 16;
-const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25);
-const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;
-
 pub trait ShouldRetry {
    fn could_retry(&self) -> bool;
-    fn should_retry(&self, num_retries: u32) -> bool {
+    fn should_retry(&self, num_retries: u32, config: RetryConfig) -> bool {
        match self {
-            _ if num_retries >= NUM_RETRIES_CONNECT => false,
+            _ if num_retries >= config.max_retries => false,
            err => err.could_retry(),
        }
    }
@@ -63,6 +57,8 @@ impl ShouldRetry for compute::ConnectionError {
    }
 }

-pub fn retry_after(num_retries: u32) -> time::Duration {
-    BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1))
+pub fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration {
+    config
+        .base_delay
+        .mul_f64(config.backoff_factor.powi((num_retries as i32) - 1))
 }
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -10,13 +10,13 @@ use super::*;
 use crate::auth::backend::{
    ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend,
 };
-use crate::config::CertResolver;
+use crate::config::{CertResolver, RetryConfig};
 use crate::console::caches::NodeInfoCache;
 use crate::console::messages::MetricsAuxInfo;
 use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
 use crate::console::{self, CachedNodeInfo, NodeInfo};
 use crate::error::ErrorKind;
-use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
+use crate::proxy::retry::retry_after;
 use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId};
 use anyhow::{bail, Context};
 use async_trait::async_trait;
@@ -361,11 +361,15 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
 #[test]
 fn connect_compute_total_wait() {
    let mut total_wait = tokio::time::Duration::ZERO;
-    for num_retries in 1..NUM_RETRIES_CONNECT {
-        total_wait += retry_after(num_retries);
+    let config = RetryConfig {
+        base_delay: Duration::from_secs(1),
+        max_retries: 5,
+        backoff_factor: 2.0,
+    };
+    for num_retries in 1..config.max_retries {
+        total_wait += retry_after(num_retries, config);
    }
-    assert!(total_wait < tokio::time::Duration::from_secs(12));
-    assert!(total_wait > tokio::time::Duration::from_secs(10));
+    assert!(f64::abs(total_wait.as_secs_f64() - 15.0) < 0.1);
 }

 #[derive(Clone, Copy, Debug)]
@@ -519,6 +523,7 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn
            branch_id: (&BranchId::from("branch")).into(),
            cold_start_info: crate::console::messages::ColdStartInfo::Warm,
        },
+        allow_self_signed_compute: false,
    };
    let (_, node) = cache.insert("key".into(), node);
    node
@@ -548,7 +553,12 @@ async fn connect_to_compute_success() {
    let mut ctx = RequestMonitoring::test();
    let mechanism = TestConnectMechanism::new(vec![Wake, Connect]);
    let user_info = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, &user_info)
+    let config = RetryConfig {
+        base_delay: Duration::from_secs(1),
+        max_retries: 5,
+        backoff_factor: 2.0,
+    };
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
        .await
        .unwrap();
    mechanism.verify();
@@ -561,7 +571,12 @@ async fn connect_to_compute_retry() {
    let mut ctx = RequestMonitoring::test();
    let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]);
    let user_info = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, &user_info)
+    let config = RetryConfig {
+        base_delay: Duration::from_secs(1),
+        max_retries: 5,
+        backoff_factor: 2.0,
+    };
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
        .await
        .unwrap();
    mechanism.verify();
@@ -575,7 +590,12 @@ async fn connect_to_compute_non_retry_1() {
    let mut ctx = RequestMonitoring::test();
    let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]);
    let user_info = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, &user_info)
+    let config = RetryConfig {
+        base_delay: Duration::from_secs(1),
+        max_retries: 5,
+        backoff_factor: 2.0,
+    };
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
        .await
        .unwrap_err();
    mechanism.verify();
@@ -589,7 +609,12 @@ async fn connect_to_compute_non_retry_2() {
    let mut ctx = RequestMonitoring::test();
    let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]);
    let user_info = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, &user_info)
+    let config = RetryConfig {
+        base_delay: Duration::from_secs(1),
+        max_retries: 5,
+        backoff_factor: 2.0,
+    };
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
        .await
        .unwrap();
    mechanism.verify();
@@ -599,17 +624,32 @@ async fn connect_to_compute_non_retry_2() {
 #[tokio::test]
 async fn connect_to_compute_non_retry_3() {
    let _ = env_logger::try_init();
-    assert_eq!(NUM_RETRIES_CONNECT, 16);
+    tokio::time::pause();
    use ConnectAction::*;
    let mut ctx = RequestMonitoring::test();
-    let mechanism = TestConnectMechanism::new(vec![
-        Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
-        Retry, Retry, Retry, Retry, Retry, /* the 17th time */ Retry,
-    ]);
+    let mechanism =
+        TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]);
    let user_info = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, &user_info)
-        .await
-        .unwrap_err();
+    let wake_compute_retry_config = RetryConfig {
+        base_delay: Duration::from_secs(1),
+        max_retries: 1,
+        backoff_factor: 2.0,
+    };
+    let connect_to_compute_retry_config = RetryConfig {
+        base_delay: Duration::from_secs(1),
+        max_retries: 5,
+        backoff_factor: 2.0,
+    };
+    connect_to_compute(
+        &mut ctx,
+        &mechanism,
+        &user_info,
+        false,
+        wake_compute_retry_config,
+        connect_to_compute_retry_config,
+    )
+    .await
+    .unwrap_err();
    mechanism.verify();
 }

@@ -621,7 +661,12 @@ async fn wake_retry() {
    let mut ctx = RequestMonitoring::test();
    let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]);
    let user_info = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, &user_info)
+    let config = RetryConfig {
+        base_delay: Duration::from_secs(1),
+        max_retries: 5,
+        backoff_factor: 2.0,
+    };
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
        .await
        .unwrap();
    mechanism.verify();
@@ -635,7 +680,12 @@ async fn wake_non_retry() {
    let mut ctx = RequestMonitoring::test();
    let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]);
    let user_info = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, &user_info)
+    let config = RetryConfig {
+        base_delay: Duration::from_secs(1),
+        max_retries: 5,
+        backoff_factor: 2.0,
+    };
+    connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
        .await
        .unwrap_err();
    mechanism.verify();
--- a/proxy/src/proxy/wake_compute.rs
+++ b/proxy/src/proxy/wake_compute.rs
@@ -1,10 +1,14 @@
+use crate::config::RetryConfig;
 use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo};
 use crate::context::RequestMonitoring;
-use crate::metrics::{ConnectionFailuresBreakdownGroup, Metrics, WakeupFailureKind};
+use crate::metrics::{
+    ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType,
+    WakeupFailureKind,
+};
 use crate::proxy::retry::retry_after;
 use hyper::StatusCode;
 use std::ops::ControlFlow;
-use tracing::{error, warn};
+use tracing::{error, info, warn};

 use super::connect_compute::ComputeConnectBackend;
 use super::retry::ShouldRetry;
@@ -13,23 +17,42 @@ pub async fn wake_compute<B: ComputeConnectBackend>(
    num_retries: &mut u32,
    ctx: &mut RequestMonitoring,
    api: &B,
+    config: RetryConfig,
 ) -> Result<CachedNodeInfo, WakeComputeError> {
+    let retry_type = RetryType::WakeCompute;
    loop {
        let wake_res = api.wake_compute(ctx).await;
-        match handle_try_wake(wake_res, *num_retries) {
+        match handle_try_wake(wake_res, *num_retries, config) {
            Err(e) => {
                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
                report_error(&e, false);
+                Metrics::get().proxy.retries_metric.observe(
+                    RetriesMetricGroup {
+                        outcome: ConnectOutcome::Failed,
+                        retry_type,
+                    },
+                    (*num_retries).into(),
+                );
                return Err(e);
            }
            Ok(ControlFlow::Continue(e)) => {
                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
                report_error(&e, true);
            }
-            Ok(ControlFlow::Break(n)) => return Ok(n),
+            Ok(ControlFlow::Break(n)) => {
+                Metrics::get().proxy.retries_metric.observe(
+                    RetriesMetricGroup {
+                        outcome: ConnectOutcome::Success,
+                        retry_type,
+                    },
+                    (*num_retries).into(),
+                );
+                info!(?num_retries, "compute node woken up after");
+                return Ok(n);
+            }
        }

-        let wait_duration = retry_after(*num_retries);
+        let wait_duration = retry_after(*num_retries, config);
        *num_retries += 1;
        tokio::time::sleep(wait_duration).await;
    }
@@ -42,10 +65,11 @@ pub async fn wake_compute<B: ComputeConnectBackend>(
 pub fn handle_try_wake(
    result: Result<CachedNodeInfo, WakeComputeError>,
    num_retries: u32,
+    config: RetryConfig,
 ) -> Result<ControlFlow<CachedNodeInfo, WakeComputeError>, WakeComputeError> {
    match result {
        Err(err) => match &err {
-            WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
+            WakeComputeError::ApiError(api) if api.should_retry(num_retries, config) => {
                Ok(ControlFlow::Continue(err))
            }
            _ => Err(err),
--- a/proxy/src/redis/connection_with_credentials_provider.rs
+++ b/proxy/src/redis/connection_with_credentials_provider.rs
@@ -108,12 +108,10 @@ impl ConnectionWithCredentialsProvider {
        if let Credentials::Dynamic(credentials_provider, _) = &self.credentials {
            let credentials_provider = credentials_provider.clone();
            let con2 = con.clone();
-            let f = tokio::task::Builder::new()
-                .name("redis keep connection")
-                .spawn(async move {
-                    let _ = Self::keep_connection(con2, credentials_provider).await;
-                });
-            self.refresh_token_task = Some(f.unwrap());
+            let f = tokio::spawn(async move {
+                let _ = Self::keep_connection(con2, credentials_provider).await;
+            });
+            self.refresh_token_task = Some(f);
        }
        match Self::ping(&mut con).await {
            Ok(()) => {
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -142,13 +142,10 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
                // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds.
                // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message.
                let cache = self.cache.clone();
-                tokio::task::Builder::new()
-                    .name("invalidate cache lazy")
-                    .spawn(async move {
-                        tokio::time::sleep(INVALIDATION_LAG).await;
-                        invalidate_cache(cache, msg);
-                    })
-                    .unwrap();
+                tokio::spawn(async move {
+                    tokio::time::sleep(INVALIDATION_LAG).await;
+                    invalidate_cache(cache, msg);
+                });
            }
        }

--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -61,28 +61,22 @@ pub async fn task_main(
    let conn_pool = conn_pool::GlobalConnPool::new(&config.http_config);
    {
        let conn_pool = Arc::clone(&conn_pool);
-        tokio::task::Builder::new()
-            .name("serverless pool gc")
-            .spawn(async move {
-                conn_pool.gc_worker(StdRng::from_entropy()).await;
-            })
-            .unwrap();
+        tokio::spawn(async move {
+            conn_pool.gc_worker(StdRng::from_entropy()).await;
+        });
    }

    // shutdown the connection pool
-    tokio::task::Builder::new()
-        .name("serverless pool shutdown")
-        .spawn({
-            let cancellation_token = cancellation_token.clone();
-            let conn_pool = conn_pool.clone();
-            async move {
-                cancellation_token.cancelled().await;
-                tokio::task::spawn_blocking(move || conn_pool.shutdown())
-                    .await
-                    .unwrap();
-            }
-        })
-        .unwrap();
+    tokio::spawn({
+        let cancellation_token = cancellation_token.clone();
+        let conn_pool = conn_pool.clone();
+        async move {
+            cancellation_token.cancelled().await;
+            tokio::task::spawn_blocking(move || conn_pool.shutdown())
+                .await
+                .unwrap();
+        }
+    });

    let backend = Arc::new(PoolingBackend {
        pool: Arc::clone(&conn_pool),
@@ -115,25 +109,20 @@ pub async fn task_main(
        let conn_id = uuid::Uuid::new_v4();
        let http_conn_span = tracing::info_span!("http_conn", ?conn_id);

-        tokio::task::Builder::new()
-            .name("serverless conn handler")
-            .spawn(
-                connections.track_future(
-                    connection_handler(
-                        config,
-                        backend.clone(),
-                        connections.clone(),
-                        cancellation_handler.clone(),
-                        cancellation_token.clone(),
-                        server.clone(),
-                        tls_acceptor.clone(),
-                        conn,
-                        peer_addr,
-                    )
-                    .instrument(http_conn_span),
-                ),
+        connections.spawn(
+            connection_handler(
+                config,
+                backend.clone(),
+                connections.clone(),
+                cancellation_handler.clone(),
+                cancellation_token.clone(),
+                server.clone(),
+                tls_acceptor.clone(),
+                conn,
+                peer_addr,
            )
-            .unwrap();
+            .instrument(http_conn_span),
+        );
    }

    connections.wait().await;
@@ -229,25 +218,20 @@ async fn connection_handler(

            // `request_handler` is not cancel safe. It expects to be cancelled only at specific times.
            // By spawning the future, we ensure it never gets cancelled until it decides to.
-            let handler = tokio::task::Builder::new()
-                .name("serverless request handler")
-                .spawn(
-                    connections.track_future(
-                        request_handler(
-                            req,
-                            config,
-                            backend.clone(),
-                            connections.clone(),
-                            cancellation_handler.clone(),
-                            session_id,
-                            peer_addr,
-                            http_request_token,
-                        )
-                        .in_current_span()
-                        .map_ok_or_else(api_error_into_response, |r| r),
-                    ),
+            let handler = connections.spawn(
+                request_handler(
+                    req,
+                    config,
+                    backend.clone(),
+                    connections.clone(),
+                    cancellation_handler.clone(),
+                    session_id,
+                    peer_addr,
+                    http_request_token,
                )
-                .unwrap();
+                .in_current_span()
+                .map_ok_or_else(api_error_into_response, |r| r),
+            );

            async move {
                let res = handler.await;
@@ -306,27 +290,17 @@ async fn request_handler(
        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
            .map_err(|e| ApiError::BadRequest(e.into()))?;

-        tokio::task::Builder::new()
-            .name("websocket client conn")
-            .spawn(
-                ws_connections.track_future(
-                    async move {
-                        if let Err(e) = websocket::serve_websocket(
-                            config,
-                            ctx,
-                            websocket,
-                            cancellation_handler,
-                            host,
-                        )
+        ws_connections.spawn(
+            async move {
+                if let Err(e) =
+                    websocket::serve_websocket(config, ctx, websocket, cancellation_handler, host)
                        .await
-                        {
-                            error!("error in websocket connection: {e:#}");
-                        }
-                    }
-                    .instrument(span),
-                ),
-            )
-            .unwrap();
+                {
+                    error!("error in websocket connection: {e:#}");
+                }
+            }
+            .instrument(span),
+        );

        // Return the response so the spawned future can continue.
        Ok(response)
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -107,6 +107,9 @@ impl PoolingBackend {
                pool: self.pool.clone(),
            },
            &backend,
+            false, // do not allow self signed compute for http flow
+            self.config.wake_compute_retry_config,
+            self.config.connect_to_compute_retry_config,
        )
        .await
    }
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -492,7 +492,7 @@ pub fn poll_client<C: ClientInnerExt>(
    let cancel = CancellationToken::new();
    let cancelled = cancel.clone().cancelled_owned();

-    tokio::task::Builder::new().name("pooled conn").spawn(
+    tokio::spawn(
    async move {
        let _conn_gauge = conn_gauge;
        let mut idle_timeout = pin!(tokio::time::sleep(idle));
@@ -565,7 +565,7 @@ pub fn poll_client<C: ClientInnerExt>(
        }).await;

    }
-    .instrument(span)).unwrap();
+    .instrument(span));
    let inner = ClientInner {
        inner: client,
        session: tx,
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -18,7 +18,7 @@ use std::time::Duration;
 use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr;
 use postgres_ffi::XLogFileName;
 use postgres_ffi::{XLogSegNo, PG_TLI};
-use remote_storage::{GenericRemoteStorage, RemotePath, StorageMetadata};
+use remote_storage::{GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata};
 use tokio::fs::File;

 use tokio::select;
@@ -601,12 +601,18 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
    backoff::retry(
        || async {
            // Do list-delete in batch_size batches to make progress even if there a lot of files.
-            // Alternatively we could make list_files return iterator, but it is more complicated and
+            // Alternatively we could make remote storage list return iterator, but it is more complicated and
            // I'm not sure deleting while iterating is expected in s3.
            loop {
                let files = storage
-                    .list_files(Some(&remote_path), Some(batch_size), &cancel)
-                    .await?;
+                    .list(
+                        Some(&remote_path),
+                        ListingMode::NoDelimiter,
+                        Some(batch_size),
+                        &cancel,
+                    )
+                    .await?
+                    .keys;
                if files.is_empty() {
                    return Ok(()); // done
                }
@@ -666,8 +672,9 @@ pub async fn copy_s3_segments(
    let cancel = CancellationToken::new();

    let files = storage
-        .list_files(Some(&remote_path), None, &cancel)
-        .await?;
+        .list(Some(&remote_path), ListingMode::NoDelimiter, None, &cancel)
+        .await?
+        .keys;

    let uploaded_segments = &files
        .iter()
--- a/scripts/export_import_between_pageservers.py
+++ b/scripts/export_import_between_pageservers.py
@@ -1,730 +0,0 @@
-#
-# Script to export tenants from one pageserver and import them into another page server.
-#
-# Outline of steps:
-# 1. Get `(last_lsn, prev_lsn)` from old pageserver
-# 2. Get `fullbackup` from old pageserver, which creates a basebackup tar file
-# 3. This tar file might be missing relation files for empty relations, if the pageserver
-#    is old enough (we didn't always store those). So to recreate them, we start a local
-#    vanilla postgres on this basebackup and ask it what relations should exist, then touch
-#    any missing files and re-pack the tar.
-#    TODO This functionality is no longer needed, so we can delete it later if we don't
-#         end up using the same utils for the pg 15 upgrade. Not sure.
-# 4. We import the patched basebackup into a new pageserver
-# 5. We export again via fullbackup, now from the new pageserver and compare the returned
-#    tar file with the one we imported. This confirms that we imported everything that was
-#    exported, but doesn't guarantee correctness (what if we didn't **export** everything
-#    initially?)
-# 6. We wait for the new pageserver's remote_consistent_lsn to catch up
-#
-# For more context on how to use this, see:
-# https://www.notion.so/neondatabase/Storage-format-migration-9a8eba33ccf8417ea8cf50e6a0c542cf
-
-import argparse
-import os
-import shutil
-import subprocess
-import tempfile
-import time
-import uuid
-from contextlib import closing
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, cast
-
-import psycopg2
-import requests
-from psycopg2.extensions import connection as PgConnection
-from psycopg2.extensions import parse_dsn
-
-###############################################
-### client-side utils copied from test fixtures
-###############################################
-
-Env = Dict[str, str]
-
-_global_counter = 0
-
-
-def global_counter() -> int:
-    """A really dumb global counter.
-    This is useful for giving output files a unique number, so if we run the
-    same command multiple times we can keep their output separate.
-    """
-    global _global_counter
-    _global_counter += 1
-    return _global_counter
-
-
-def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
-    """Run a process and capture its output
-    Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr"
-    where "cmd" is the name of the program and NNN is an incrementing
-    counter.
-    If those files already exist, we will overwrite them.
-    Returns basepath for files with captured output.
-    """
-    assert isinstance(cmd, list)
-    base = f"{os.path.basename(cmd[0])}_{global_counter()}"
-    basepath = os.path.join(capture_dir, base)
-    stdout_filename = basepath + ".stdout"
-    stderr_filename = basepath + ".stderr"
-
-    with open(stdout_filename, "w") as stdout_f:
-        with open(stderr_filename, "w") as stderr_f:
-            print(f'(capturing output to "{base}.stdout")')
-            subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
-
-    return basepath
-
-
-class PgBin:
-    """A helper class for executing postgres binaries"""
-
-    def __init__(self, log_dir: Path, pg_distrib_dir, pg_version):
-        self.log_dir = log_dir
-        self.pg_bin_path = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "bin")
-        self.env = os.environ.copy()
-        self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "lib")
-
-    def _fixpath(self, command: List[str]):
-        if "/" not in command[0]:
-            command[0] = os.path.join(self.pg_bin_path, command[0])
-
-    def _build_env(self, env_add: Optional[Env]) -> Env:
-        if env_add is None:
-            return self.env
-        env = self.env.copy()
-        env.update(env_add)
-        return env
-
-    def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None):
-        """
-        Run one of the postgres binaries.
-        The command should be in list form, e.g. ['pgbench', '-p', '55432']
-        All the necessary environment variables will be set.
-        If the first argument (the command name) doesn't include a path (no '/'
-        characters present), then it will be edited to include the correct path.
-        If you want stdout/stderr captured to files, use `run_capture` instead.
-        """
-
-        self._fixpath(command)
-        print(f'Running command "{" ".join(command)}"')
-        env = self._build_env(env)
-        subprocess.run(command, env=env, cwd=cwd, check=True)
-
-    def run_capture(
-        self,
-        command: List[str],
-        env: Optional[Env] = None,
-        cwd: Optional[str] = None,
-        **kwargs: Any,
-    ) -> str:
-        """
-        Run one of the postgres binaries, with stderr and stdout redirected to a file.
-        This is just like `run`, but for chatty programs. Returns basepath for files
-        with captured output.
-        """
-
-        self._fixpath(command)
-        print(f'Running command "{" ".join(command)}"')
-        env = self._build_env(env)
-        return subprocess_capture(
-            str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs
-        )
-
-
-class PgProtocol:
-    """Reusable connection logic"""
-
-    def __init__(self, **kwargs):
-        self.default_options = kwargs
-
-    def conn_options(self, **kwargs):
-        conn_options = self.default_options.copy()
-        if "dsn" in kwargs:
-            conn_options.update(parse_dsn(kwargs["dsn"]))
-        conn_options.update(kwargs)
-
-        # Individual statement timeout in seconds. 2 minutes should be
-        # enough for our tests, but if you need a longer, you can
-        # change it by calling "SET statement_timeout" after
-        # connecting.
-        conn_options["options"] = f"-cstatement_timeout=120s {conn_options.get('options', '')}"
-
-        return conn_options
-
-    # autocommit=True here by default because that's what we need most of the time
-    def connect(self, autocommit=True, **kwargs) -> PgConnection:
-        """
-        Connect to the node.
-        Returns psycopg2's connection object.
-        This method passes all extra params to connstr.
-        """
-        conn: PgConnection = psycopg2.connect(**self.conn_options(**kwargs))
-
-        # WARNING: this setting affects *all* tests!
-        conn.autocommit = autocommit
-        return conn
-
-    def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]:
-        """
-        Execute query against the node and return all rows.
-        This method passes all extra params to connstr.
-        """
-        return self.safe_psql_many([query], **kwargs)[0]
-
-    def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]:
-        """
-        Execute queries against the node and return all rows.
-        This method passes all extra params to connstr.
-        """
-        result: List[List[Any]] = []
-        with closing(self.connect(**kwargs)) as conn:
-            with conn.cursor() as cur:
-                for query in queries:
-                    print(f"Executing query: {query}")
-                    cur.execute(query)
-
-                    if cur.description is None:
-                        result.append([])  # query didn't return data
-                    else:
-                        result.append(cast(List[Any], cur.fetchall()))
-        return result
-
-
-class VanillaPostgres(PgProtocol):
-    def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True):
-        super().__init__(host="localhost", port=port, dbname="postgres")
-        self.pgdatadir = pgdatadir
-        self.pg_bin = pg_bin
-        self.running = False
-        if init:
-            self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)])
-        self.configure([f"port = {port}\n"])
-
-    def configure(self, options: List[str]):
-        """Append lines into postgresql.conf file."""
-        assert not self.running
-        with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file:
-            conf_file.write("\n".join(options))
-
-    def start(self, log_path: Optional[str] = None):
-        assert not self.running
-        self.running = True
-
-        log_path = log_path or os.path.join(self.pgdatadir, "pg.log")
-
-        self.pg_bin.run_capture(
-            ["pg_ctl", "-w", "-D", str(self.pgdatadir), "-l", log_path, "start"]
-        )
-
-    def stop(self):
-        assert self.running
-        self.running = False
-        self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"])
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc, tb):
-        if self.running:
-            self.stop()
-
-
-class NeonPageserverApiException(Exception):
-    pass
-
-
-class NeonPageserverHttpClient(requests.Session):
-    def __init__(self, host, port):
-        super().__init__()
-        self.host = host
-        self.port = port
-
-    def verbose_error(self, res: requests.Response):
-        try:
-            res.raise_for_status()
-        except requests.RequestException as e:
-            try:
-                msg = res.json()["msg"]
-            except:  # noqa: E722
-                msg = ""
-            raise NeonPageserverApiException(msg) from e
-
-    def check_status(self):
-        self.get(f"http://{self.host}:{self.port}/v1/status").raise_for_status()
-
-    def tenant_list(self):
-        res = self.get(f"http://{self.host}:{self.port}/v1/tenant")
-        self.verbose_error(res)
-        res_json = res.json()
-        assert isinstance(res_json, list)
-        return res_json
-
-    def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists):
-        res = self.post(
-            f"http://{self.host}:{self.port}/v1/tenant",
-            json={"new_tenant_id": new_tenant_id.hex, "generation": 1},
-        )
-
-        if res.status_code == 409:
-            if ok_if_exists:
-                print(f"could not create tenant: already exists for id {new_tenant_id}")
-            else:
-                res.raise_for_status()
-        elif res.status_code == 201:
-            print(f"created tenant {new_tenant_id}")
-        else:
-            self.verbose_error(res)
-
-        return new_tenant_id
-
-    def timeline_list(self, tenant_id: uuid.UUID):
-        res = self.get(f"http://{self.host}:{self.port}/v1/tenant/{tenant_id.hex}/timeline")
-        self.verbose_error(res)
-        res_json = res.json()
-        assert isinstance(res_json, list)
-        return res_json
-
-    def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]:
-        res = self.get(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=true"
-        )
-        self.verbose_error(res)
-        res_json = res.json()
-        assert isinstance(res_json, dict)
-        return res_json
-
-
-def lsn_to_hex(num: int) -> str:
-    """Convert lsn from int to standard hex notation."""
-    return f"{num >> 32:X}/{num & 0xFFFFFFFF:X}"
-
-
-def lsn_from_hex(lsn_hex: str) -> int:
-    """Convert lsn from hex notation to int."""
-    left, right = lsn_hex.split("/")
-    return (int(left, 16) << 32) + int(right, 16)
-
-
-def remote_consistent_lsn(
-    pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID
-) -> int:
-    detail = pageserver_http_client.timeline_detail(tenant, timeline)
-
-    lsn_str = detail["remote_consistent_lsn"]
-    assert isinstance(lsn_str, str)
-    return lsn_from_hex(lsn_str)
-
-
-def wait_for_upload(
-    pageserver_http_client: NeonPageserverHttpClient,
-    tenant: uuid.UUID,
-    timeline: uuid.UUID,
-    lsn: int,
-):
-    """waits for local timeline upload up to specified lsn"""
-    for i in range(10):
-        current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline)
-        if current_lsn >= lsn:
-            return
-        print(
-            f"waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, now {lsn_to_hex(current_lsn)}, iteration {i + 1}"
-        )
-        time.sleep(1)
-
-    raise Exception(
-        f"timed out while waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, was {lsn_to_hex(current_lsn)}"
-    )
-
-
-##############
-# End of utils
-##############
-
-
-def pack_base(log_dir, restored_dir, output_tar):
-    """Create tar file from basebackup, being careful to produce relative filenames."""
-    tmp_tar_name = "tmp.tar"
-    tmp_tar_path = os.path.join(restored_dir, tmp_tar_name)
-    cmd = ["tar", "-cf", tmp_tar_name] + os.listdir(restored_dir)
-    # We actually cd into the dir and call tar from there. If we call tar from
-    # outside we won't encode filenames as relative, and they won't parse well
-    # on import.
-    subprocess_capture(log_dir, cmd, cwd=restored_dir)
-    shutil.move(tmp_tar_path, output_tar)
-
-
-def reconstruct_paths(log_dir, pg_bin, base_tar, port: int):
-    """Reconstruct what relation files should exist in the datadir by querying postgres."""
-    with tempfile.TemporaryDirectory() as restored_dir:
-        # Unpack the base tar
-        subprocess_capture(log_dir, ["tar", "-xf", base_tar, "-C", restored_dir])
-
-        # Start a vanilla postgres from the given datadir and query it to find
-        # what relfiles should exist, but possibly don't.
-        with VanillaPostgres(Path(restored_dir), pg_bin, port, init=False) as vanilla_pg:
-            vanilla_pg.configure([f"port={port}"])
-            vanilla_pg.start(log_path=os.path.join(log_dir, "tmp_pg.log"))
-
-            # Create database based on template0 because we can't connect to template0
-            query = "create database template0copy template template0"
-            vanilla_pg.safe_psql(query, user="cloud_admin")
-            vanilla_pg.safe_psql("CHECKPOINT", user="cloud_admin")
-
-            # Get all databases
-            query = "select oid, datname from pg_database"
-            oid_dbname_pairs = vanilla_pg.safe_psql(query, user="cloud_admin")
-            template0_oid = [
-                oid for (oid, database) in oid_dbname_pairs if database == "template0"
-            ][0]
-
-            # Get rel paths for each database
-            for oid, database in oid_dbname_pairs:
-                if database == "template0":
-                    # We can't connect to template0
-                    continue
-
-                query = "select relname, pg_relation_filepath(oid) from pg_class"
-                result = vanilla_pg.safe_psql(query, user="cloud_admin", dbname=database)
-                for _relname, filepath in result:
-                    if filepath is not None:
-                        if database == "template0copy":
-                            # Add all template0copy paths to template0
-                            prefix = f"base/{oid}/"
-                            if filepath.startswith(prefix):
-                                suffix = filepath[len(prefix) :]
-                                yield f"base/{template0_oid}/{suffix}"
-                            elif filepath.startswith("global"):
-                                print(f"skipping {database} global file {filepath}")
-                            else:
-                                raise AssertionError
-                        else:
-                            yield filepath
-
-
-def touch_missing_rels(log_dir, corrupt_tar, output_tar, paths):
-    """Add the appropriate empty files to a basebadkup tar."""
-    with tempfile.TemporaryDirectory() as restored_dir:
-        # Unpack the base tar
-        subprocess_capture(log_dir, ["tar", "-xf", corrupt_tar, "-C", restored_dir])
-
-        # Touch files that don't exist
-        for path in paths:
-            absolute_path = os.path.join(restored_dir, path)
-            exists = os.path.exists(absolute_path)
-            if not exists:
-                print(f"File {absolute_path} didn't exist. Creating..")
-                Path(absolute_path).touch()
-
-        # Repackage
-        pack_base(log_dir, restored_dir, output_tar)
-
-
-# HACK This is a workaround for exporting from old pageservers that
-#      can't export empty relations. In this case we need to start
-#      a vanilla postgres from the exported datadir, and query it
-#      to see what empty relations are missing, and then create
-#      those empty files before importing.
-def add_missing_rels(base_tar, output_tar, log_dir, pg_bin, tmp_pg_port: int):
-    reconstructed_paths = set(reconstruct_paths(log_dir, pg_bin, base_tar, tmp_pg_port))
-    touch_missing_rels(log_dir, base_tar, output_tar, reconstructed_paths)
-
-
-def get_rlsn(pageserver_connstr, tenant_id, timeline_id):
-    with closing(psycopg2.connect(pageserver_connstr)) as conn:
-        conn.autocommit = True
-        with conn.cursor() as cur:
-            cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}"
-            cur.execute(cmd)
-            res = cur.fetchone()
-            assert res is not None
-            prev_lsn = res[0]
-            last_lsn = res[1]
-
-    return last_lsn, prev_lsn
-
-
-def import_timeline(
-    args,
-    psql_path,
-    pageserver_connstr,
-    pageserver_http,
-    tenant_id,
-    timeline_id,
-    last_lsn,
-    prev_lsn,
-    tar_filename,
-    pg_version,
-):
-    # Import timelines to new pageserver
-    import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn} {pg_version}"
-    full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """
-
-    stderr_filename2 = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr")
-    stdout_filename = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stdout")
-
-    print(f"Running: {full_cmd}")
-
-    with open(stdout_filename, "w") as stdout_f:
-        with open(stderr_filename2, "w") as stderr_f:
-            print(f"(capturing output to {stdout_filename})")
-            pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version)
-            subprocess.run(
-                full_cmd,
-                stdout=stdout_f,
-                stderr=stderr_f,
-                env=pg_bin._build_env(None),
-                shell=True,
-                check=True,
-            )
-
-            print("Done import")
-
-    # Wait until pageserver persists the files
-    wait_for_upload(
-        pageserver_http, uuid.UUID(tenant_id), uuid.UUID(timeline_id), lsn_from_hex(last_lsn)
-    )
-
-
-def export_timeline(
-    args,
-    psql_path,
-    pageserver_connstr,
-    tenant_id,
-    timeline_id,
-    last_lsn,
-    prev_lsn,
-    tar_filename,
-    pg_version,
-):
-    # Choose filenames
-    incomplete_filename = tar_filename + ".incomplete"
-    stderr_filename = os.path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr")
-
-    # Construct export command
-    query = f"fullbackup {tenant_id} {timeline_id} {last_lsn} {prev_lsn}"
-    cmd = [psql_path, "--no-psqlrc", pageserver_connstr, "-c", query]
-
-    # Run export command
-    print(f"Running: {cmd}")
-    with open(incomplete_filename, "w") as stdout_f:
-        with open(stderr_filename, "w") as stderr_f:
-            print(f"(capturing output to {incomplete_filename})")
-            pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version)
-            subprocess.run(
-                cmd, stdout=stdout_f, stderr=stderr_f, env=pg_bin._build_env(None), check=True
-            )
-
-    # Add missing rels
-    pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version)
-    add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin, args.tmp_pg_port)
-
-    # Log more info
-    file_size = os.path.getsize(tar_filename)
-    print(f"Done export: {tar_filename}, size {file_size}")
-
-
-def main(args: argparse.Namespace):
-    # any psql version will do here. use current DEFAULT_PG_VERSION = 15
-    psql_path = str(Path(args.pg_distrib_dir) / "v15" / "bin" / "psql")
-
-    old_pageserver_host = args.old_pageserver_host
-    new_pageserver_host = args.new_pageserver_host
-
-    old_http_client = NeonPageserverHttpClient(old_pageserver_host, args.old_pageserver_http_port)
-    old_http_client.check_status()
-    old_pageserver_connstr = f"postgresql://{old_pageserver_host}:{args.old_pageserver_pg_port}"
-
-    new_http_client = NeonPageserverHttpClient(new_pageserver_host, args.new_pageserver_http_port)
-    new_http_client.check_status()
-    new_pageserver_connstr = f"postgresql://{new_pageserver_host}:{args.new_pageserver_pg_port}"
-
-    for tenant_id in args.tenants:
-        print(f"Tenant: {tenant_id}")
-        timelines = old_http_client.timeline_list(uuid.UUID(tenant_id))
-        print(f"Timelines: {timelines}")
-
-        # Create tenant in new pageserver
-        if args.only_import is False and not args.timelines:
-            new_http_client.tenant_create(uuid.UUID(tenant_id), args.ok_if_exists)
-
-        for timeline in timelines:
-            # Skip timelines we don't need to export
-            if args.timelines and timeline["timeline_id"] not in args.timelines:
-                print(f"Skipping timeline {timeline['timeline_id']}")
-                continue
-
-            # Choose filenames
-            tar_filename = os.path.join(
-                args.work_dir, f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar"
-            )
-
-            pg_version = timeline["pg_version"]
-
-            # Export timeline from old pageserver
-            if args.only_import is False:
-                last_lsn, prev_lsn = get_rlsn(
-                    old_pageserver_connstr,
-                    timeline["tenant_id"],
-                    timeline["timeline_id"],
-                )
-                export_timeline(
-                    args,
-                    psql_path,
-                    old_pageserver_connstr,
-                    timeline["tenant_id"],
-                    timeline["timeline_id"],
-                    last_lsn,
-                    prev_lsn,
-                    tar_filename,
-                    pg_version,
-                )
-
-            # Import into new pageserver
-            import_timeline(
-                args,
-                psql_path,
-                new_pageserver_connstr,
-                new_http_client,
-                timeline["tenant_id"],
-                timeline["timeline_id"],
-                last_lsn,
-                prev_lsn,
-                tar_filename,
-                pg_version,
-            )
-
-            # Re-export and compare
-            re_export_filename = tar_filename + ".reexport"
-            export_timeline(
-                args,
-                psql_path,
-                new_pageserver_connstr,
-                timeline["tenant_id"],
-                timeline["timeline_id"],
-                last_lsn,
-                prev_lsn,
-                re_export_filename,
-                pg_version,
-            )
-
-            # Check the size is the same
-            old_size = (os.path.getsize(tar_filename),)
-            new_size = (os.path.getsize(re_export_filename),)
-            if old_size != new_size:
-                raise AssertionError(f"Sizes don't match old: {old_size} new: {new_size}")
-
-
-def non_zero_tcp_port(arg: Any):
-    port = int(arg)
-    if port < 1 or port > 65535:
-        raise argparse.ArgumentTypeError(f"invalid tcp port: {arg}")
-    return port
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--tenant-id",
-        dest="tenants",
-        required=True,
-        nargs="+",
-        help="Id of the tenant to migrate. You can pass multiple arguments",
-    )
-    parser.add_argument(
-        "--timeline-id",
-        dest="timelines",
-        required=False,
-        nargs="+",
-        help="Id of the timeline to migrate. You can pass multiple arguments",
-    )
-    parser.add_argument(
-        "--from-host",
-        dest="old_pageserver_host",
-        required=True,
-        help="Host of the pageserver to migrate data from",
-    )
-    parser.add_argument(
-        "--from-http-port",
-        dest="old_pageserver_http_port",
-        required=False,
-        type=int,
-        default=9898,
-        help="HTTP port of the pageserver to migrate data from. Default: 9898",
-    )
-    parser.add_argument(
-        "--from-pg-port",
-        dest="old_pageserver_pg_port",
-        required=False,
-        type=int,
-        default=6400,
-        help="pg port of the pageserver to migrate data from. Default: 6400",
-    )
-    parser.add_argument(
-        "--to-host",
-        dest="new_pageserver_host",
-        required=True,
-        help="Host of the pageserver to migrate data to",
-    )
-    parser.add_argument(
-        "--to-http-port",
-        dest="new_pageserver_http_port",
-        required=False,
-        default=9898,
-        type=int,
-        help="HTTP port of the pageserver to migrate data to. Default: 9898",
-    )
-    parser.add_argument(
-        "--to-pg-port",
-        dest="new_pageserver_pg_port",
-        required=False,
-        default=6400,
-        type=int,
-        help="pg port of the pageserver to migrate data to. Default: 6400",
-    )
-    parser.add_argument(
-        "--ignore-tenant-exists",
-        dest="ok_if_exists",
-        required=False,
-        help="Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.",
-    )
-    parser.add_argument(
-        "--pg-distrib-dir",
-        dest="pg_distrib_dir",
-        required=False,
-        default="/usr/local/",
-        help="Path where postgres binaries are installed. Default: /usr/local/",
-    )
-    parser.add_argument(
-        "--psql-path",
-        dest="psql_path",
-        required=False,
-        default="/usr/local/v14/bin/psql",
-        help="Path to the psql binary. Default: /usr/local/v14/bin/psql",
-    )
-    parser.add_argument(
-        "--only-import",
-        dest="only_import",
-        required=False,
-        default=False,
-        action="store_true",
-        help="Skip export and tenant creation part",
-    )
-    parser.add_argument(
-        "--work-dir",
-        dest="work_dir",
-        required=True,
-        default=False,
-        help="directory where temporary tar files are stored",
-    )
-    parser.add_argument(
-        "--tmp-pg-port",
-        dest="tmp_pg_port",
-        required=False,
-        default=55439,
-        type=non_zero_tcp_port,
-        help="localhost port to use for temporary postgres instance",
-    )
-    args = parser.parse_args()
-    main(args)
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -5,6 +5,7 @@ use diesel::Connection;
 use metrics::launch_timestamp::LaunchTimestamp;
 use metrics::BuildInfo;
 use std::sync::Arc;
+use std::time::Duration;
 use storage_controller::http::make_router;
 use storage_controller::metrics::preinitialize_metrics;
 use storage_controller::persistence::Persistence;
@@ -245,6 +246,8 @@ async fn async_main() -> anyhow::Result<()> {
    };

    // After loading secrets & config, but before starting anything else, apply database migrations
+    Persistence::await_connection(&secrets.database_url, Duration::from_secs(5)).await?;
+
    migration_run(&secrets.database_url)
        .await
        .context("Running database migrations")?;
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -2,6 +2,7 @@ pub(crate) mod split_state;
 use std::collections::HashMap;
 use std::str::FromStr;
 use std::time::Duration;
+use std::time::Instant;

 use self::split_state::SplitState;
 use camino::Utf8Path;
@@ -144,6 +145,31 @@ impl Persistence {
        }
    }

+    /// A helper for use during startup, where we would like to tolerate concurrent restarts of the
+    /// database and the storage controller, therefore the database might not be available right away
+    pub async fn await_connection(
+        database_url: &str,
+        timeout: Duration,
+    ) -> Result<(), diesel::ConnectionError> {
+        let started_at = Instant::now();
+        loop {
+            match PgConnection::establish(database_url) {
+                Ok(_) => {
+                    tracing::info!("Connected to database.");
+                    return Ok(());
+                }
+                Err(e) => {
+                    if started_at.elapsed() > timeout {
+                        return Err(e);
+                    } else {
+                        tracing::info!("Database not yet available, waiting... ({e})");
+                        tokio::time::sleep(Duration::from_millis(100)).await;
+                    }
+                }
+            }
+        }
+    }
+
    /// Wraps `with_conn` in order to collect latency and error metrics
    async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
    where
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -129,7 +129,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
    "pageserver_getpage_reconstruct_seconds_sum",
    *[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
    *histogram("pageserver_smgr_query_seconds_global"),
-    *histogram("pageserver_read_num_fs_layers"),
+    *histogram("pageserver_layers_visited_per_read_global"),
    *histogram("pageserver_getpage_get_reconstruct_data_seconds"),
    *histogram("pageserver_wait_lsn_seconds"),
    *histogram("pageserver_remote_operation_seconds"),
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -0,0 +1,93 @@
+import os
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder
+from fixtures.workload import Workload
+
+AGGRESIVE_COMPACTION_TENANT_CONF = {
+    # Disable gc and compaction. The test runs compaction manually.
+    "gc_period": "0s",
+    "compaction_period": "0s",
+    # Small checkpoint distance to create many layers
+    "checkpoint_distance": 1024**2,
+    # Compact small layers
+    "compaction_target_size": 1024**2,
+    "image_creation_threshold": 2,
+    # INC-186: remove when merging the fix
+    "image_layer_creation_check_threshold": 0,
+}
+
+
+@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
+def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder):
+    """
+    This is a smoke test that compaction kicks in. The workload repeatedly churns
+    a small number of rows and manually instructs the pageserver to run compaction
+    between iterations. At the end of the test validate that the average number of
+    layers visited to gather reconstruct data for a given key is within the empirically
+    observed bounds.
+    """
+
+    # Effectively disable the page cache to rely only on image layers
+    # to shorten reads.
+    neon_env_builder.pageserver_config_override = """
+page_cache_size=10
+"""
+
+    env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF)
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    row_count = 10000
+    churn_rounds = 100
+
+    ps_http = env.pageserver.http_client()
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init(env.pageserver.id)
+
+    log.info("Writing initial data ...")
+    workload.write_rows(row_count, env.pageserver.id)
+
+    for i in range(1, churn_rounds + 1):
+        if i % 10 == 0:
+            log.info(f"Running churn round {i}/{churn_rounds} ...")
+
+        workload.churn_rows(row_count, env.pageserver.id)
+        ps_http.timeline_compact(tenant_id, timeline_id)
+
+    log.info("Validating at workload end ...")
+    workload.validate(env.pageserver.id)
+
+    log.info("Checking layer access metrics ...")
+
+    layer_access_metric_names = [
+        "pageserver_layers_visited_per_read_global_sum",
+        "pageserver_layers_visited_per_read_global_count",
+        "pageserver_layers_visited_per_read_global_bucket",
+        "pageserver_layers_visited_per_vectored_read_global_sum",
+        "pageserver_layers_visited_per_vectored_read_global_count",
+        "pageserver_layers_visited_per_vectored_read_global_bucket",
+    ]
+
+    metrics = env.pageserver.http_client().get_metrics()
+    for name in layer_access_metric_names:
+        layer_access_metrics = metrics.query_all(name)
+        log.info(f"Got metrics: {layer_access_metrics}")
+
+    non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum")
+    non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count")
+    non_vectored_average = non_vectored_sum.value / non_vectored_count.value
+
+    vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum")
+    vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count")
+    vectored_average = vectored_sum.value / vectored_count.value
+
+    log.info(f"{non_vectored_average=} {vectored_average=}")
+
+    # The upper bound for average number of layer visits below (8)
+    # was chosen empirically for this workload.
+    assert non_vectored_average < 8
+    assert vectored_average < 8
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1201,3 +1201,45 @@ def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
        max_lsn = max(Lsn(info["last_record_lsn"]) for info in infos)
        diff = max_lsn - min_lsn
        assert diff < 2 * 1024 * 1024, f"LSN diff={diff}, expected diff < 2MB due to backpressure"
+
+
+def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder):
+    """
+    Check that an unlogged relation is handled properly on a sharded tenant
+
+    Reproducer for https://github.com/neondatabase/neon/issues/7451
+    """
+
+    neon_env_builder.num_pageservers = 2
+    env = neon_env_builder.init_configs()
+    neon_env_builder.start()
+
+    tenant_id = TenantId.generate()
+    timeline_id = TimelineId.generate()
+    env.neon_cli.create_tenant(tenant_id, timeline_id, shard_count=8)
+
+    # We will create many tables to ensure it's overwhelmingly likely that at least one
+    # of them doesn't land on shard 0
+    table_names = [f"my_unlogged_{i}" for i in range(0, 16)]
+
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as ep:
+        for table_name in table_names:
+            ep.safe_psql(f"CREATE UNLOGGED TABLE {table_name} (id integer, value varchar(64));")
+            ep.safe_psql(f"INSERT INTO {table_name} VALUES (1, 'foo')")
+            result = ep.safe_psql(f"SELECT * from {table_name};")
+            assert result == [(1, "foo")]
+            ep.safe_psql(f"CREATE INDEX ON {table_name} USING btree (value);")
+
+        wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
+
+    with env.endpoints.create_start("main", tenant_id=tenant_id) as ep:
+        for table_name in table_names:
+            # Check that table works: we can select and insert
+            result = ep.safe_psql(f"SELECT * from {table_name};")
+            assert result == []
+            ep.safe_psql(f"INSERT INTO {table_name} VALUES (2, 'bar');")
+            result = ep.safe_psql(f"SELECT * from {table_name};")
+            assert result == [(2, "bar")]
+
+        # Ensure that post-endpoint-restart modifications are ingested happily by pageserver
+        wait_for_last_flush_lsn(env, ep, tenant_id, timeline_id)
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -16,7 +16,6 @@ from fixtures.pageserver.utils import (
    wait_for_upload,
    wait_tenant_status_404,
 )
-from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import (
    LocalFsStorage,
    RemoteStorageKind,
@@ -24,7 +23,6 @@ from fixtures.remote_storage import (
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import (
    query_scalar,
-    subprocess_capture,
    wait_until,
 )

@@ -184,20 +182,14 @@ def post_migration_check(endpoint: Endpoint, sum_before_migration: int, old_loca
        # A minor migration involves no storage breaking changes.
        # It is done by attaching the tenant to a new pageserver.
        "minor",
-        # A major migration involves exporting a postgres datadir
-        # basebackup and importing it into the new pageserver.
-        # This kind of migration can tolerate breaking changes
-        # to storage format
-        "major",
+        # In the unlikely and unfortunate event that we have to break
+        # the storage format, extend this test with the param below.
+        # "major",
    ],
 )
@pytest.mark.parametrize("with_load", ["with_load", "without_load"])
 def test_tenant_relocation(
    neon_env_builder: NeonEnvBuilder,
-    port_distributor: PortDistributor,
-    test_output_dir: Path,
-    neon_binpath: Path,
-    base_dir: Path,
    method: str,
    with_load: str,
 ):
@@ -299,40 +291,7 @@ def test_tenant_relocation(
        current_lsn=current_lsn_second,
    )

-    # Migrate either by attaching from s3 or import/export basebackup
-    if method == "major":
-        cmd = [
-            "poetry",
-            "run",
-            "python",
-            str(base_dir / "scripts/export_import_between_pageservers.py"),
-            "--tenant-id",
-            str(tenant_id),
-            "--from-host",
-            "localhost",
-            "--from-http-port",
-            str(origin_http.port),
-            "--from-pg-port",
-            str(origin_ps.service_port.pg),
-            "--to-host",
-            "localhost",
-            "--to-http-port",
-            str(destination_http.port),
-            "--to-pg-port",
-            str(destination_ps.service_port.pg),
-            "--pg-distrib-dir",
-            str(neon_env_builder.pg_distrib_dir),
-            "--work-dir",
-            str(test_output_dir),
-            "--tmp-pg-port",
-            str(port_distributor.get_port()),
-        ]
-        subprocess_capture(test_output_dir, cmd, check=True)
-
-        destination_ps.allowed_errors.append(
-            ".*ignored .* unexpected bytes after the tar archive.*"
-        )
-    elif method == "minor":
+    if method == "minor":
        # call to attach timeline to new pageserver
        destination_ps.tenant_attach(tenant_id)

--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -292,33 +292,12 @@ def test_single_branch_get_tenant_size_grows(
    Operate on single branch reading the tenants size after each transaction.
    """

-    # Disable automatic gc and compaction.
-    # The pitr_interval here is quite problematic, so we cannot really use it.
-    # it'd have to be calibrated per test executing env.
-
-    # there was a bug which was hidden if the create table and first batch of
-    # inserts is larger than gc_horizon. for example 0x20000 here hid the fact
-    # that there next_gc_cutoff could be smaller than initdb_lsn, which will
-    # obviously lead to issues when calculating the size.
-    gc_horizon = 0x3BA00
-
-    # it's a bit of a hack, but different versions of postgres have different
-    # amount of WAL generated for the same amount of data. so we need to
-    # adjust the gc_horizon accordingly.
-    if pg_version == PgVersion.V14:
-        gc_horizon = 0x4A000
-    elif pg_version == PgVersion.V15:
-        gc_horizon = 0x3BA00
-    elif pg_version == PgVersion.V16:
-        gc_horizon = 210000
-    else:
-        raise NotImplementedError(pg_version)
-
+    # Disable automatic compaction and GC, and set a long PITR interval: we will expect
+    # size to always increase with writes as all writes remain within the PITR
    tenant_config = {
        "compaction_period": "0s",
        "gc_period": "0s",
-        "pitr_interval": "0s",
-        "gc_horizon": gc_horizon,
+        "pitr_interval": "3600s",
    }

    env = neon_env_builder.init_start(initial_tenant_conf=tenant_config)
@@ -332,18 +311,6 @@ def test_single_branch_get_tenant_size_grows(

    size_debug_file = open(test_output_dir / "size_debug.html", "w")

-    def check_size_change(
-        current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev_size: int
-    ):
-        if current_lsn - initdb_lsn >= gc_horizon:
-            assert (
-                size >= prev_size
-            ), f"tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})"
-        else:
-            assert (
-                size > prev_size
-            ), f"tenant_size should grow, because we continue to add WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})"
-
    def get_current_consistent_size(
        env: NeonEnv,
        endpoint: Endpoint,
@@ -412,14 +379,6 @@ def test_single_branch_get_tenant_size_grows(
            )

            prev_size = collected_responses[-1][2]
-
-            # branch start shouldn't be past gc_horizon yet
-            # thus the size should grow as we insert more data
-            # "gc_horizon" is tuned so that it kicks in _after_ the
-            # insert phase, but before the update phase ends.
-            assert (
-                current_lsn - initdb_lsn <= gc_horizon
-            ), "Tuning of GC window is likely out-of-date"
            assert size > prev_size

            collected_responses.append(("INSERT", current_lsn, size))
@@ -439,8 +398,7 @@ def test_single_branch_get_tenant_size_grows(
            )

            prev_size = collected_responses[-1][2]
-
-            check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)
+            assert size > prev_size

            collected_responses.append(("UPDATE", current_lsn, size))

@@ -457,8 +415,7 @@ def test_single_branch_get_tenant_size_grows(
            )

            prev_size = collected_responses[-1][2]
-
-            check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)
+            assert size > prev_size

            collected_responses.append(("DELETE", current_lsn, size))

@@ -469,20 +426,20 @@ def test_single_branch_get_tenant_size_grows(
        with endpoint.cursor() as cur:
            cur.execute("DROP TABLE t0")

-        # Without setting a PITR interval, dropping the table doesn't reclaim any space
-        # from the user's point of view, because the DROP transaction is too small
-        # to fall out of gc_horizon.
+        # Dropping the table doesn't reclaim any space
+        # from the user's point of view, because the DROP transaction is still
+        # within pitr_interval.
        (current_lsn, size) = get_current_consistent_size(
            env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
        )
-        prev_size = collected_responses[-1][2]
-        check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)
+        assert size >= prev_size
+        prev_size = size

-        # Set a tiny PITR interval to allow the DROP to impact the synthetic size
+        # Set a zero PITR interval to allow the DROP to impact the synthetic size
        # Because synthetic size calculation uses pitr interval when available,
        # when our tenant is configured with a tiny pitr interval, dropping a table should
        # cause synthetic size to go down immediately
-        tenant_config["pitr_interval"] = "1ms"
+        tenant_config["pitr_interval"] = "0s"
        env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config)
        (current_lsn, size) = get_current_consistent_size(
            env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
@@ -494,10 +451,6 @@ def test_single_branch_get_tenant_size_grows(
        # defined by gc_horizon.
        collected_responses.append(("DROP", current_lsn, size))

-    # Should have gone past gc_horizon, otherwise gc_horizon is too large
-    bytes_written = current_lsn - initdb_lsn
-    assert bytes_written > gc_horizon
-
    # this isn't too many lines to forget for a while. observed while
    # developing these tests that locally the value is a bit more than what we
    # get in the ci.
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -38,7 +38,6 @@ futures-sink = { version = "0.3" }
 futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
 hashbrown = { version = "0.14", features = ["raw"] }
-hdrhistogram = { version = "7" }
 hex = { version = "0.4", features = ["serde"] }
 hmac = { version = "0.12", default-features = false, features = ["reset"] }
 hyper = { version = "0.14", features = ["full"] }
@@ -67,9 +66,8 @@ sha2 = { version = "0.10", features = ["asm"] }
 smallvec = { version = "1", default-features = false, features = ["const_new", "write"] }
 subtle = { version = "2" }
 time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] }
-tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util", "tracing"] }
+tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
 tokio-rustls = { version = "0.24" }
-tokio-stream = { version = "0.1", features = ["net"] }
 tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] }
 toml_datetime = { version = "0.6", default-features = false, features = ["serde"] }
 toml_edit = { version = "0.19", features = ["serde"] }