Compare commits


54 Commits

Author SHA1 Message Date
Conrad Ludgate
db7b244fdb custom params fmt 2024-02-02 17:02:33 +00:00
Conrad Ludgate
e00127e84b less small allocs for startup params 2024-02-02 16:45:33 +00:00
Conrad Ludgate
6506fd14c4 proxy: more refactors (#6526)
## Problem

not really any problem, just some drive-by changes

## Summary of changes

1. move wake compute
2. move json processing
3. move handle_try_wake
4. move test backend to api provider
5. reduce wake-compute concerns
6. remove duplicate wake-compute loop
2024-02-02 16:07:35 +00:00
John Spray
46fb1a90ce pageserver: avoid calculating/sending logical sizes on shard !=0 (#6567)
## Problem

Sharded tenants only maintain accurate relation sizes on shard 0.
Therefore logical size can only be calculated on shard 0. Fortunately it
is also only _needed_ on shard 0, to provide Safekeeper feedback and to
send consumption metrics.

Closes: #6307

## Summary of changes

- Send 0 for logical size to safekeepers on shards !=0
- Skip logical size warmup task on shards !=0
- Skip imitate_layer_accesses on shards !=0
2024-02-02 15:52:03 +00:00
John Spray
56171cbe8c pageserver: more permissive activation timeout when testing (#6564)
## Problem

The 5 second activation timeout is appropriate for production
environments, where we want to give a prompt response to the cloud
control plane, and if we fail it will retry the call. In tests, however,
we don't want every call to e.g. timeline creation to have to come with a
retry wrapper.

This issue has always been there, but it is more apparent in sharding
tests that concurrently attach several tenant shards.

Closes: https://github.com/neondatabase/neon/issues/6563

## Summary of changes

When the `testing` feature is enabled, make `ACTIVE_TENANT_TIMEOUT` 30
seconds instead of 5 seconds.
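A rough sketch of what such a feature-gated constant can look like (the constant name is taken from the summary above; the exact definition in the codebase may differ):

```
use std::time::Duration;

// Sketch: pick a longer activation timeout when the `testing` cargo
// feature is enabled, so tests that concurrently attach several tenant
// shards don't need retry wrappers around their API calls.
pub const ACTIVE_TENANT_TIMEOUT: Duration = if cfg!(feature = "testing") {
    Duration::from_secs(30) // generous limit for tests
} else {
    Duration::from_secs(5) // prompt responses for the control plane
};
```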
2024-02-02 15:14:42 +01:00
Arpad Müller
48b05b7c50 Add a time_travel_remote_storage http endpoint (#6533)
Adds an endpoint to the pageserver to S3-recover an entire tenant to a
specific given timestamp.

Required input parameters:
* `travel_to`: the target timestamp to recover the S3 state to
* `done_if_after`: a timestamp that marks the beginning of the recovery
process. Retries of the query should keep this value constant. It *must*
be after `travel_to`, after any changes we want to revert, and before the
moment the endpoint is called, with all of these points in time measured
by the time source used by S3. These criteria need to hold even in the
face of clock differences, so I recommend waiting a specific amount of
time, then taking `done_if_after`, then waiting again, and only then
issuing the request.

Also important to note: the timestamps in S3 work at second accuracy, so
one needs to add generous waits before and after for the process to work
smoothly (at least 2-3 seconds).
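As an illustration of that recommendation (not actual client code; the wait durations are example values):

```
use std::thread::sleep;
use std::time::{Duration, SystemTime};

// Wait, take `done_if_after`, wait again, and only then issue the
// request. The padding accounts for S3's second-granularity timestamps
// and for clock differences between us and S3.
fn choose_done_if_after() -> SystemTime {
    sleep(Duration::from_secs(3)); // `travel_to` and reverted changes are now safely in the past
    let done_if_after = SystemTime::now();
    sleep(Duration::from_secs(3)); // wait again before issuing the request
    done_if_after
}
```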

We ignore the added test for the mocked S3 for now due to a limitation
in moto: https://github.com/getmoto/moto/issues/7300 .

Part of https://github.com/neondatabase/cloud/issues/8233
2024-02-02 14:52:12 +01:00
Conrad Ludgate
0856fe6676 proxy: remove per client bytes (#5466)
## Problem

Follow up to #5461

In my memory usage/fragmentation measurements, these metrics came up as
a large source of small allocations. The replacement metric has been in
use for a long time now, so I think it's good to finally remove this.
Per-endpoint data is still tracked elsewhere.

## Summary of changes

remove the per-client bytes metrics
2024-02-02 12:28:48 +00:00
Alexander Bayandin
4133d14a77 Compute: pgbouncer 1.22.0 (#6582)
## Problem
Update pgbouncer from 1.21 (and patches[0][1]) to 1.22 (which includes
these patches)
- [0] https://github.com/pgbouncer/pgbouncer/pull/972
- [1] https://github.com/pgbouncer/pgbouncer/pull/998

## Summary of changes
- Build pgbouncer 1.22.0 for neonVMs from upstream
2024-02-02 11:49:11 +00:00
Alexander Bayandin
30c9e145d7 check-macos-build: switch job to macos-14 (M1) (#6539)
## Problem
- GitHub has made `macos-14` runners available, and they run on M1
processors[0]
- The price is the same as for Intel-based runners ("macOS | 3 or 4 (M1 or
Intel) | $0.08"[1]), but runners on Apple Silicon should be significantly
faster than their Intel counterparts.
- Most developers who use macOS use Apple Silicon-based Macs nowadays.

- [0] https://github.blog/changelog/2024-01-30-github-actions-introducing-the-new-m1-macos-runner-available-to-open-source/
- [1] https://docs.github.com/en/billing/managing-billing-for-github-actions/about-billing-for-github-actions#per-minute-rates

## Summary of changes
- Run `check-macos-build` on `macos-14`
2024-02-02 10:51:20 +00:00
John Spray
24e916d37f pageserver: fix a syntax error in swagger (#6566)
A description was written as a follow-on to a section line, rather than
in the proper `description:` part. This caused swagger parsers to
rightly reject it.
2024-02-02 10:35:09 +00:00
Andreas Scherbaum
23f58145ed Update wording for better readability (#6559)
Update wording, add spaces in command-line arguments

Co-authored-by: Andreas Scherbaum <andreas@neon.tech>
2024-02-02 11:22:32 +01:00
Heikki Linnakangas
350865392c Print checkpoint key contents with "pagectl print-layer-file" (#6541)
This was very useful in debugging the bugs fixed in #6410 and #6502.

There's a lot more we could do. This only adds the printing to delta
layers, not image layers, for example, and it might be useful to print
details of more record types. But this is a good start.
2024-02-02 01:35:31 +02:00
Christian Schwarz
1be5e564ce feat(walredo): use posix_spawn by moving close_fds() work to walredo C code (#6574)
The Rust stdlib uses the efficient `posix_spawn` by default.
However, before this PR, the pageserver used `pre_exec()` in our
`close_fds()` extension trait, which forces the slower fork+exec path.

This PR moves the work that `close_fds()` did to the walredo C code.
I verified manually using `gdb` that we're now spawning the walredo
process using `posix_spawn`.
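For context, the stdlib can only take the `posix_spawn` fast path when no pre-exec hooks are registered; a toy illustration (the binary name is a stand-in):

```
use std::os::unix::process::CommandExt;
use std::process::{Child, Command};

// Registering any `pre_exec` hook forces the classic fork+exec path,
// because the hook must run in the forked child. With no hooks, the
// stdlib can use the cheaper `posix_spawn`.
fn spawn_child(force_fork: bool) -> std::io::Result<Child> {
    let mut cmd = Command::new("true"); // stand-in for the walredo process
    if force_fork {
        // Anything here (e.g. closing inherited fds) disables posix_spawn.
        unsafe { cmd.pre_exec(|| Ok(())) };
    }
    cmd.spawn()
}
```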

refs https://github.com/neondatabase/neon/issues/6565
2024-02-01 22:38:34 +01:00
Christian Schwarz
7a70ef991f feat(walredo): various observability improvements (#6573)
- log when we start walredo process
- include tenant shard id in walredo argv
- dump some basic walredo state in tenant details api
- more suitable walredo process launch histogram buckets
- avoid duplicate tracing labels in walredo launch spans
2024-02-01 21:59:40 +01:00
Sasha Krassovsky
be30388901 Add retry to fetching basebackup (#6537)
## Problem
Currently we have no retry mechanism for fetching the basebackup. If there's
an unstable connection, starting the compute will just fail.

## Summary of changes
Adds an exponential backoff with 7 retries to get the basebackup.
2024-02-01 20:50:04 +00:00
Heikki Linnakangas
3525080031 Fix pgvector 0.6.0 with Neon. (#6571)
The previous patch was broken: rd_smgr was not open yet, so we need to use
RelationGetSmgr() to access it.
2024-02-01 20:48:31 +00:00
Arpad Müller
527cdbc010 Don't require AWS access keys for S3 pytests (#6556)
Don't require AWS access keys (AWS_ACCESS_KEY_ID and
AWS_SECRET_ACCESS_KEY) for S3 usage in the pytests, and also allow
AWS_PROFILE to be passed.

One of the two methods is required, however.

This allows local development like:

```
aws sso login --profile dev
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty REMOTE_STORAGE_S3_REGION=eu-central-1 REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests AWS_PROFILE=dev
cargo build_testing && RUST_BACKTRACE=1 ./scripts/pytest -k debug-pg16 test_runner/regress/test_tenant_delete.py::test_tenant_delete_smoke
```

Related earlier PR for the cargo unit tests of the `remote_storage` crate: #6202.

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
2024-02-01 20:18:07 +00:00
Alexander Bayandin
39be2b0108 Makefile: set PQ_LIB_DIR to avoid linkage with system libpq (#6538)
## Problem

Initially spotted on macOS. When building `attachment_service`, it might
get linked with system `libpq`:
```
$ otool -L target/debug/attachment_service
target/debug/attachment_service:
	/opt/homebrew/opt/libpq/lib/libpq.5.dylib (compatibility version 5.0.0, current version 5.16.0)
	/System/Library/Frameworks/Security.framework/Versions/A/Security (compatibility version 1.0.0, current version 61040.61.1)
	/System/Library/Frameworks/CoreFoundation.framework/Versions/A/CoreFoundation (compatibility version 150.0.0, current version 2202.0.0)
	/usr/lib/libiconv.2.dylib (compatibility version 7.0.0, current version 7.0.0)
	/usr/lib/libSystem.B.dylib (compatibility version 1.0.0, current version 1336.61.1)
```

After this PR:
```
$ otool -L target/debug/attachment_service 
target/debug/attachment_service:
	/Users/bayandin/work/neon/pg_install/v16/lib/libpq.5.dylib (compatibility version 5.0.0, current version 5.16.0)
	/System/Library/Frameworks/Security.framework/Versions/A/Security (compatibility version 1.0.0, current version 61040.61.1)
	/System/Library/Frameworks/CoreFoundation.framework/Versions/A/CoreFoundation (compatibility version 150.0.0, current version 2202.0.0)
	/usr/lib/libiconv.2.dylib (compatibility version 7.0.0, current version 7.0.0)
	/usr/lib/libSystem.B.dylib (compatibility version 1.0.0, current version 1336.61.1)
```

## Summary of changes
- Set `PQ_LIB_DIR` to bundled Postgres 16 lib dir
2024-02-01 17:34:48 +00:00
Alexander Bayandin
fa52cd575e Remove old tests results and old coverage collection (#6376)
## Problem
We have switched to new test results and new coverage results, so there is no
need to collect this data in the old formats.

## Summary of changes
- Remove "Upload coverage report" for old coverage report
- Remove "Store Allure test stat in the DB" for old test results format
2024-02-01 13:36:55 +00:00
Vlad Lazar
d2c410c748 pageserver_api: remove overlaps from KeySpace (#6544)
This commit adds a function to `KeySpace` which updates a key space
by removing all overlaps with a second key space. This can involve
splitting or removing existing ranges.

The implementation is not particularly efficient: O(M * N * log(N))
where N is the number of ranges in the current key space and M is the
number of ranges in the key space we are checking against. In practice,
this shouldn't matter much since, in the short term, the only caller of
this function will be the vectored read path and the number of key
spaces involved will be small. This follows from the upper bound placed
on the number of keys accepted by the vectored read path.

A couple of other small utility functions are added. They'll be used by the
vectored search path as well.
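
A simplified sketch of the overlap-removal idea, using plain integer ranges instead of the real `Key` ranges (illustrative only, not the actual implementation):

```
use std::ops::Range;

// Remove from `ranges` every part that overlaps a range in `other`.
// An overlap can trim a range, split it in two, or remove it entirely.
fn remove_overlaps(ranges: &[Range<u64>], other: &[Range<u64>]) -> Vec<Range<u64>> {
    let mut out = Vec::new();
    for r in ranges {
        let mut pieces = vec![r.clone()];
        for o in other {
            let mut next = Vec::new();
            for p in pieces {
                if p.end <= o.start || o.end <= p.start {
                    next.push(p); // no overlap, keep as-is
                } else {
                    // keep the non-overlapping left/right remainders
                    if p.start < o.start {
                        next.push(p.start..o.start);
                    }
                    if o.end < p.end {
                        next.push(o.end..p.end);
                    }
                }
            }
            pieces = next;
        }
        out.extend(pieces);
    }
    out
}
```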
2024-02-01 13:14:35 +00:00
Vlad Lazar
221531c9db pageserver: lift ancestor timeline logic from read path (#6543)
When the read path needs to follow a key into the ancestor timeline, it
needs to wait for said ancestor to become active and aware of its
branching lsn. The logic is lifted into a separate function with its
own new error type.

This is done because the vectored read path needs the same logic. It's
also the reason for the newly introduced error type.

When we switch the read path to proxy into `get_vectored`, we can
remove the duplicated variants from `PageReconstructError`.
2024-02-01 10:35:18 +00:00
Christian Schwarz
4c173456dc pagebench: fix percentiles reporting (#6547)
Before this patch, pagebench was always showing the same value.

refs https://github.com/neondatabase/neon/issues/6509
2024-01-31 23:29:48 +00:00
Christian Schwarz
e82625b77d refactor(pageserver main): signal handling (#6554)
This refactoring makes it easier to experimentally replace
BACKGROUND_RUNTIME with a single-threaded runtime. Found this useful
[during benchmarking](https://github.com/neondatabase/neon/pull/6555).
2024-01-31 23:25:57 +00:00
Christian Schwarz
0ac1e71524 update tokio-epoll-uring (#6558)
to pull in fixes for
https://github.com/neondatabase/tokio-epoll-uring/issues/37
2024-01-31 22:54:54 +00:00
Anna Khanova
271133d960 Proxy: reduce number of get role secret calls (#6557)
## Problem

Right now, if the get_role_secret response wasn't cached (e.g. the cache
already reached its max size), the proxy will send a second, identical
request.

## Summary of changes

Avoid the needless request.
2024-01-31 22:16:56 +00:00
Joonas Koivunen
3d5fab127a rewrite Gate impl for better observability (#6542)
changes:
- two messages instead of a message every second when the gate was closing
- replace the gate name string by using a pointer
- slow GateGuards are likely to log who they were (see example)

example found in regress tests: <https://github.com/neondatabase/neon/pull/6542#issuecomment-1919009256>
2024-01-31 22:15:58 +00:00
Joonas Koivunen
66719d7eaf logging: fix span usage (#6549)
Fixes some duplication due to extra or misconfigured `#[instrument]`,
while filling in the `timeline_id` for delete-timeline flow calls.
2024-01-31 20:52:00 +00:00
Konstantin Knizhnik
9a9d9beaee Download SLRU segments on demand (#6151)
## Problem

See https://github.com/neondatabase/cloud/issues/8673

## Summary of changes


Download missing SLRU segments from the pageserver.


---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2024-01-31 21:39:18 +02:00
John Spray
2bfc831c60 control_plane/attachment_service: make --path optional (#6545)
## Problem

The `--path` argument is only used in testing, for compat tests that use
a JSON snapshot of state rather than the postgres database. In regular
deployments, it should be omitted (currently one has to specify `--path
""`)

## Summary of changes

Make `--path` optional.
2024-01-31 17:02:41 +00:00
Joonas Koivunen
799db161d3 tests: support for running on single pg version, use in one place (#6525)
Some tests, which are more like unit tests, do not need to run on different pg
versions. The logging test is one of them, which I found for unrelated
reasons.

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
2024-01-31 17:37:25 +02:00
Arpad Müller
47380be12d Remove version param from get_lsn_by_timestamp (#6551)
This removes the last remnants of the version param added by #5608,
concluding the transition plan laid out in
https://github.com/neondatabase/cloud/pull/7553#discussion_r1370473911.
It follows PR https://github.com/neondatabase/cloud/pull/9202, which we
now assume has been deployed to all environments.

Full history:

* https://github.com/neondatabase/neon/pull/5608 
* https://github.com/neondatabase/cloud/pull/7553
* https://github.com/neondatabase/neon/pull/6178
* https://github.com/neondatabase/cloud/pull/9202
2024-01-31 15:30:19 +01:00
Conrad Ludgate
c7b02ce8ec proxy: use jemalloc (#6531)
## Summary of changes

Experiment with jemalloc in proxy
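
With the `tikv-jemallocator` crate (added to the workspace in this PR's lockfile changes further down), opting a binary into jemalloc is typically a one-liner at the crate root; a minimal sketch:

```
// All heap allocations in this binary now go through jemalloc.
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

fn main() {
    let v: Vec<u8> = Vec::with_capacity(4096); // served by jemalloc
    drop(v);
}
```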
2024-01-31 14:51:11 +01:00
John Spray
4010adf653 control_plane/attachment_service: complete APIs (#6394)
Depends on: https://github.com/neondatabase/neon/pull/6468

## Problem

The sharding service will be used as a "virtual pageserver" by the
control plane -- so it needs the set of pageserver APIs that the control
plane uses, and must present them under identical URLs, including the prefix
(/v1).

## Summary of changes

- Add missing APIs:
  - Tenant deletion
  - Timeline deletion
  - Node list (used in test now, later in tools)
- `/location_config` API (for migrating tenants into the sharding
service)
- Rework attachment service URLs:
  - `/v1` prefix is used for pageserver-compatible APIs
- `/upcall/v1` prefix is used for APIs that are called by the pageserver
(re-attach and validate)
  - `/debug/v1` prefix is used for endpoints that are for testing
- `/control/v1` prefix is used for new sharding service APIs that do not
mimic a pageserver API, such as registering and configuring nodes.
- Add test_sharding_service. The sharding service already had some
collateral coverage from its use in general tests, but this is the first
dedicated testing for it.
2024-01-31 12:23:06 +00:00
Konstantin Knizhnik
e10a7ee391 Prevent too frequent reconnects in case of race condition errors returned by PS (tenant not found) (#6522)
## Problem

See https://neondb.slack.com/archives/C04DGM6SMTM/p1706531433057289

## Summary of changes

1. Do not decrease the reconnect timeout until the maximal interval value
(1 second) is reached.
2. Compute the reconnect time after the connection attempt has completed, to
exclude the connect time itself from the interval measurement.

So now a backend should not perform more than 4 reconnect attempts per
second.
But please note that the backoff is performed locally in each backend, so
if there are many active backends, the connection (and so error) rate may
be much higher.
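
A minimal sketch of the described policy (illustrative only; the actual change lives in the compute's C code, and the initial delay here is an assumed value):

```
use std::time::Duration;

// Back off between reconnect attempts: the delay grows until it reaches
// the 1 second cap, and the wait starts only *after* the attempt has
// completed, so connect time is excluded from the interval.
fn reconnect_loop(mut try_connect: impl FnMut() -> bool) {
    let max_delay = Duration::from_secs(1);
    let mut delay = Duration::from_millis(250); // assumed starting value
    loop {
        if try_connect() {
            return;
        }
        std::thread::sleep(delay); // measured from after the attempt
        delay = (delay * 2).min(max_delay);
    }
}
```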


---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2024-01-31 09:17:32 +02:00
Sasha Krassovsky
e8c9a51273 Allow creating subscriptions as neon_superuser (#6484)
## Problem
We currently can't create subscriptions in PG14 and PG15 because only
superusers can, and PG16 requires adding roles to
pg_create_subscription.

## Summary of changes
I added changes to PG14 and PG15 that allow neon_superuser to bypass the
superuser requirement. For PG16, I didn't do that but added a migration
that adds neon_superuser to pg_create_subscription. Also added a test to
make sure it works.
2024-01-30 22:32:33 -08:00
Alexander Bayandin
3c3ee8f3e8 Compute: add compatibility patch for pgvector (#6527)
## Problem

`pgvector` requires a patch to work well with Neon (a patch created by
@hlinnaka)

## Summary of changes
- Apply the patch to `pgvector`
2024-01-30 17:33:24 +00:00
Arpad Müller
6928a34f59 S3 DR: Large prefix improvements (#6515)
## Problem

PR #6500 removed the limit on the number of versions/deletions for
time travel calls. We never get informed about how many versions there
are, and thus the call would just hang without any indication of
progress.

## Summary of changes

We improve the pageserver's behaviour with large prefixes, i.e. those
with many keys, whether removed or currently still available.

* Add a hard limit of 100k versions/deletions. For the reasoning see
https://github.com/neondatabase/cloud/issues/8233#issuecomment-1915021625,
but TLDR it will roughly support tenants of 2 TiB size, of course
depending on general write activity and the duration of the S3 retention
window. The goal is to have a limit at all, so that the process doesn't
accumulate increasing numbers of versions until an eventual crash.
* Lower the RAM footprint of the `VerOrDelete` data structure. This
means we no longer cache a lot of redundant metadata in RAM, like the
owner ID. The top-level data structure's footprint goes down from 264
bytes to 80 (but it contains strings that are not counted in there).

Follow-up of #6500, part of https://github.com/neondatabase/cloud/issues/8233

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2024-01-30 15:57:27 +00:00
Arseny Sher
bc684e9d3b Make WAL segment init atomic.
Since fdatasync is used for flushing WAL, changing the file size is unsafe. Make
segment creation atomic by using a tmp file + rename, to avoid using partially
initialized segments.
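
A sketch of the tmp-file + rename pattern this uses (names and layout here are illustrative, not the safekeeper's actual code):

```
use std::fs::{self, File, OpenOptions};
use std::io;
use std::path::Path;

// Create and size the segment under a temporary name, fsync it, then
// atomically rename it into place, so a crash never leaves a partially
// initialized segment under the final name.
fn create_segment_atomically(dir: &Path, name: &str, size: u64) -> io::Result<()> {
    let tmp = dir.join(format!("{name}.partial"));
    let file = OpenOptions::new().create(true).write(true).open(&tmp)?;
    file.set_len(size)?; // pre-allocate the full segment size
    file.sync_all()?; // make the tmp file durable first
    fs::rename(&tmp, dir.join(name))?; // atomic within one filesystem
    File::open(dir)?.sync_all()?; // fsync the directory so the rename is durable (Unix)
    Ok(())
}
```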

fixes https://github.com/neondatabase/neon/issues/6402
2024-01-30 18:05:22 +04:00
Arseny Sher
08532231ee Fix find_end_of_wal busy loop.
It hung if the file size was less than that of a normal segment. Normally that doesn't
happen, but it might in case of a crash during segment init. We're going to fix
that half-initialized segment by durably renaming it after cooking, so this fix
won't be needed, but it's better to avoid the busy loop anyway.

fixes https://github.com/neondatabase/neon/issues/6401
2024-01-30 18:05:22 +04:00
Christian Schwarz
79137a089f fix(#6366): pageserver: incorrect log level for Tenant not found during basebackup (#6400)
Before this patch, when requesting basebackup for a not-found tenant or
timeline, we'd emit an ERROR-level log entry with a huge stack trace.
See the #6366 "Details" section for an example.

With this patch, we log at INFO level and only a single line.
Example:

```
2024-01-19T14:16:11.479800Z  INFO page_service_conn_main{peer_addr=127.0.0.1:43448}: query handler for 'basebackup d69a536d529a68fcf85bc070030cdf4b 035484e9c28d8d0138a492caadd03ffd 0/2204340 --gzip' entity not found: Tenant d69a536d529a68fcf85bc070030cdf4b not found
2024-01-19T14:19:35.807819Z  INFO page_service_conn_main{peer_addr=127.0.0.1:48862}: query handler for 'basebackup d69a536d529a68fcf85bc070030cdf4a 035484e9c28d8d0138a492caadd03ffd 0/2204340 --gzip' entity not found: Timeline d69a536d529a68fcf85bc070030cdf4a/035484e9c28d8d0138a492caadd03ffd was not found
```

fixes https://github.com/neondatabase/neon/issues/6366

Changes
-------

- Change `handle_basebackup_request` to return a `QueryError`
- The new `impl From<WaitLsnError> for QueryError` is needed so the `?`
at `wait_lsn()` call in `handle_basebackup_request` works again. It's
duplicating `impl From<WaitLsnError> for PageStreamError`.
- Remove the hard-to-spot conversion of the `handle_basebackup_request` return
value to anyhow::Result (the place where I replaced `anyhow::Ok` with
`Result::<(), QueryError>::Ok(())`)
- Add forgotten distinguished handling for "Tenant not found" case in
`impl From<GetActiveTenantError> for QueryError`

This was not at all pleasant, and I find it very hard to follow the
various error conversions.
It took me a while to spot the hard-to-spot `anyhow::Ok` thing above.
It would have been caught by the compiler if we weren't auto-converting
`anyhow::Error` into `QueryError::Other`.
We should move away from that, in my opinion, instead forcing each
`.context()` site to become `.context().map_err(QueryError::Other)`.
But that's for a future PR.
2024-01-30 13:10:48 +00:00
Joonas Koivunen
e3cb715e8a fix: capture initdb stderr, discard others (#6524)
When using spawn + wait_with_output instead of
std::process::Command::output or tokio::process::Command::output, we must
configure the redirection ourselves.

Fixes: #6523 by discarding stdout completely; we only care about
stderr, if any.
2024-01-30 14:07:58 +01:00
dependabot[bot]
c70bf9150f build(deps): bump aiohttp from 3.9.0 to 3.9.2 (#6518) 2024-01-30 10:46:49 +00:00
Alexander Bayandin
8e4da52069 Compute: pgvector 0.6.0 (#6517)
Update pgvector extension from 0.5.1 to 0.6.0
2024-01-30 09:29:45 +00:00
Arthur Petukhovsky
2ff1a5cecd Patch safekeeper control file on HTTP request (#6455)
Closes #6397
2024-01-29 18:20:57 +00:00
Conrad Ludgate
ec8dcc2231 flatten proxy flow (#6447)
## Problem

Taking my ideas from https://github.com/neondatabase/neon/pull/6283, but
making a bit less radical changes, in smaller commits.

The proxy flow was quite deeply nested, which made adding more interesting
error handling quite tricky.

## Summary of changes

I recommend reviewing commit by commit.

1. move handshake logic into a separate file
2. move passthrough logic into a separate file
3. no longer accept a closure in CancelMap session logic
4. Remove connect_to_db, copy logic into handle_client
5. flatten auth_and_wake_compute in authenticate
6. record info for link auth
2024-01-29 17:38:03 +00:00
Arpad Müller
b844c6f0c7 Do pagination in list_object_versions call (#6500)
## Problem

The tenants we want to recover might have tens of thousands of keys, or
more. At that point, the AWS API returns a paginated response.

## Summary of changes

Support paginated responses for `list_object_versions` requests.
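
The shape of such a pagination loop, sketched with the Rust AWS SDK (aws-sdk-s3); the exact response accessors vary between SDK versions, so treat this as an outline rather than the crate's actual code:

```
use aws_sdk_s3::Client;

// Keep requesting pages of object versions until the listing is no
// longer truncated, resuming from the markers of the previous page.
async fn count_all_versions(
    client: &Client,
    bucket: &str,
    prefix: &str,
) -> Result<usize, aws_sdk_s3::Error> {
    let mut key_marker = None;
    let mut version_marker = None;
    let mut total = 0usize;
    loop {
        let resp = client
            .list_object_versions()
            .bucket(bucket)
            .prefix(prefix)
            .set_key_marker(key_marker.take())
            .set_version_id_marker(version_marker.take())
            .send()
            .await?;
        total += resp.versions().len() + resp.delete_markers().len();
        if resp.is_truncated() == Some(true) {
            // Continue from where this page ended.
            key_marker = resp.next_key_marker().map(str::to_owned);
            version_marker = resp.next_version_id_marker().map(str::to_owned);
        } else {
            return Ok(total);
        }
    }
}
```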

Follow-up of #6155, part of https://github.com/neondatabase/cloud/issues/8233
2024-01-29 17:59:26 +01:00
Alexander Bayandin
6a85a06e1b Compute: build rdkit without freetype support (#6495)
## Problem
The `rdkit` extension is built with `RDK_BUILD_FREETYPE_SUPPORT=ON` (by
default), which requires a bunch of additional dependencies, but
freetype font support isn't required for Postgres.


With `RDK_BUILD_FREETYPE_SUPPORT=ON`:
```
ldd /usr/local/pgsql/lib/rdkit.so
	linux-vdso.so.1 (0x0000ffff82ea8000)
	libfreetype.so.6 => /usr/lib/aarch64-linux-gnu/libfreetype.so.6 (0x0000ffff825e5000)
	libboost_serialization.so.1.74.0 => /usr/lib/aarch64-linux-gnu/libboost_serialization.so.1.74.0 (0x0000ffff82590000)
	libpthread.so.0 => /lib/aarch64-linux-gnu/libpthread.so.0 (0x0000ffff8255f000)
	libstdc++.so.6 => /usr/lib/aarch64-linux-gnu/libstdc++.so.6 (0x0000ffff82387000)
	libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000ffff822dc000)
	libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x0000ffff822b8000)
	libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000ffff82144000)
	libpng16.so.16 => /usr/lib/aarch64-linux-gnu/libpng16.so.16 (0x0000ffff820fd000)
	libz.so.1 => /lib/aarch64-linux-gnu/libz.so.1 (0x0000ffff820d3000)
	libbrotlidec.so.1 => /usr/lib/aarch64-linux-gnu/libbrotlidec.so.1 (0x0000ffff820b8000)
	/lib/ld-linux-aarch64.so.1 (0x0000ffff82e78000)
	libbrotlicommon.so.1 => /usr/lib/aarch64-linux-gnu/libbrotlicommon.so.1 (0x0000ffff82087000)
```

With `RDK_BUILD_FREETYPE_SUPPORT=OFF`:
```
ldd /usr/local/pgsql/lib/rdkit.so
	linux-vdso.so.1 (0x0000ffffbba75000)
	libboost_serialization.so.1.74.0 => /usr/lib/aarch64-linux-gnu/libboost_serialization.so.1.74.0 (0x0000ffffbb259000)
	libpthread.so.0 => /lib/aarch64-linux-gnu/libpthread.so.0 (0x0000ffffbb228000)
	libstdc++.so.6 => /usr/lib/aarch64-linux-gnu/libstdc++.so.6 (0x0000ffffbb050000)
	libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000ffffbafa5000)
	libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x0000ffffbaf81000)
	libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000ffffbae0d000)
	/lib/ld-linux-aarch64.so.1 (0x0000ffffbba45000)
```

## Summary of changes
- Build `rdkit` with `RDK_BUILD_FREETYPE_SUPPORT=OFF`
- Remove extra dependencies from the Compute image
2024-01-29 16:16:37 +00:00
John Spray
b04a6acd6c docker: add attachment_service binary (#6506)
## Problem

Creating sharded tenants will require an instance of the sharding
service -- the initial goal is to deploy one of these in a staging
region (https://github.com/neondatabase/cloud/issues/9718). It will run
as a kubernetes container, similar to the storage broker, so it needs to be
built into the container image.

## Summary of changes

Add `attachment_service` binary to container image
2024-01-29 13:31:56 +00:00
Vlad Lazar
0c7b89235c pageserver: add range layer map search implementation (#6469)
## Problem
There's no efficient way of querying the layer map for a range.

## Summary of changes
Introduce a range query for the layer map (`LayerMap::range_search`).
There are two broad steps to it:
1. Find all coverage changes for layers that intersect the queried range
(see `LayerCoverage::range_overlaps`).
The slightly tricky part is dealing with the start of the range. We can
either be aligned with a layer or not and we need
to treat these cases differently.
2. Iterate over the coverage changes and collect the result. For this we
use a two pointer approach: the trailing pointer tracks the start of the
current range (current location in the key space) and the forward
pointer tracks the next coverage change.

Plugging the range search into the read path is deferred to a future PR.

## Performance
I adapted the layer map benchmarks on a local branch. Range searches are
between 2x and 2.5x slower than point searches. That's in line with what I
expected, since we query the layer map twice.

Since `Timeline::get` will proxy to `Timeline::get_vectored`, we can
special-case the one-element layer map range search
at that point.
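
A toy version of step 2's two-pointer collection (illustrative; the real code works over `LayerCoverage` and layer handles):

```
use std::ops::Range;

// `changes` holds the coverage-change points intersecting [start, end),
// sorted by key: after `key`, `layer` is the covering layer. `initial`
// covers `start`. The trailing pointer `current_start` marks the start
// of the current sub-range; each change point closes one result.
fn collect_range(
    start: u64,
    end: u64,
    changes: &[(u64, Option<&'static str>)],
    initial: Option<&'static str>,
) -> Vec<(Range<u64>, Option<&'static str>)> {
    let mut results = Vec::new();
    let mut current_start = start;
    let mut current_layer = initial;
    for &(key, layer) in changes {
        if key >= end {
            break;
        }
        if key > current_start {
            results.push((current_start..key, current_layer));
            current_start = key;
        }
        current_layer = layer;
    }
    results.push((current_start..end, current_layer)); // tail sub-range
    results
}
```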
2024-01-29 09:47:12 +00:00
Joonas Koivunen
1e9a50bca8 disk_usage_eviction_task: cleanup summaries (#6490)
This is the "partial revert" of #6384. The summaries turned out to be
expensive due to naive vec usage, but also inconclusive because of the
additional context required. In addition to removing summary traces,
small refactoring is done.
2024-01-29 10:38:40 +02:00
Conrad Ludgate
511e730cc0 hll experiment (#6312)
## Problem

Measuring cardinality using logs is expensive and slow.

## Summary of changes

Implement a pre-aggregated HyperLogLog-based cardinality estimate.
HyperLogLog estimates the cardinality of a set using the fact that the
probability of the uniform hash of a value having a run of n 0s at the end
is `1/2^n`; therefore, having observed a run of `n` 0s suggests we have
measured `2^n` distinct values. By using multiple shards, we can use the
harmonic mean to get a more accurate estimate.

We record this into a Prometheus time-series. HyperLogLog counts can be
merged by taking the `max` of each shard. We can apply a `max_over_time`
in order to estimate the cardinality of distinct values over
time.
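
A minimal sketch of the idea (trailing-zero runs per shard, max-merge, harmonic-mean estimate; bias-correction constants omitted, and all names are illustrative rather than the proxy's implementation):

```
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

struct Hll<const SHARDS: usize> {
    zeros: [u8; SHARDS], // longest trailing-zero run seen per shard
}

impl<const SHARDS: usize> Hll<SHARDS> {
    fn new() -> Self {
        Self { zeros: [0; SHARDS] }
    }

    fn observe<T: Hash>(&mut self, value: &T) {
        let mut h = DefaultHasher::new();
        value.hash(&mut h);
        let hash = h.finish();
        let shard = (hash as usize) % SHARDS; // route by the low bits
        // A run of n trailing zeros has probability 1/2^n, suggesting
        // ~2^n distinct values observed. Skip the routing bits.
        let run = (hash >> 8).trailing_zeros() as u8;
        self.zeros[shard] = self.zeros[shard].max(run);
    }

    // Merging two counts is just a per-shard max, as described above.
    fn merge(&mut self, other: &Self) {
        for (a, b) in self.zeros.iter_mut().zip(other.zeros.iter()) {
            *a = (*a).max(*b);
        }
    }

    // Harmonic mean of per-shard estimates (constant factor omitted).
    fn estimate(&self) -> f64 {
        let denom: f64 = self.zeros.iter().map(|&z| 2f64.powi(-(z as i32))).sum();
        (SHARDS as f64) * (SHARDS as f64) / denom
    }
}
```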
2024-01-29 07:26:20 +00:00
Konstantin Knizhnik
c1148dc9ac Fix calculation of maximal multixact in ingest_multixact_create_record (#6502)
## Problem

See https://neondb.slack.com/archives/C06F5UJH601/p1706373716661439

## Summary of changes

Use None instead of 0 as the initial accumulator value when calculating
the maximal multixact XID.


---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2024-01-29 07:39:16 +02:00
Anna Khanova
8253cf1931 proxy: Relax endpoint check (#6503)
## Problem

http-over-sql allows the host to be in the format api.aws...., however that's not
the case for the websocket flow.

## Summary of changes

Relax the endpoint check for the ws serverless connections.
2024-01-28 21:27:14 +00:00
Christian Schwarz
3a82430432 fixup(#6492): also switch the benchmarks that run on merge-to-main back to std-fs (#6501) 2024-01-28 00:15:11 +01:00
140 changed files with 5858 additions and 2283 deletions


@@ -1,27 +1,28 @@
*
!rust-toolchain.toml
!Cargo.toml
# Files
!Cargo.lock
!Cargo.toml
!Makefile
!rust-toolchain.toml
!scripts/combine_control_files.py
!scripts/ninstall.sh
!vm-cgconfig.conf
# Directories
!.cargo/
!.config/
!control_plane/
!compute_tools/
!control_plane/
!libs/
!neon_local/
!pageserver/
!patches/
!pgxn/
!proxy/
!safekeeper/
!s3_scrubber/
!safekeeper/
!storage_broker/
!trace/
!vendor/postgres-v14/
!vendor/postgres-v15/
!vendor/postgres-v16/
!vendor/postgres-*/
!workspace_hack/
!neon_local/
!scripts/ninstall.sh
!scripts/combine_control_files.py
!vm-cgconfig.conf


@@ -4,6 +4,8 @@ self-hosted-runner:
- dev
- gen3
- large
# Remove `macos-14` from the list after https://github.com/rhysd/actionlint/pull/392 is merged.
- macos-14
- small
- us-east-2
config-variables:


@@ -179,23 +179,6 @@ runs:
aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
fi
- name: Store Allure test stat in the DB
if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
shell: bash -euxo pipefail {0}
env:
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
REPORT_JSON_URL: ${{ steps.generate-report.outputs.report-json-url }}
run: |
export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR}
./scripts/pysync
poetry run python3 scripts/ingest_regress_test_result.py \
--revision ${COMMIT_SHA} \
--reference ${GITHUB_REF} \
--build-type unified \
--ingest ${WORKDIR}/report/data/suites.json
- name: Store Allure test stat in the DB (new)
if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
shell: bash -euxo pipefail {0}


@@ -508,7 +508,7 @@ jobs:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
@@ -531,7 +531,6 @@ jobs:
with:
store-test-results-into-db: true
env:
REGRESS_TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
- uses: actions/github-script@v6
@@ -609,17 +608,6 @@ jobs:
--input-objects=/tmp/coverage/binaries.list \
--format=lcov
- name: Upload coverage report
id: upload-coverage-report
env:
BUCKET: neon-github-public-dev
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
run: |
aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA}
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html
echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
- name: Build coverage report NEW
id: upload-coverage-report-new
env:
@@ -656,21 +644,11 @@ jobs:
- uses: actions/github-script@v6
env:
REPORT_URL: ${{ steps.upload-coverage-report.outputs.report-url }}
REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
with:
script: |
const { REPORT_URL, REPORT_URL_NEW, COMMIT_SHA } = process.env
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
repo: context.repo.repo,
sha: `${COMMIT_SHA}`,
state: 'success',
target_url: `${REPORT_URL}`,
context: 'Code coverage report',
})
const { REPORT_URL_NEW, COMMIT_SHA } = process.env
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,


@@ -26,7 +26,7 @@ jobs:
contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
github.ref_name == 'main'
timeout-minutes: 90
runs-on: macos-latest
runs-on: macos-14
env:
# Use release build only, to have less debug info around
@@ -60,21 +60,21 @@ jobs:
uses: actions/cache@v3
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v3
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v3
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Set extra env for macOS
run: |
@@ -89,7 +89,7 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git
target
key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'
@@ -110,7 +110,7 @@ jobs:
run: make walproposer-lib -j$(sysctl -n hw.ncpu)
- name: Run cargo build
run: cargo build --all --release
run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release
- name: Check that no warnings are produced
run: ./run_clippy.sh


@@ -20,7 +20,7 @@ ln -s ../../pre-commit.py .git/hooks/pre-commit
This will run following checks on staged files before each commit:
- `rustfmt`
- checks for python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
- checks for Python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project
and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date.

Cargo.lock (generated)

@@ -285,7 +285,6 @@ dependencies = [
"metrics",
"pageserver_api",
"pageserver_client",
"postgres_backend",
"postgres_connection",
"serde",
"serde_json",
@@ -1145,16 +1144,6 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
[[package]]
name = "close_fds"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3bc416f33de9d59e79e57560f450d21ff8393adcf1cdfc3e6d8fb93d5f88a2ed"
dependencies = [
"cfg-if",
"libc",
]
[[package]]
name = "colorchoice"
version = "1.0.0"
@@ -2736,6 +2725,12 @@ dependencies = [
"winapi",
]
[[package]]
name = "libm"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
[[package]]
name = "linux-raw-sys"
version = "0.1.4"
@@ -2806,6 +2801,15 @@ version = "2.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
[[package]]
name = "memoffset"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4"
dependencies = [
"autocfg",
]
[[package]]
name = "memoffset"
version = "0.8.0"
@@ -2832,6 +2836,9 @@ dependencies = [
"libc",
"once_cell",
"prometheus",
"rand 0.8.5",
"rand_distr",
"twox-hash",
"workspace_hack",
]
@@ -2935,6 +2942,19 @@ dependencies = [
"libc",
]
[[package]]
name = "nix"
version = "0.26.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b"
dependencies = [
"bitflags 1.3.2",
"cfg-if",
"libc",
"memoffset 0.7.1",
"pin-utils",
]
[[package]]
name = "nix"
version = "0.27.1"
@@ -3057,6 +3077,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
dependencies = [
"autocfg",
"libm",
]
[[package]]
@@ -3387,7 +3408,6 @@ dependencies = [
"camino-tempfile",
"chrono",
"clap",
"close_fds",
"const_format",
"consumption_metrics",
"crc32c",
@@ -3463,6 +3483,7 @@ dependencies = [
"bincode",
"byteorder",
"bytes",
"chrono",
"const_format",
"enum-map",
"hex",
@@ -3882,6 +3903,7 @@ dependencies = [
"pin-project-lite",
"postgres-protocol",
"rand 0.8.5",
"smallvec",
"thiserror",
"tokio",
"tracing",
@@ -4071,6 +4093,8 @@ dependencies = [
"sync_wrapper",
"task-local-extensions",
"thiserror",
"tikv-jemalloc-ctl",
"tikv-jemallocator",
"tls-listener",
"tokio",
"tokio-postgres",
@@ -4171,6 +4195,16 @@ dependencies = [
"getrandom 0.2.11",
]
[[package]]
name = "rand_distr"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
dependencies = [
"num-traits",
"rand 0.8.5",
]
[[package]]
name = "rand_hc"
version = "0.2.0"
@@ -5511,6 +5545,37 @@ dependencies = [
"ordered-float 2.10.1",
]
[[package]]
name = "tikv-jemalloc-ctl"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "619bfed27d807b54f7f776b9430d4f8060e66ee138a28632ca898584d462c31c"
dependencies = [
"libc",
"paste",
"tikv-jemalloc-sys",
]
[[package]]
name = "tikv-jemalloc-sys"
version = "0.5.4+5.3.0-patched"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9402443cb8fd499b6f327e40565234ff34dbda27460c5b47db0db77443dd85d1"
dependencies = [
"cc",
"libc",
]
[[package]]
name = "tikv-jemallocator"
version = "0.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "965fe0c26be5c56c94e38ba547249074803efd52adfb66de62107d95aab3eaca"
dependencies = [
"libc",
"tikv-jemalloc-sys",
]
[[package]]
name = "time"
version = "0.3.21"
@@ -5610,9 +5675,10 @@ dependencies = [
[[package]]
name = "tokio-epoll-uring"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d"
dependencies = [
"futures",
"nix 0.26.4",
"once_cell",
"scopeguard",
"thiserror",
@@ -6134,7 +6200,7 @@ dependencies = [
[[package]]
name = "uring-common"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0dd3a2f8bf3239d34a19719ef1a74146c093126f"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d"
dependencies = [
"io-uring",
"libc",


@@ -64,7 +64,6 @@ camino = "1.1.6"
cfg-if = "1.0.0"
chrono = { version = "0.4", default-features = false, features = ["clock"] }
clap = { version = "4.0", features = ["derive"] }
close_fds = "0.3.2"
comfy-table = "6.1"
const_format = "0.2"
crc32c = "0.6"
@@ -149,6 +148,8 @@ tar = "0.4"
task-local-extensions = "0.1.4"
test-context = "0.1"
thiserror = "1.0"
tikv-jemallocator = "0.5"
tikv-jemalloc-ctl = "0.5"
tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
tokio = { version = "1.17", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
@@ -165,6 +166,7 @@ tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.20.0"
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
twox-hash = { version = "1.6.3", default-features = false }
url = "2.2"
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
walkdir = "2.3.2"


@@ -53,6 +53,7 @@ RUN set -e \
--bin pagectl \
--bin safekeeper \
--bin storage_broker \
--bin attachment_service \
--bin proxy \
--bin neon_local \
--locked --release \
@@ -80,6 +81,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/attachment_service /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin


@@ -241,9 +241,12 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
FROM build-deps AS vector-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
COPY patches/pgvector.patch /pgvector.patch
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.6.0.tar.gz -O pgvector.tar.gz && \
echo "b0cf4ba1ab016335ac8fb1cada0d2106235889a194fffeece217c5bda90b2f19 pgvector.tar.gz" | sha256sum --check && \
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
patch -p1 < /pgvector.patch && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
@@ -520,8 +523,7 @@ RUN apt-get update && \
libboost-regex1.74-dev \
libboost-serialization1.74-dev \
libboost-system1.74-dev \
libeigen3-dev \
libfreetype6-dev
libeigen3-dev
ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
@@ -547,6 +549,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
-D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
-D RDK_INSTALL_INTREE=OFF \
-D RDK_INSTALL_COMIC_FONTS=OFF \
-D RDK_BUILD_FREETYPE_SUPPORT=OFF \
-D CMAKE_BUILD_TYPE=Release \
. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
@@ -901,7 +904,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
# libxml2, libxslt1.1 for xml2
# libzstd1 for zstd
# libboost*, libfreetype6, and zlib1g for rdkit
# libboost* for rdkit
# ca-certificates for communicating with s3 by compute_ctl
RUN apt update && \
apt install --no-install-recommends -y \
@@ -914,7 +917,6 @@ RUN apt update && \
libboost-serialization1.74.0 \
libboost-system1.74.0 \
libossp-uuid16 \
libfreetype6 \
libgeos-c1v5 \
libgdal28 \
libproj19 \
@@ -926,7 +928,6 @@ RUN apt update && \
libcurl4-openssl-dev \
locales \
procps \
zlib1g \
ca-certificates && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8


@@ -51,6 +51,8 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
# Force cargo not to print progress bar
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
# Set PQ_LIB_DIR to make sure `attachment_service` get linked with bundled libpq (through diesel)
CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib
#
# Top level Makefile to build Neon and PostgreSQL
@@ -174,10 +176,10 @@ neon-pg-ext-clean-%:
# Build walproposer as a static library. walproposer source code is located
# in the pgxn/neon directory.
#
#
# We also need to include libpgport.a and libpgcommon.a, because walproposer
# uses some functions from those libraries.
#
#
# Some object files are removed from libpgport.a and libpgcommon.a because
# they depend on openssl and other libraries that are not included in our
# Rust build.


@@ -14,8 +14,8 @@ Alternatively, compile and run the project [locally](#running-local-installation
A Neon installation consists of compute nodes and the Neon storage engine. Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine.
The Neon storage engine consists of two major components:
- Pageserver. Scalable storage backend for the compute nodes.
- Safekeepers. The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage.
- Pageserver: Scalable storage backend for the compute nodes.
- Safekeepers: The safekeepers form a redundant WAL service that received WAL from the compute node, and stores it durably until it has been processed by the pageserver and uploaded to cloud storage.
See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more information.
@@ -81,9 +81,9 @@ The project uses [rust toolchain file](./rust-toolchain.toml) to define the vers
This file is automatically picked up by [`rustup`](https://rust-lang.github.io/rustup/overrides.html#the-toolchain-file) that installs (if absent) and uses the toolchain version pinned in the file.
rustup users who want to build with another toolchain can use [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory.
rustup users who want to build with another toolchain can use the [`rustup override`](https://rust-lang.github.io/rustup/overrides.html#directory-overrides) command to set a specific toolchain for the project's directory.
non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify their toolchain matches the version in the file.
non-rustup users most probably are not getting the same toolchain automatically from the file, so are responsible to manually verify that their toolchain matches the version in the file.
Newer rustc versions most probably will work fine, yet older ones might not be supported due to some new features used by the project or the crates.
#### Building on Linux
@@ -124,7 +124,7 @@ make -j`sysctl -n hw.logicalcpu` -s
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
To run the integration tests or Python scripts (not required to use the code), install
Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
#### Running neon database
@@ -166,7 +166,7 @@ Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55432/postgres'
2. Now, it is possible to connect to postgres and run some queries:
```text
> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres
> psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres
postgres=# CREATE TABLE t(key int primary key, value text);
CREATE TABLE
postgres=# insert into t values(1,1);
@@ -205,7 +205,7 @@ Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'
# this new postgres instance will have all the data from 'main' postgres,
# but all modifications would not affect data in original postgres
> psql -p55434 -h 127.0.0.1 -U cloud_admin postgres
> psql -p 55434 -h 127.0.0.1 -U cloud_admin postgres
postgres=# select * from t;
key | value
-----+-------
@@ -216,7 +216,7 @@ postgres=# insert into t values(2,2);
INSERT 0 1
# check that the new change doesn't affect the 'main' postgres
> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres
> psql -p 55432 -h 127.0.0.1 -U cloud_admin postgres
postgres=# select * from t;
key | value
-----+-------
@@ -224,7 +224,7 @@ postgres=# select * from t;
(1 row)
```
4. If you want to run tests afterward (see below), you must stop all the running of the pageserver, safekeeper, and postgres instances
4. If you want to run tests afterwards (see below), you must stop all the running pageserver, safekeeper, and postgres instances
you have just started. You can terminate them all with one command:
```sh
> cargo neon stop
@@ -243,7 +243,7 @@ CARGO_BUILD_FLAGS="--features=testing" make
```
By default, this runs both debug and release modes, and all supported postgres versions. When
testing locally, it is convenient to run just run one set of permutations, like this:
testing locally, it is convenient to run just one set of permutations, like this:
```sh
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest


@@ -319,7 +319,7 @@ impl ComputeNode {
// Get basebackup from the libpq connection to pageserver using `connstr` and
// unarchive it to `pgdata` directory overriding all its previous content.
#[instrument(skip_all, fields(%lsn))]
fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
let spec = compute_state.pspec.as_ref().expect("spec must be set");
let start_time = Instant::now();
@@ -390,6 +390,34 @@ impl ComputeNode {
Ok(())
}
// Gets the basebackup in a retry loop
#[instrument(skip_all, fields(%lsn))]
pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
let mut retry_period_ms = 500;
let mut attempts = 0;
let max_attempts = 5;
loop {
let result = self.try_get_basebackup(compute_state, lsn);
match result {
Ok(_) => {
return result;
}
Err(ref e) if attempts < max_attempts => {
warn!(
"Failed to get basebackup: {} (attempt {}/{})",
e, attempts, max_attempts
);
std::thread::sleep(std::time::Duration::from_millis(retry_period_ms));
retry_period_ms *= 2;
}
Err(_) => {
return result;
}
}
attempts += 1;
}
}
pub async fn check_safekeepers_synced_async(
&self,
compute_state: &ComputeState,


@@ -758,6 +758,14 @@ BEGIN
END LOOP;
END $$;
"#,
r#"
DO $$
BEGIN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
END IF;
END
$$;"#,
];
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";


@@ -21,10 +21,6 @@ tokio.workspace = true
tokio-util.workspace = true
tracing.workspace = true
# TODO: remove this after DB persistence is added, it is only used for
# a parsing function when loading pageservers from neon_local LocalEnv
postgres_backend.workspace = true
diesel = { version = "2.1.4", features = ["serde_json", "postgres"] }
utils = { path = "../../libs/utils/" }


@@ -2,13 +2,17 @@ use crate::reconciler::ReconcileError;
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest};
use pageserver_api::models::{
TenantCreateRequest, TenantLocationConfigRequest, TimelineCreateRequest,
};
use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use std::sync::Arc;
use std::time::{Duration, Instant};
use utils::auth::SwappableJwtAuth;
use utils::http::endpoint::{auth_middleware, request_span};
use utils::http::request::parse_request_param;
use utils::id::TenantId;
use utils::id::{TenantId, TimelineId};
use utils::{
http::{
@@ -112,6 +116,78 @@ async fn handle_tenant_create(
json_response(StatusCode::OK, service.tenant_create(create_req).await?)
}
// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
// we're done" semantic, we wrap with a retry loop to expose a simpler API upstream. This avoids
// needing to track a "deleting" state for tenants.
async fn deletion_wrapper<R, F>(service: Arc<Service>, f: F) -> Result<Response<Body>, ApiError>
where
R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
{
let started_at = Instant::now();
// To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
// completed.
let mut retry_period = Duration::from_secs(1);
// On subsequent retries, wait longer.
let max_retry_period = Duration::from_secs(5);
// Enable callers with a 30 second request timeout to reliably get a response
let max_wait = Duration::from_secs(25);
loop {
let status = f(service.clone()).await?;
match status {
StatusCode::ACCEPTED => {
tracing::info!("Deletion accepted, waiting to try again...");
tokio::time::sleep(retry_period).await;
retry_period = max_retry_period;
}
StatusCode::NOT_FOUND => {
tracing::info!("Deletion complete");
return json_response(StatusCode::OK, ());
}
_ => {
tracing::warn!("Unexpected status {status}");
return json_response(status, ());
}
}
let now = Instant::now();
if now + retry_period > started_at + max_wait {
tracing::info!("Deletion timed out waiting for 404");
// REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
// the pageserver's swagger definition for this endpoint, and has the same desired
// effect of causing the control plane to retry later.
return json_response(StatusCode::CONFLICT, ());
}
}
}
async fn handle_tenant_location_config(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
json_response(
StatusCode::OK,
service
.tenant_location_config(tenant_id, config_req)
.await?,
)
}
async fn handle_tenant_delete(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
deletion_wrapper(service, move |service| async move {
service.tenant_delete(tenant_id).await
})
.await
}
async fn handle_tenant_timeline_create(
service: Arc<Service>,
mut req: Request<Body>,
@@ -126,6 +202,63 @@ async fn handle_tenant_timeline_create(
)
}
async fn handle_tenant_timeline_delete(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
deletion_wrapper(service, move |service| async move {
service.tenant_timeline_delete(tenant_id, timeline_id).await
})
.await
}
async fn handle_tenant_timeline_passthrough(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let Some(path) = req.uri().path_and_query() else {
// This should never happen, our request router only calls us if there is a path
return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path")));
};
tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
// Find the node that holds shard zero
let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
// Callers will always pass an unsharded tenant ID. Before proxying, we must
// rewrite this to a shard-aware shard zero ID.
let path = format!("{}", path);
let tenant_str = tenant_id.to_string();
let tenant_shard_str = format!("{}", tenant_shard_id);
let path = path.replace(&tenant_str, &tenant_shard_str);
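// Worked example (hypothetical IDs, assuming TenantShardId displays as
// <tenant_id>-<shard_number><shard_count> in hex): for a four-shard tenant,
//   /v1/tenant/3b66cb1e.../timeline/...
// becomes
//   /v1/tenant/3b66cb1e...-0004/timeline/...
// before being proxied to the pageserver holding shard zero.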
let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
let resp = client.get_raw(path).await.map_err(|_e|
// FIXME: give ApiError a proper Unavailable variant. We return 503 here because
// if we can't successfully send a request to the pageserver, we aren't available.
ApiError::ShuttingDown)?;
// We have a reqwest::Response; we would like an http::Response
let mut builder = hyper::Response::builder()
.status(resp.status())
.version(resp.version());
for (k, v) in resp.headers() {
builder = builder.header(k, v);
}
let response = builder
.body(Body::wrap_stream(resp.bytes_stream()))
.map_err(|e| ApiError::InternalServerError(e.into()))?;
Ok(response)
}
async fn handle_tenant_locate(
service: Arc<Service>,
req: Request<Body>,
@@ -141,6 +274,11 @@ async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>,
json_response(StatusCode::OK, ())
}
async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let state = get_state(&req);
json_response(StatusCode::OK, state.service.node_list().await?)
}
async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let node_id: NodeId = parse_request_param(&req, "node_id")?;
let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
@@ -226,26 +364,64 @@ pub fn make_router(
router
.data(Arc::new(HttpState::new(service, auth)))
// Non-prefixed generic endpoints (status, metrics)
.get("/status", |r| request_span(r, handle_status))
.post("/re-attach", |r| request_span(r, handle_re_attach))
.post("/validate", |r| request_span(r, handle_validate))
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
.post("/inspect", |r| request_span(r, handle_inspect))
.post("/node", |r| request_span(r, handle_node_register))
.put("/node/:node_id/config", |r| {
// Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
.post("/upcall/v1/re-attach", |r| {
request_span(r, handle_re_attach)
})
.post("/upcall/v1/validate", |r| request_span(r, handle_validate))
// Test/dev/debug endpoints
.post("/debug/v1/attach-hook", |r| {
request_span(r, handle_attach_hook)
})
.post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
.get("/control/v1/tenant/:tenant_id/locate", |r| {
tenant_service_handler(r, handle_tenant_locate)
})
// Node operations
.post("/control/v1/node", |r| {
request_span(r, handle_node_register)
})
.get("/control/v1/node", |r| request_span(r, handle_node_list))
.put("/control/v1/node/:node_id/config", |r| {
request_span(r, handle_node_configure)
})
// Tenant Shard operations
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
tenant_service_handler(r, handle_tenant_shard_migrate)
})
// Tenant operations
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
.post("/v1/tenant", |r| {
tenant_service_handler(r, handle_tenant_create)
})
.delete("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(r, handle_tenant_delete)
})
.put("/v1/tenant/:tenant_id/location_config", |r| {
tenant_service_handler(r, handle_tenant_location_config)
})
// Tenant Shard operations (low level/maintenance)
.put("/tenant/:tenant_shard_id/migrate", |r| {
tenant_service_handler(r, handle_tenant_shard_migrate)
})
// Timeline operations
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
tenant_service_handler(r, handle_tenant_timeline_delete)
})
.post("/v1/tenant/:tenant_id/timeline", |r| {
tenant_service_handler(r, handle_tenant_timeline_create)
})
.get("/tenant/:tenant_id/locate", |r| {
tenant_service_handler(r, handle_tenant_locate)
// Tenant detail GET passthrough to shard zero
.get("/v1/tenant/:tenant_id*", |r| {
tenant_service_handler(r, handle_tenant_timeline_passthrough)
})
.put("/tenant/:tenant_shard_id/migrate", |r| {
tenant_service_handler(r, handle_tenant_shard_migrate)
// Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future
// timeline GET APIs will be implicitly included.
.get("/v1/tenant/:tenant_id/timeline*", |r| {
tenant_service_handler(r, handle_tenant_timeline_passthrough)
})
// Path aliases for tests_forward_compatibility
// TODO: remove these in future PR


@@ -39,7 +39,7 @@ struct Cli {
/// Path to the .json file to store state (will be created if it doesn't exist)
#[arg(short, long)]
path: Utf8PathBuf,
path: Option<Utf8PathBuf>,
/// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
#[arg(long)]
@@ -62,7 +62,7 @@ async fn main() -> anyhow::Result<()> {
GIT_VERSION,
launch_ts.to_string(),
BUILD_TAG,
args.path,
args.path.as_ref().unwrap_or(&Utf8PathBuf::from("<none>")),
args.listen
);
@@ -70,11 +70,7 @@ async fn main() -> anyhow::Result<()> {
jwt_token: args.jwt_token,
};
let json_path = if args.path.as_os_str().is_empty() {
None
} else {
Some(args.path)
};
let json_path = args.path;
let persistence = Arc::new(Persistence::new(args.database_url, json_path.clone()));
let service = Service::spawn(config, persistence.clone()).await?;


@@ -9,7 +9,6 @@ use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::models::TenantConfig;
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
use postgres_connection::parse_host_port;
use serde::{Deserialize, Serialize};
use utils::generation::Generation;
use utils::id::{NodeId, TenantId};
@@ -129,51 +128,11 @@ impl Persistence {
})
.await?;
if nodes.is_empty() {
return self.list_nodes_local_env().await;
}
tracing::info!("list_nodes: loaded {} nodes", nodes.len());
Ok(nodes)
}
/// Shim for automated compatibility tests: load nodes from LocalEnv instead of database
pub(crate) async fn list_nodes_local_env(&self) -> DatabaseResult<Vec<Node>> {
// Enable test_backward_compatibility to work by populating our list of
// nodes from LocalEnv when it is not present in persistent storage. Otherwise at
// first startup in the compat test, we may have shards but no nodes.
use control_plane::local_env::LocalEnv;
let env = LocalEnv::load_config().map_err(|e| DatabaseError::Logical(format!("{e}")))?;
tracing::info!(
"Loading {} pageserver nodes from LocalEnv",
env.pageservers.len()
);
let mut nodes = Vec::new();
for ps_conf in env.pageservers {
let (pg_host, pg_port) =
parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
.expect("Unable to parse listen_http_addr");
let node = Node {
id: ps_conf.id,
listen_pg_addr: pg_host.to_string(),
listen_pg_port: pg_port.unwrap_or(5432),
listen_http_addr: http_host.to_string(),
listen_http_port: http_port.unwrap_or(80),
availability: NodeAvailability::Active,
scheduling: NodeSchedulingPolicy::Active,
};
// Synchronize database with what we learn from LocalEnv
self.insert_node(&node).await?;
nodes.push(node);
}
Ok(nodes)
}
/// At startup, load the high level state for shards, such as their config + policy. This will
/// be enriched at runtime with state discovered on pageservers.
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {


@@ -21,6 +21,7 @@ use pageserver_api::{
models,
models::{
LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest,
TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation,
TimelineCreateRequest, TimelineInfo,
},
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
@@ -30,14 +31,14 @@ use utils::{
completion::Barrier,
generation::Generation,
http::error::ApiError,
id::{NodeId, TenantId},
id::{NodeId, TenantId, TimelineId},
seqwait::SeqWait,
};
use crate::{
compute_hook::ComputeHook,
node::Node,
persistence::{DatabaseError, Persistence, TenantShardPersistence},
persistence::{DatabaseError, NodePersistence, Persistence, TenantShardPersistence},
scheduler::Scheduler,
tenant_state::{
IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
@@ -635,7 +636,7 @@ impl Service {
shard_number: tenant_shard_id.shard_number.0 as i32,
shard_count: tenant_shard_id.shard_count.0 as i32,
shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
generation: 0,
generation: create_req.generation.map(|g| g as i32).unwrap_or(0),
generation_pageserver: i64::MAX,
placement_policy: serde_json::to_string(&placement_policy).unwrap(),
config: serde_json::to_string(&create_req.config).unwrap(),
@@ -677,6 +678,7 @@ impl Service {
})?;
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: entry
.get()
.intent
@@ -709,6 +711,7 @@ impl Service {
})?;
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: state
.intent
.attached
@@ -742,14 +745,257 @@ impl Service {
(waiters, response_shards)
};
let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
self.await_waiters(waiters).await?;
Ok(TenantCreateResponse {
shards: response_shards,
})
}
/// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded
/// wait for reconciliation to complete before responding.
async fn await_waiters(
&self,
waiters: Vec<ReconcilerWaiter>,
) -> Result<(), ReconcileWaitError> {
let deadline = Instant::now().checked_add(Duration::from_secs(30)).unwrap();
for waiter in waiters {
let timeout = deadline.duration_since(Instant::now());
waiter.wait_timeout(timeout).await?;
}
Ok(TenantCreateResponse {
shards: response_shards,
})
Ok(())
}
/// This API is used by the cloud control plane to do coarse-grained control of tenants:
/// - Call with mode Attached* to upsert the tenant.
/// - Call with mode Detached to switch to PolicyMode::Detached
///
/// In future, calling with mode Secondary may switch to a detach-lite mode in which a tenant only has
/// secondary locations.
pub(crate) async fn tenant_location_config(
&self,
tenant_id: TenantId,
req: TenantLocationConfigRequest,
) -> Result<TenantLocationConfigResponse, ApiError> {
if req.tenant_id.shard_count.0 > 1 {
return Err(ApiError::BadRequest(anyhow::anyhow!(
"This API is for importing single-sharded or unsharded tenants"
)));
}
let mut waiters = Vec::new();
let mut result = TenantLocationConfigResponse { shards: Vec::new() };
let maybe_create = {
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let pageservers = locked.nodes.clone();
let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes);
// Maybe we have existing shards
let mut create = true;
for (shard_id, shard) in locked
.tenants
.range_mut(TenantShardId::tenant_range(tenant_id))
{
// Saw an existing shard: this is not a creation
create = false;
// Note that for existing tenants we do _not_ respect the generation in the request: this is likely
// to be stale. Once a tenant is created in this service, our view of generation is authoritative, and
// callers' generations may be ignored. This represents a one-way migration of tenants from the outer
// cloud control plane into this service.
// Use location config mode as an indicator of policy: if they ask for
// attached we go to default HA attached mode. If they ask for secondary
// we go to secondary-only mode. If they ask for detached we detach.
match req.config.mode {
LocationConfigMode::Detached => {
shard.policy = PlacementPolicy::Detached;
}
LocationConfigMode::Secondary => {
// TODO: implement secondary-only mode.
todo!();
}
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
// TODO: persistence for changes in policy
if pageservers.len() > 1 {
shard.policy = PlacementPolicy::Double(1)
} else {
// Convenience for dev/test: if we just have one pageserver, import
// tenants into Single mode so that scheduling will succeed.
shard.policy = PlacementPolicy::Single
}
}
}
shard.schedule(&mut scheduler)?;
let maybe_waiter = shard.maybe_reconcile(
result_tx.clone(),
&pageservers,
&compute_hook,
&self.config,
&self.persistence,
);
if let Some(waiter) = maybe_waiter {
waiters.push(waiter);
}
if let Some(node_id) = shard.intent.attached {
result.shards.push(TenantShardLocation {
shard_id: *shard_id,
node_id,
})
}
}
if create {
// Validate request mode
match req.config.mode {
LocationConfigMode::Detached | LocationConfigMode::Secondary => {
// When using this API to onboard an existing tenant to this service, it must start in
// an attached state, because we need the request to come with a generation
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Imported tenant must be in attached mode"
)));
}
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
// Pass
}
}
// Validate request generation
let Some(generation) = req.config.generation else {
// We can only import attached tenants, because we need the request to come with a generation
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Generation is mandatory when importing tenant"
)));
};
// Synthesize a creation request
Some(TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: Some(generation),
shard_parameters: ShardParameters {
// Must preserve the incoming shard_count to distinguish unsharded (0)
// from single-sharded (1): this distinction appears in the S3 keys of the tenant.
count: req.tenant_id.shard_count,
// We only import unsharded or single-sharded tenants, so stripe
// size can be made up arbitrarily here.
stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
},
config: req.config.tenant_conf,
})
} else {
None
}
};
if let Some(create_req) = maybe_create {
let create_resp = self.tenant_create(create_req).await?;
result.shards = create_resp
.shards
.into_iter()
.map(|s| TenantShardLocation {
node_id: s.node_id,
shard_id: s.shard_id,
})
.collect();
} else {
// This was an update, wait for reconciliation
self.await_waiters(waiters).await?;
}
Ok(result)
}
pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result<StatusCode, ApiError> {
// TODO: refactor into helper
let targets = {
let locked = self.inner.read().unwrap();
let mut targets = Vec::new();
for (tenant_shard_id, shard) in
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
let node_id = shard.intent.attached.ok_or_else(|| {
ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
})?;
let node = locked
.nodes
.get(&node_id)
.expect("Pageservers may not be deleted while referenced");
targets.push((*tenant_shard_id, node.clone()));
}
targets
};
// TODO: error out if the tenant is not attached anywhere.
// Phase 1: delete on the pageservers
let mut any_pending = false;
for (tenant_shard_id, node) in targets {
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
// TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not
// surface immediately as an error to our caller.
let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| {
ApiError::InternalServerError(anyhow::anyhow!(
"Error deleting shard {tenant_shard_id} on node {}: {e}",
node.id
))
})?;
tracing::info!(
"Shard {tenant_shard_id} on node {}, delete returned {}",
node.id,
status
);
if status == StatusCode::ACCEPTED {
any_pending = true;
}
}
if any_pending {
// Caller should call us again later. When we eventually see 404s from
// all the shards, we may proceed to delete our records of the tenant.
tracing::info!(
"Tenant {} has some shards pending deletion, returning 202",
tenant_id
);
return Ok(StatusCode::ACCEPTED);
}
// Fall through: deletion of the tenant on pageservers is complete, we may proceed to drop
// our in-memory state and database state.
// Ordering: we delete persistent state first: if we then
// crash, we will drop the in-memory state.
// Drop persistent state.
self.persistence.delete_tenant(tenant_id).await?;
// Drop in-memory state
{
let mut locked = self.inner.write().unwrap();
locked
.tenants
.retain(|tenant_shard_id, _shard| tenant_shard_id.tenant_id != tenant_id);
tracing::info!(
"Deleted tenant {tenant_id}, now have {} tenants",
locked.tenants.len()
);
};
// Success is represented as 404, to imitate the existing pageserver deletion API
Ok(StatusCode::NOT_FOUND)
}
pub(crate) async fn tenant_timeline_create(
@@ -759,25 +1005,15 @@ impl Service {
) -> Result<TimelineInfo, ApiError> {
let mut timeline_info = None;
let ensure_waiters = {
let locked = self.inner.write().unwrap();
tracing::info!(
"Creating timeline {}/{}, have {} pageservers",
tenant_id,
create_req.new_timeline_id,
locked.nodes.len()
);
tracing::info!(
"Creating timeline {}/{}",
tenant_id,
create_req.new_timeline_id,
);
self.ensure_attached(locked, tenant_id)
.map_err(ApiError::InternalServerError)?
};
let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
for waiter in ensure_waiters {
let timeout = deadline.duration_since(Instant::now());
waiter.wait_timeout(timeout).await?;
}
self.ensure_attached_wait(tenant_id).await?;
// TODO: refuse to do this if shard splitting is in progress
let targets = {
let locked = self.inner.read().unwrap();
let mut targets = Vec::new();
@@ -848,6 +1084,111 @@ impl Service {
Ok(timeline_info.expect("targets cannot be empty"))
}
pub(crate) async fn tenant_timeline_delete(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<StatusCode, ApiError> {
tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,);
self.ensure_attached_wait(tenant_id).await?;
// TODO: refuse to do this if shard splitting is in progress
let targets = {
let locked = self.inner.read().unwrap();
let mut targets = Vec::new();
for (tenant_shard_id, shard) in
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
let node_id = shard.intent.attached.ok_or_else(|| {
ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
})?;
let node = locked
.nodes
.get(&node_id)
.expect("Pageservers may not be deleted while referenced");
targets.push((*tenant_shard_id, node.clone()));
}
targets
};
if targets.is_empty() {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant not found").into(),
));
}
// TODO: call into shards concurrently
let mut any_pending = false;
for (tenant_shard_id, node) in targets {
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
tracing::info!(
"Deleting timeline on shard {}/{}, attached to node {}",
tenant_shard_id,
timeline_id,
node.id
);
let status = client
.timeline_delete(tenant_shard_id, timeline_id)
.await
.map_err(|e| {
ApiError::InternalServerError(anyhow::anyhow!(
"Error deleting timeline {timeline_id} on {tenant_shard_id} on node {}: {e}",
node.id
))
})?;
if status == StatusCode::ACCEPTED {
any_pending = true;
}
}
if any_pending {
Ok(StatusCode::ACCEPTED)
} else {
Ok(StatusCode::NOT_FOUND)
}
}
/// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this
/// function looks it up and returns the url. If the tenant isn't found, returns Err(ApiError::NotFound)
pub(crate) fn tenant_shard0_baseurl(
&self,
tenant_id: TenantId,
) -> Result<(String, TenantShardId), ApiError> {
let locked = self.inner.read().unwrap();
let Some((tenant_shard_id, shard)) = locked
.tenants
.range(TenantShardId::tenant_range(tenant_id))
.next()
else {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant {tenant_id} not found").into(),
));
};
// TODO: should use the ID last published to compute_hook, rather than the intent: the intent might
// point to somewhere we haven't attached yet.
let Some(node_id) = shard.intent.attached else {
return Err(ApiError::Conflict(
"Cannot call timeline API on non-attached tenant".to_string(),
));
};
let Some(node) = locked.nodes.get(&node_id) else {
// This should never happen
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Shard refers to nonexistent node"
)));
};
Ok((node.base_url(), *tenant_shard_id))
}
pub(crate) fn tenant_locate(
&self,
tenant_id: TenantId,
@@ -993,6 +1334,20 @@ impl Service {
Ok(TenantShardMigrateResponse {})
}
pub(crate) async fn node_list(&self) -> Result<Vec<NodePersistence>, ApiError> {
// It is convenient to fetch from storage instead of reading in-memory state: this
// avoids taking the big lock and converting Node to a serializable structure.
let nodes = self
.persistence
.list_nodes()
.await?
.into_iter()
.map(|n| n.to_persistent())
.collect();
Ok(nodes)
}
pub(crate) async fn node_register(
&self,
register_req: NodeRegisterRequest,
@@ -1166,7 +1521,7 @@ impl Service {
/// Helper for methods that will try and call pageserver APIs for
/// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
/// is attached somewhere.
fn ensure_attached(
fn ensure_attached_schedule(
&self,
mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
tenant_id: TenantId,
@@ -1196,6 +1551,23 @@ impl Service {
Ok(waiters)
}
async fn ensure_attached_wait(&self, tenant_id: TenantId) -> Result<(), ApiError> {
let ensure_waiters = {
let locked = self.inner.write().unwrap();
self.ensure_attached_schedule(locked, tenant_id)
.map_err(ApiError::InternalServerError)?
};
let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
for waiter in ensure_waiters {
let timeout = deadline.duration_since(Instant::now());
waiter.wait_timeout(timeout).await?;
}
Ok(())
}
/// Check all tenants for pending reconciliation work, and reconcile those in need
///
/// Returns how many reconciliation tasks were started


@@ -17,6 +17,7 @@ use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::{env, str::FromStr};
use tokio::process::Command;
use tracing::instrument;
use url::Url;
use utils::{
auth::{Claims, Scope},
id::{NodeId, TenantId},
@@ -59,6 +60,7 @@ pub struct InspectResponse {
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponseShard {
pub shard_id: TenantShardId,
pub node_id: NodeId,
pub generation: u32,
}
@@ -523,13 +525,15 @@ impl AttachmentService {
RQ: Serialize + Sized,
RS: DeserializeOwned + Sized,
{
let url = self
.env
.control_plane_api
.clone()
.unwrap()
.join(&path)
.unwrap();
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
// for general purpose API access.
let listen_url = self.env.control_plane_api.clone().unwrap();
let url = Url::from_str(&format!(
"http://{}:{}/{path}",
listen_url.host_str().unwrap(),
listen_url.port().unwrap()
))
.unwrap();
let mut builder = self.client.request(method, url);
if let Some(body) = body {
@@ -566,7 +570,7 @@ impl AttachmentService {
let response = self
.dispatch::<_, AttachHookResponse>(
Method::POST,
"attach-hook".to_string(),
"debug/v1/attach-hook".to_string(),
Some(request),
)
.await?;
@@ -582,7 +586,11 @@ impl AttachmentService {
let request = InspectRequest { tenant_shard_id };
let response = self
.dispatch::<_, InspectResponse>(Method::POST, "inspect".to_string(), Some(request))
.dispatch::<_, InspectResponse>(
Method::POST,
"debug/v1/inspect".to_string(),
Some(request),
)
.await?;
Ok(response.attachment)
@@ -599,8 +607,12 @@ impl AttachmentService {
#[instrument(skip(self))]
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
self.dispatch::<(), _>(Method::GET, format!("tenant/{tenant_id}/locate"), None)
.await
self.dispatch::<(), _>(
Method::GET,
format!("control/v1/tenant/{tenant_id}/locate"),
None,
)
.await
}
#[instrument(skip(self))]
@@ -622,7 +634,7 @@ impl AttachmentService {
#[instrument(skip_all, fields(node_id=%req.node_id))]
pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
self.dispatch::<_, ()>(Method::POST, "node".to_string(), Some(req))
self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))
.await
}
@@ -630,7 +642,7 @@ impl AttachmentService {
pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> {
self.dispatch::<_, ()>(
Method::PUT,
format!("node/{}/config", req.node_id),
format!("control/v1/node/{}/config", req.node_id),
Some(req),
)
.await


@@ -256,7 +256,9 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
for env_key in [
"AWS_ACCESS_KEY_ID",
"AWS_SECRET_ACCESS_KEY",
"AWS_SESSION_TOKEN",
"AWS_PROFILE",
// HOME is needed in combination with `AWS_PROFILE` to pick up the SSO sessions.
"HOME",
"AZURE_STORAGE_ACCOUNT",
"AZURE_STORAGE_ACCESS_KEY",
] {


@@ -51,7 +51,7 @@ project_git_version!(GIT_VERSION);
const DEFAULT_PG_VERSION: &str = "15";
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
fn default_conf(num_pageservers: u16) -> String {
let mut template = format!(


@@ -395,6 +395,11 @@ impl PageServerNode {
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
lazy_slru_download: settings
.remove("lazy_slru_download")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'lazy_slru_download' as bool")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -495,6 +500,11 @@ impl PageServerNode {
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
lazy_slru_download: settings
.remove("lazy_slru_download")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'lazy_slru_download' as bool")?,
}
};


@@ -9,5 +9,10 @@ prometheus.workspace = true
libc.workspace = true
once_cell.workspace = true
chrono.workspace = true
twox-hash.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
rand = "0.8"
rand_distr = "0.4.3"

libs/metrics/src/hll.rs

@@ -0,0 +1,523 @@
//! HyperLogLog is an algorithm for the count-distinct problem,
//! approximating the number of distinct elements in a multiset.
//! Calculating the exact cardinality of the distinct elements
//! of a multiset requires an amount of memory proportional to
//! the cardinality, which is impractical for very large data sets.
//! Probabilistic cardinality estimators, such as the HyperLogLog algorithm,
//! use significantly less memory than this, but can only approximate the cardinality.
use std::{
collections::HashMap,
hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
sync::{atomic::AtomicU8, Arc, RwLock},
};
use prometheus::{
core::{self, Describer},
proto, Opts,
};
use twox_hash::xxh3;
/// Create a [`HyperLogLogVec`] and register it with the default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_hll_vec {
($N:literal, $OPTS:expr, $LABELS_NAMES:expr $(,)?) => {{
let hll_vec = $crate::HyperLogLogVec::<$N>::new($OPTS, $LABELS_NAMES).unwrap();
$crate::register(Box::new(hll_vec.clone())).map(|_| hll_vec)
}};
($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{
$crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
}};
}
/// Create a [`HyperLogLog`] and register it with the default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_hll {
($N:literal, $OPTS:expr $(,)?) => {{
let hll = $crate::HyperLogLog::<$N>::with_opts($OPTS).unwrap();
$crate::register(Box::new(hll.clone())).map(|_| hll)
}};
($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
$crate::register_hll!($N, $crate::opts!($NAME, $HELP))
}};
}
/// HLL is a probabilistic cardinality measure.
///
/// How to use this time-series for a metric name `my_metrics_total_hll`:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// If you want an estimate over time, you can use the following query:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
/// ) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// In the case of low cardinality, you might want to use the linear counting approximation:
///
/// ```promql
/// # LinearCounting(m, V) = m log (m / V)
/// shards_count * ln(shards_count /
/// # calculate V = how many shards contain a 0
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
/// )
/// ```
///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
#[derive(Clone)]
pub struct HyperLogLogVec<const N: usize> {
core: Arc<HyperLogLogVecCore<N>>,
}
struct HyperLogLogVecCore<const N: usize> {
pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
pub desc: core::Desc,
pub opts: Opts,
}
impl<const N: usize> core::Collector for HyperLogLogVec<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
}
fn collect(&self) -> Vec<proto::MetricFamily> {
let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
for child in self.core.children.read().unwrap().values() {
child.core.collect_into(&mut metrics);
}
m.set_metric(metrics);
vec![m]
}
}
impl<const N: usize> HyperLogLogVec<N> {
/// Create a new [`HyperLogLogVec`] based on the provided
/// [`Opts`] and partitioned by the given label names. At least one label name must be
/// provided.
pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
let opts = opts.variable_labels(variable_names);
let desc = opts.describe()?;
let v = HyperLogLogVecCore {
children: RwLock::new(HashMap::default()),
desc,
opts,
};
Ok(Self { core: Arc::new(v) })
}
/// `get_metric_with_label_values` returns the [`HyperLogLog<N>`] for the given slice
/// of label values (same order as the VariableLabels in Desc). If that combination of
/// label values is accessed for the first time, a new [`HyperLogLog<N>`] is created.
///
/// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc.
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
self.core.get_metric_with_label_values(vals)
}
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
/// occurs.
pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
self.get_metric_with_label_values(vals).unwrap()
}
}
impl<const N: usize> HyperLogLogVecCore<N> {
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
let h = self.hash_label_values(vals)?;
if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
return Ok(metric);
}
self.get_or_create_metric(h, vals)
}
pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
if vals.len() != self.desc.variable_labels.len() {
return Err(prometheus::Error::InconsistentCardinality {
expect: self.desc.variable_labels.len(),
got: vals.len(),
});
}
let mut h = xxh3::Hash64::default();
for val in vals {
h.write(val.as_bytes());
}
Ok(h.finish())
}
fn get_or_create_metric(
&self,
hash: u64,
label_values: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
let mut children = self.children.write().unwrap();
// Check exist first.
if let Some(metric) = children.get(&hash).cloned() {
return Ok(metric);
}
let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
children.insert(hash, metric.clone());
Ok(metric)
}
}
/// HLL is a probabilistic cardinality measure.
///
/// How to use this time-series for a metric name `my_metrics_total_hll`:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// If you want an estimate over time, you can use the following query:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
/// ) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// In the case of low cardinality, you might want to use the linear counting approximation:
///
/// ```promql
/// # LinearCounting(m, V) = m log (m / V)
/// shards_count * ln(shards_count /
/// # calculate V = how many shards contain a 0
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
/// )
/// ```
///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
#[derive(Clone)]
pub struct HyperLogLog<const N: usize> {
core: Arc<HyperLogLogCore<N>>,
}
impl<const N: usize> HyperLogLog<N> {
/// Create a [`HyperLogLog`] with the `name` and `help` arguments.
pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let opts = Opts::new(name, help);
Self::with_opts(opts)
}
/// Create a [`HyperLogLog`] with the `opts` options.
pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
Self::with_opts_and_label_values(&opts, &[])
}
fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
let desc = opts.describe()?;
let labels = make_label_pairs(&desc, label_values)?;
let v = HyperLogLogCore {
shards: [0; N].map(AtomicU8::new),
desc,
labels,
};
Ok(Self { core: Arc::new(v) })
}
pub fn measure(&self, item: &impl Hash) {
// changing the hasher will break compatibility with previous measurements.
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
}
fn record(&self, hash: u64) {
let p = N.ilog2() as u8;
let j = hash & (N as u64 - 1);
let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
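// Worked example for N = 32 (p = 5): the low 5 bits of the hash select the
// shard j; `hash >> p` leaves the top p bits zero, so `leading_zeros() + 1 - p`
// is the 1-indexed position of the first set bit among the remaining 59 bits,
// i.e. the classic HLL rank rho.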
self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
}
}
struct HyperLogLogCore<const N: usize> {
shards: [AtomicU8; N],
desc: core::Desc,
labels: Vec<proto::LabelPair>,
}
impl<const N: usize> core::Collector for HyperLogLog<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
}
fn collect(&self) -> Vec<proto::MetricFamily> {
let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
self.core.collect_into(&mut metrics);
m.set_metric(metrics);
vec![m]
}
}
impl<const N: usize> HyperLogLogCore<N> {
fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
self.shards.iter().enumerate().for_each(|(i, x)| {
let mut shard_label = proto::LabelPair::default();
shard_label.set_name("hll_shard".to_owned());
shard_label.set_value(format!("{i}"));
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in Prometheus.
// This seems like it would be a race condition,
// but HLL is not impacted by a write in one shard happening in between:
// in PromQL we implement a harmonic mean of all buckets,
// and we also merge samples in a time series using `max by (hll_shard)`.
// TODO: maybe we shouldn't reset this on every collect but only after a time window;
// that would mean that a dev port-forwarding the metrics URL won't break the sampling.
let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
let mut m = proto::Metric::default();
let mut c = proto::Gauge::default();
c.set_value(v as f64);
m.set_gauge(c);
let mut labels = Vec::with_capacity(self.labels.len() + 1);
labels.extend_from_slice(&self.labels);
labels.push(shard_label);
m.set_label(labels);
metrics.push(m);
})
}
}
fn make_label_pairs(
desc: &core::Desc,
label_values: &[&str],
) -> prometheus::Result<Vec<proto::LabelPair>> {
if desc.variable_labels.len() != label_values.len() {
return Err(prometheus::Error::InconsistentCardinality {
expect: desc.variable_labels.len(),
got: label_values.len(),
});
}
let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
if total_len == 0 {
return Ok(vec![]);
}
if desc.variable_labels.is_empty() {
return Ok(desc.const_label_pairs.clone());
}
let mut label_pairs = Vec::with_capacity(total_len);
for (i, n) in desc.variable_labels.iter().enumerate() {
let mut label_pair = proto::LabelPair::default();
label_pair.set_name(n.clone());
label_pair.set_value(label_values[i].to_owned());
label_pairs.push(label_pair);
}
for label_pair in &desc.const_label_pairs {
label_pairs.push(label_pair.clone());
}
label_pairs.sort();
Ok(label_pairs)
}
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use prometheus::{proto, Opts};
use rand::{rngs::StdRng, Rng, SeedableRng};
use rand_distr::{Distribution, Zipf};
use crate::HyperLogLogVec;
fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
let mut metrics = vec![];
hll.core
.children
.read()
.unwrap()
.values()
.for_each(|c| c.core.collect_into(&mut metrics));
metrics
}
fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
let mut buckets = [0.0; 32];
for metric in metrics.chunks_exact(32) {
if filter(&metric[0]) {
for (i, m) in metric.iter().enumerate() {
buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
}
}
}
buckets
.into_iter()
.map(|f| 2.0f64.powf(-f))
.sum::<f64>()
.recip()
* 0.697
* 32.0
* 32.0
}
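// For reference, `get_cardinality` above computes the raw HyperLogLog estimate
//     E = alpha_m * m^2 / (sum_j 2^(-M_j)),  with m = 32 and alpha_32 ~= 0.697,
// where M_j is the largest rank seen by shard j; the tests below compare this
// estimate against exact HashSet cardinalities on Zipf-distributed samples.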
fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
let mut set_a = HashSet::new();
let mut set_b = HashSet::new();
for x in iter.by_ref().take(n) {
set_a.insert(x.to_bits());
hll.with_label_values(&["a"]).measure(&x.to_bits());
}
for x in iter.by_ref().take(n) {
set_b.insert(x.to_bits());
hll.with_label_values(&["b"]).measure(&x.to_bits());
}
let merge = &set_a | &set_b;
let metrics = collect(&hll);
let len = get_cardinality(&metrics, |_| true);
let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
}
#[test]
fn test_cardinality_small() {
let (actual, estimate) = test_cardinality(100, Zipf::new(100, 1.2f64).unwrap());
assert_eq!(actual, [46, 30, 32]);
assert!(51.3 < estimate[0] && estimate[0] < 51.4);
assert!(44.0 < estimate[1] && estimate[1] < 44.1);
assert!(39.0 < estimate[2] && estimate[2] < 39.1);
}
#[test]
fn test_cardinality_medium() {
let (actual, estimate) = test_cardinality(10000, Zipf::new(10000, 1.2f64).unwrap());
assert_eq!(actual, [2529, 1618, 1629]);
assert!(2309.1 < estimate[0] && estimate[0] < 2309.2);
assert!(1566.6 < estimate[1] && estimate[1] < 1566.7);
assert!(1629.5 < estimate[2] && estimate[2] < 1629.6);
}
#[test]
fn test_cardinality_large() {
let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(1_000_000, 1.2f64).unwrap());
assert_eq!(actual, [129077, 79579, 79630]);
assert!(126067.2 < estimate[0] && estimate[0] < 126067.3);
assert!(83076.8 < estimate[1] && estimate[1] < 83076.9);
assert!(64251.2 < estimate[2] && estimate[2] < 64251.3);
}
#[test]
fn test_cardinality_small2() {
let (actual, estimate) = test_cardinality(100, Zipf::new(200, 0.8f64).unwrap());
assert_eq!(actual, [92, 58, 60]);
assert!(116.1 < estimate[0] && estimate[0] < 116.2);
assert!(81.7 < estimate[1] && estimate[1] < 81.8);
assert!(69.3 < estimate[2] && estimate[2] < 69.4);
}
#[test]
fn test_cardinality_medium2() {
let (actual, estimate) = test_cardinality(10000, Zipf::new(20000, 0.8f64).unwrap());
assert_eq!(actual, [8201, 5131, 5051]);
assert!(6846.4 < estimate[0] && estimate[0] < 6846.5);
assert!(5239.1 < estimate[1] && estimate[1] < 5239.2);
assert!(4292.8 < estimate[2] && estimate[2] < 4292.9);
}
#[test]
fn test_cardinality_large2() {
let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(2_000_000, 0.8f64).unwrap());
assert_eq!(actual, [777847, 482069, 482246]);
assert!(699437.4 < estimate[0] && estimate[0] < 699437.5);
assert!(374948.9 < estimate[1] && estimate[1] < 374949.0);
assert!(434609.7 < estimate[2] && estimate[2] < 434609.8);
}
}
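A minimal usage sketch for the new collector (a sketch only: it assumes the crate is imported as `metrics`, which re-exports `HyperLogLogVec` as shown in the `lib.rs` hunk below, and the metric name is made up):

```rust
use metrics::HyperLogLogVec;
use prometheus::Opts;

fn main() -> prometheus::Result<()> {
    // 32 shards: the standard HLL relative error is about 1.04 / sqrt(32), i.e. ~18%.
    let endpoints = HyperLogLogVec::<32>::new(
        Opts::new("my_endpoints_total_hll", "distinct endpoints seen"),
        &["protocol"],
    )?;

    // Duplicate measurements of the same item do not inflate the estimate.
    for id in ["ep-1", "ep-2", "ep-1"] {
        endpoints.with_label_values(&["tcp"]).measure(&id);
    }

    // On scrape, each of the 32 `hll_shard` gauges reports its register and
    // resets it; the PromQL in the doc comment above merges them into a count.
    Ok(())
}
```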


@@ -28,7 +28,9 @@ use prometheus::{Registry, Result};
pub mod launch_timestamp;
mod wrappers;
pub use wrappers::{CountedReader, CountedWriter};
mod hll;
pub mod metric_vec_duration;
pub use hll::{HyperLogLog, HyperLogLogVec};
pub type UIntGauge = GenericGauge<AtomicU64>;
pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;


@@ -20,6 +20,7 @@ strum_macros.workspace = true
hex.workspace = true
thiserror.workspace = true
humantime-serde.workspace = true
chrono.workspace = true
workspace_hack.workspace = true


@@ -63,16 +63,84 @@ impl KeySpace {
KeyPartitioning { parts }
}
/// Update the keyspace such that it doesn't contain any range
/// that is overlapping with `other`. This can involve splitting or
/// removing existing ranges.
pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
let (self_start, self_end) = match (self.start(), self.end()) {
(Some(start), Some(end)) => (start, end),
_ => {
// self is empty
return;
}
};
// Key spaces are sorted by definition, so skip ahead to the first
// potentially intersecting range. Similarly, ignore ranges that start
// after the current keyspace ends.
let other_ranges = other
.ranges
.iter()
.skip_while(|range| self_start >= range.end)
.take_while(|range| self_end > range.start);
for range in other_ranges {
while let Some(overlap_at) = self.overlaps_at(range) {
let overlapped = self.ranges[overlap_at].clone();
if overlapped.start < range.start && overlapped.end <= range.end {
// Higher part of the range is completely overlapped.
self.ranges[overlap_at].end = range.start;
}
if overlapped.start >= range.start && overlapped.end > range.end {
// Lower part of the range is completely overlapped.
self.ranges[overlap_at].start = range.end;
}
if overlapped.start < range.start && overlapped.end > range.end {
// Middle part of the range is overlapped.
self.ranges[overlap_at].end = range.start;
self.ranges
.insert(overlap_at + 1, range.end..overlapped.end);
}
if overlapped.start >= range.start && overlapped.end <= range.end {
// Whole range is overlapped
self.ranges.remove(overlap_at);
}
}
}
}
pub fn start(&self) -> Option<Key> {
self.ranges.first().map(|range| range.start)
}
pub fn end(&self) -> Option<Key> {
self.ranges.last().map(|range| range.end)
}
#[allow(unused)]
pub fn total_size(&self) -> usize {
self.ranges
.iter()
.map(|range| key_range_size(range) as usize)
.sum()
}
fn overlaps_at(&self, range: &Range<Key>) -> Option<usize> {
match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
Ok(0) => None,
Err(0) => None,
Ok(index) if self.ranges[index - 1].end > range.start => Some(index - 1),
Err(index) if self.ranges[index - 1].end > range.start => Some(index - 1),
_ => None,
}
}
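// Worked example: with ranges [1..4, 5..8, 10..12] and a query of 3..6, the
// binary search over starts [1, 5, 10] for key 6 yields Err(2); ranges[1] =
// 5..8 has end 8 > 3, so we return Some(1). This finds only the last-starting
// overlap; `remove_overlapping_with` keeps looping until None to handle
// queries spanning several ranges.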
///
/// Check if the key space contains a range overlapping with `range`
///
pub fn overlaps(&self, range: &Range<Key>) -> bool {
match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
Ok(0) => false,
Err(0) => false,
Ok(index) => self.ranges[index - 1].end > range.start,
Err(index) => self.ranges[index - 1].end > range.start,
}
self.overlaps_at(range).is_some()
}
}
@@ -441,4 +509,118 @@ mod tests {
// xxxxxxxxxxx
assert!(ks.overlaps(&kr(0..30))); // XXXXX This fails currently!
}
#[test]
fn test_remove_full_overlaps() {
let mut key_space1 = KeySpace {
ranges: vec![
Key::from_i128(1)..Key::from_i128(4),
Key::from_i128(5)..Key::from_i128(8),
Key::from_i128(10)..Key::from_i128(12),
],
};
let key_space2 = KeySpace {
ranges: vec![
Key::from_i128(2)..Key::from_i128(3),
Key::from_i128(6)..Key::from_i128(7),
Key::from_i128(11)..Key::from_i128(13),
],
};
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
Key::from_i128(1)..Key::from_i128(2),
Key::from_i128(3)..Key::from_i128(4),
Key::from_i128(5)..Key::from_i128(6),
Key::from_i128(7)..Key::from_i128(8),
Key::from_i128(10)..Key::from_i128(11)
]
);
}
#[test]
fn test_remove_partial_overlaps() {
// Test partial overlaps
let mut key_space1 = KeySpace {
ranges: vec![
Key::from_i128(1)..Key::from_i128(5),
Key::from_i128(7)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
],
};
let key_space2 = KeySpace {
ranges: vec![
Key::from_i128(3)..Key::from_i128(6),
Key::from_i128(8)..Key::from_i128(11),
Key::from_i128(14)..Key::from_i128(17),
],
};
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
Key::from_i128(1)..Key::from_i128(3),
Key::from_i128(7)..Key::from_i128(8),
Key::from_i128(12)..Key::from_i128(14),
]
);
}
#[test]
fn test_remove_no_overlaps() {
let mut key_space1 = KeySpace {
ranges: vec![
Key::from_i128(1)..Key::from_i128(5),
Key::from_i128(7)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
],
};
let key_space2 = KeySpace {
ranges: vec![
Key::from_i128(6)..Key::from_i128(7),
Key::from_i128(11)..Key::from_i128(12),
Key::from_i128(15)..Key::from_i128(17),
],
};
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
Key::from_i128(1)..Key::from_i128(5),
Key::from_i128(7)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
]
);
}
#[test]
fn test_remove_one_range_overlaps_multiple() {
let mut key_space1 = KeySpace {
ranges: vec![
Key::from_i128(1)..Key::from_i128(3),
Key::from_i128(3)..Key::from_i128(6),
Key::from_i128(6)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
Key::from_i128(17)..Key::from_i128(20),
Key::from_i128(20)..Key::from_i128(30),
Key::from_i128(30)..Key::from_i128(40),
],
};
let key_space2 = KeySpace {
ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
};
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
Key::from_i128(1)..Key::from_i128(3),
Key::from_i128(3)..Key::from_i128(6),
Key::from_i128(6)..Key::from_i128(9),
Key::from_i128(19)..Key::from_i128(20),
Key::from_i128(20)..Key::from_i128(30),
Key::from_i128(30)..Key::from_i128(40),
]
);
}
}


@@ -8,6 +8,7 @@ use std::{
};
use byteorder::{BigEndian, ReadBytesExt};
use postgres_ffi::BLCKSZ;
use serde::{Deserialize, Serialize};
use serde_with::serde_as;
use strum_macros;
@@ -271,6 +272,7 @@ pub struct TenantConfig {
pub evictions_low_residence_duration_metric_threshold: Option<String>,
pub gc_feedback: Option<bool>,
pub heatmap_period: Option<String>,
pub lazy_slru_download: Option<bool>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -364,6 +366,19 @@ pub struct TenantLocationConfigRequest {
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantShardLocation {
pub shard_id: TenantShardId,
pub node_id: NodeId,
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigResponse {
pub shards: Vec<TenantShardLocation>,
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantConfigRequest {
@@ -439,6 +454,8 @@ pub struct TenantDetails {
#[serde(flatten)]
pub tenant_info: TenantInfo,
pub walredo: Option<WalRedoManagerStatus>,
pub timelines: Vec<TimelineId>,
}
@@ -626,6 +643,12 @@ pub struct TimelineGcRequest {
pub gc_horizon: Option<u64>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerStatus {
pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
pub pid: Option<u32>,
}
// Wrapped in libpq CopyData
#[derive(PartialEq, Eq, Debug)]
pub enum PagestreamFeMessage {
@@ -633,6 +656,7 @@ pub enum PagestreamFeMessage {
Nblocks(PagestreamNblocksRequest),
GetPage(PagestreamGetPageRequest),
DbSize(PagestreamDbSizeRequest),
GetSlruSegment(PagestreamGetSlruSegmentRequest),
}
// Wrapped in libpq CopyData
@@ -643,6 +667,7 @@ pub enum PagestreamBeMessage {
GetPage(PagestreamGetPageResponse),
Error(PagestreamErrorResponse),
DbSize(PagestreamDbSizeResponse),
GetSlruSegment(PagestreamGetSlruSegmentResponse),
}
// Keep in sync with `pagestore_client.h`
@@ -653,6 +678,7 @@ enum PagestreamBeMessageTag {
GetPage = 102,
Error = 103,
DbSize = 104,
GetSlruSegment = 105,
}
impl TryFrom<u8> for PagestreamBeMessageTag {
type Error = u8;
@@ -663,6 +689,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
102 => Ok(PagestreamBeMessageTag::GetPage),
103 => Ok(PagestreamBeMessageTag::Error),
104 => Ok(PagestreamBeMessageTag::DbSize),
105 => Ok(PagestreamBeMessageTag::GetSlruSegment),
_ => Err(value),
}
}
@@ -697,6 +724,14 @@ pub struct PagestreamDbSizeRequest {
pub dbnode: u32,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetSlruSegmentRequest {
pub latest: bool,
pub lsn: Lsn,
pub kind: u8,
pub segno: u32,
}
#[derive(Debug)]
pub struct PagestreamExistsResponse {
pub exists: bool,
@@ -712,6 +747,11 @@ pub struct PagestreamGetPageResponse {
pub page: Bytes,
}
#[derive(Debug)]
pub struct PagestreamGetSlruSegmentResponse {
pub segment: Bytes,
}
#[derive(Debug)]
pub struct PagestreamErrorResponse {
pub message: String,
@@ -775,6 +815,14 @@ impl PagestreamFeMessage {
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.dbnode);
}
Self::GetSlruSegment(req) => {
bytes.put_u8(4);
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u8(req.kind);
bytes.put_u32(req.segno);
}
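// Wire layout of the request serialized above (inside the libpq CopyData
// payload; multi-byte fields are big-endian):
//   u8  tag    = 4 (GetSlruSegment)
//   u8  latest (0 or 1)
//   u64 lsn
//   u8  kind   (SlruKind as repr(u8): 0 = Clog, 1 = MultiXactMembers, 2 = MultiXactOffsets)
//   u32 segno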
}
bytes.into()
@@ -825,6 +873,14 @@ impl PagestreamFeMessage {
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
dbnode: body.read_u32::<BigEndian>()?,
})),
4 => Ok(PagestreamFeMessage::GetSlruSegment(
PagestreamGetSlruSegmentRequest {
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
kind: body.read_u8()?,
segno: body.read_u32::<BigEndian>()?,
},
)),
_ => bail!("unknown smgr message tag: {:?}", msg_tag),
}
}
@@ -860,6 +916,12 @@ impl PagestreamBeMessage {
bytes.put_u8(Tag::DbSize as u8);
bytes.put_i64(resp.db_size);
}
Self::GetSlruSegment(resp) => {
bytes.put_u8(Tag::GetSlruSegment as u8);
bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32);
bytes.put(&resp.segment[..]);
}
}
bytes.into()
@@ -900,6 +962,14 @@ impl PagestreamBeMessage {
let db_size = buf.read_i64::<BigEndian>()?;
Self::DbSize(PagestreamDbSizeResponse { db_size })
}
Tag::GetSlruSegment => {
let n_blocks = buf.read_u32::<BigEndian>()?;
let mut segment = vec![0; n_blocks as usize * BLCKSZ as usize];
buf.read_exact(&mut segment)?;
Self::GetSlruSegment(PagestreamGetSlruSegmentResponse {
segment: segment.into(),
})
}
};
let remaining = buf.into_inner();
if !remaining.is_empty() {
@@ -918,6 +988,7 @@ impl PagestreamBeMessage {
Self::GetPage(_) => "GetPage",
Self::Error(_) => "Error",
Self::DbSize(_) => "DbSize",
Self::GetSlruSegment(_) => "GetSlruSegment",
}
}
}


@@ -123,9 +123,11 @@ impl RelTag {
PartialOrd,
Ord,
strum_macros::EnumIter,
strum_macros::FromRepr,
)]
#[repr(u8)]
pub enum SlruKind {
Clog,
Clog = 0,
MultiXactMembers,
MultiXactOffsets,
}


@@ -207,10 +207,16 @@ pub fn find_end_of_wal(
let seg_offs = curr_lsn.segment_offset(wal_seg_size);
segment.seek(SeekFrom::Start(seg_offs as u64))?;
// loop inside segment
loop {
while curr_lsn.segment_number(wal_seg_size) == segno {
let bytes_read = segment.read(&mut buf)?;
if bytes_read == 0 {
break; // EOF
debug!(
"find_end_of_wal reached end at {:?}, EOF in segment {:?} at offset {}",
result,
seg_file_path,
curr_lsn.segment_offset(wal_seg_size)
);
return Ok(result);
}
curr_lsn += bytes_read as u64;
decoder.feed_bytes(&buf[0..bytes_read]);


@@ -10,6 +10,7 @@ byteorder.workspace = true
pin-project-lite.workspace = true
postgres-protocol.workspace = true
rand.workspace = true
smallvec.workspace = true
tokio.workspace = true
tracing.workspace = true
thiserror.workspace = true


@@ -7,7 +7,8 @@ pub mod framed;
use byteorder::{BigEndian, ReadBytesExt};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use std::{borrow::Cow, collections::HashMap, fmt, io, str};
use smallvec::SmallVec;
use std::{borrow::Cow, fmt, io, ops::Range, str};
// re-export for use in utils pageserver_feedback.rs
pub use postgres_protocol::PG_EPOCH;
@@ -49,29 +50,67 @@ pub enum FeStartupPacket {
},
}
#[derive(Debug)]
pub struct StartupMessageParams {
params: HashMap<String, String>,
data: String,
pairs: SmallVec<[Range<u32>; 4]>,
// for easy access
user: Option<Range<u32>>,
database: Option<Range<u32>>,
options: Option<Range<u32>>,
replication: Option<Range<u32>>,
}
impl fmt::Debug for StartupMessageParams {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_map().entries(self.iter()).finish()
}
}
impl StartupMessageParams {
/// Get parameter's value by its name.
pub fn get(&self, name: &str) -> Option<&str> {
self.params.get(name).map(|s| s.as_str())
self.pairs
.iter()
.map(|r| &self.data[r.start as usize..r.end as usize])
.find_map(|pair| pair.strip_prefix(name).and_then(|x| x.strip_prefix('\0')))
}
pub fn user(&self) -> Option<&str> {
self.user
.clone()
.and_then(|r| self.data.get(r.start as usize..r.end as usize))
}
pub fn database(&self) -> Option<&str> {
self.database
.clone()
.and_then(|r| self.data.get(r.start as usize..r.end as usize))
}
pub(crate) fn options_str(&self) -> Option<&str> {
self.options
.clone()
.and_then(|r| self.data.get(r.start as usize..r.end as usize))
}
pub fn replication(&self) -> Option<&str> {
self.replication
.clone()
.and_then(|r| self.data.get(r.start as usize..r.end as usize))
}
/// Split command-line options according to PostgreSQL's logic,
/// taking into account all escape sequences but leaving them as-is.
/// [`None`] means that there's no `options` in [`Self`].
pub fn options_raw(&self) -> Option<impl Iterator<Item = &str>> {
self.get("options").map(Self::parse_options_raw)
self.options_str().map(Self::parse_options_raw)
}
/// Split command-line options according to PostgreSQL's logic,
/// applying all escape sequences (using owned strings as needed).
/// [`None`] means that there's no `options` in [`Self`].
pub fn options_escaped(&self) -> Option<impl Iterator<Item = Cow<'_, str>>> {
self.get("options").map(Self::parse_options_escaped)
self.options_str().map(Self::parse_options_escaped)
}
/// Split command-line options according to PostgreSQL's logic,
@@ -111,15 +150,44 @@ impl StartupMessageParams {
/// Iterate through key-value pairs in an arbitrary order.
pub fn iter(&self) -> impl Iterator<Item = (&str, &str)> {
self.params.iter().map(|(k, v)| (k.as_str(), v.as_str()))
self.pairs
.iter()
.map(|r| &self.data[r.start as usize..r.end as usize])
.flat_map(|pair| pair.split_once('\0'))
}
// This function is mostly useful in tests.
#[doc(hidden)]
pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self {
Self {
params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(),
let mut this = Self {
data: Default::default(),
pairs: Default::default(),
user: Default::default(),
database: Default::default(),
options: Default::default(),
replication: Default::default(),
};
for (k, v) in pairs {
let start = this.data.len();
this.data.push_str(k);
this.data.push('\0');
let value_offset = this.data.len();
this.data.push_str(v);
let end = this.data.len();
this.data.push('\0');
let range = start as u32..end as u32;
this.pairs.push(range);
let value_range = value_offset as u32..end as u32;
match k {
"user" => this.user = Some(value_range),
"database" => this.database = Some(value_range),
"options" => this.options = Some(value_range),
"replication" => this.replication = Some(value_range),
_ => {}
}
}
this.data.push('\0');
this
}
}
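The payoff of the refactor above is that all pairs live in one flat, null-separated buffer addressed by `Range<u32>` values, with cached ranges for the four hot keys. A minimal usage sketch (hypothetical test, assuming this module's `StartupMessageParams` is in scope):

```rust
#[test]
fn flat_params_sketch() {
    let params = StartupMessageParams::new([
        ("user", "alice"),
        ("database", "neondb"),
        ("options", "-c search_path=public"),
    ]);
    // The cached ranges resolve the common keys without hashing:
    assert_eq!(params.user(), Some("alice"));
    assert_eq!(params.database(), Some("neondb"));
    // Other keys fall back to a linear scan over the flat buffer:
    assert_eq!(params.get("application_name"), None);
}
```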
@@ -346,33 +414,62 @@ impl FeStartupPacket {
// Parse pairs of null-terminated strings (key, value).
// See `postgres: ProcessStartupPacket, build_startup_packet`.
let mut tokens = str::from_utf8(&msg)
let data = str::from_utf8(&msg)
.map_err(|_e| {
ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
})?
.strip_suffix('\0') // drop packet's own null
.ok_or_else(|| {
ProtocolError::Protocol(
.to_owned();
let mut params = StartupMessageParams {
data,
pairs: Default::default(),
user: Default::default(),
database: Default::default(),
options: Default::default(),
replication: Default::default(),
};
let mut offset = 0;
let mut rest = params.data.as_str();
loop {
let Some((key, rest1)) = rest.split_once('\0') else {
return Err(ProtocolError::Protocol(
"StartupMessage params: missing null terminator".to_string(),
)
})?
.split_terminator('\0');
));
};
// pairs terminated
if key.is_empty() {
params.data.truncate(offset + 1);
params.data.shrink_to_fit();
break;
}
let Some((value, rest2)) = rest1.split_once('\0') else {
return Err(ProtocolError::Protocol(
"StartupMessage params: missing null terminator".to_string(),
));
};
rest = rest2;
let mut params = HashMap::new();
while let Some(name) = tokens.next() {
let value = tokens.next().ok_or_else(|| {
ProtocolError::Protocol(
"StartupMessage params: key without value".to_string(),
)
})?;
let start = offset;
let value_offset = offset + key.len() + 1;
let end = value_offset + value.len();
offset = end + 1;
params.insert(name.to_owned(), value.to_owned());
params.pairs.push(start as u32..end as u32);
let value_range = value_offset as u32..end as u32;
match key {
"user" => params.user = Some(value_range),
"database" => params.database = Some(value_range),
"options" => params.options = Some(value_range),
"replication" => params.replication = Some(value_range),
_ => {}
}
}
FeStartupPacket::StartupMessage {
major_version,
minor_version,
params: StartupMessageParams { params },
params,
}
}
};
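For readers unfamiliar with the startup packet layout: the params section is a sequence of null-terminated key and value strings, ended by an empty key. A standalone sketch of the same parsing loop, stripped of the `Range` bookkeeping done above:

```rust
fn split_params(data: &str) -> Result<Vec<(&str, &str)>, &'static str> {
    let mut pairs = Vec::new();
    let mut rest = data;
    loop {
        let Some((key, rest1)) = rest.split_once('\0') else {
            return Err("missing null terminator");
        };
        // an empty key terminates the list of pairs
        if key.is_empty() {
            return Ok(pairs);
        }
        let Some((value, rest2)) = rest1.split_once('\0') else {
            return Err("missing null terminator");
        };
        pairs.push((key, value));
        rest = rest2;
    }
}

fn main() {
    let wire = "user\0alice\0database\0neondb\0\0";
    assert_eq!(
        split_params(wire).unwrap(),
        vec![("user", "alice"), ("database", "neondb")]
    );
}
```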

View File

@@ -28,6 +28,7 @@ use tokio_util::sync::CancellationToken;
use tracing::debug;
use crate::s3_bucket::RequestKind;
use crate::TimeTravelError;
use crate::{
AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath,
RemoteStorage, StorageMetadata,
@@ -379,12 +380,10 @@ impl RemoteStorage for AzureBlobStorage {
_timestamp: SystemTime,
_done_if_after: SystemTime,
_cancel: CancellationToken,
) -> anyhow::Result<()> {
) -> Result<(), TimeTravelError> {
// TODO use Azure point in time recovery feature for this
// https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
Err(anyhow::anyhow!(
"time travel recovery for azure blob storage is not implemented"
))
Err(TimeTravelError::Unimplemented)
}
}

View File

@@ -219,7 +219,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
timestamp: SystemTime,
done_if_after: SystemTime,
cancel: CancellationToken,
) -> anyhow::Result<()>;
) -> Result<(), TimeTravelError>;
}
pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
@@ -269,6 +269,45 @@ impl std::fmt::Display for DownloadError {
impl std::error::Error for DownloadError {}
#[derive(Debug)]
pub enum TimeTravelError {
/// Validation or other error happened due to user input.
BadInput(anyhow::Error),
/// The used remote storage does not have time travel recovery implemented
Unimplemented,
/// The number of versions/deletion markers is above our limit.
TooManyVersions,
/// A cancellation token aborted the process, typically during
/// request closure or process shutdown.
Cancelled,
/// Other errors
Other(anyhow::Error),
}
impl std::fmt::Display for TimeTravelError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
TimeTravelError::BadInput(e) => {
write!(
f,
"Failed to time travel recover a prefix due to user input: {e}"
)
}
TimeTravelError::Unimplemented => write!(
f,
"time travel recovery is not implemented for the current storage backend"
),
TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"),
TimeTravelError::TooManyVersions => {
write!(f, "Number of versions/delete markers above limit")
}
TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"),
}
}
}
impl std::error::Error for TimeTravelError {}
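A hypothetical caller-side helper (not part of this change) showing the kind of retry decision the typed error enables, which a bare `anyhow::Result` could not:

```rust
fn is_retryable(e: &TimeTravelError) -> bool {
    match e {
        // transient storage failures may succeed on a later attempt
        TimeTravelError::Other(_) => true,
        // user input, missing support, size limits, and shutdown are terminal
        TimeTravelError::BadInput(_)
        | TimeTravelError::Unimplemented
        | TimeTravelError::TooManyVersions
        | TimeTravelError::Cancelled => false,
    }
}
```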
/// Every storage kind currently supported.
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
#[derive(Clone)]
@@ -404,7 +443,7 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
timestamp: SystemTime,
done_if_after: SystemTime,
cancel: CancellationToken,
) -> anyhow::Result<()> {
) -> Result<(), TimeTravelError> {
match self {
Self::LocalFs(s) => {
s.time_travel_recover(prefix, timestamp, done_if_after, cancel)
@@ -434,7 +473,12 @@ impl GenericRemoteStorage {
Self::LocalFs(LocalFs::new(root.clone())?)
}
RemoteStorageKind::AwsS3(s3_config) => {
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
// The profile and access key id are only printed here for debugging purposes;
// their values don't indicate which auth method is eventually chosen.
let profile = std::env::var("AWS_PROFILE").unwrap_or_else(|_| "<none>".into());
let access_key_id =
std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "<none>".into());
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}",
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
}

View File

@@ -18,7 +18,9 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
use tracing::*;
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
use crate::{Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath};
use crate::{
Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath, TimeTravelError,
};
use super::{RemoteStorage, StorageMetadata};
@@ -430,8 +432,8 @@ impl RemoteStorage for LocalFs {
_timestamp: SystemTime,
_done_if_after: SystemTime,
_cancel: CancellationToken,
) -> anyhow::Result<()> {
unimplemented!()
) -> Result<(), TimeTravelError> {
Err(TimeTravelError::Unimplemented)
}
}

View File

@@ -46,7 +46,7 @@ use utils::backoff;
use super::StorageMetadata;
use crate::{
ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
S3Config, TimeTravelError, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
};
pub(super) mod metrics;
@@ -639,7 +639,7 @@ impl RemoteStorage for S3Bucket {
timestamp: SystemTime,
done_if_after: SystemTime,
cancel: CancellationToken,
) -> anyhow::Result<()> {
) -> Result<(), TimeTravelError> {
let kind = RequestKind::TimeTravel;
let _guard = self.permit(kind).await;
@@ -657,75 +657,112 @@ impl RemoteStorage for S3Bucket {
let max_retries = 10;
let is_permanent = |_e: &_| false;
let list = backoff::retry(
|| async {
Ok(self
.client
.list_object_versions()
.bucket(self.bucket_name.clone())
.set_prefix(prefix.clone())
.send()
.await?)
},
is_permanent,
warn_threshold,
max_retries,
"listing object versions for time_travel_recover",
backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
)
.await?;
let mut key_marker = None;
let mut version_id_marker = None;
let mut versions_and_deletes = Vec::new();
if list.is_truncated().unwrap_or_default() {
anyhow::bail!("Received truncated ListObjectVersions response for prefix={prefix:?}");
loop {
let response = backoff::retry(
|| async {
self.client
.list_object_versions()
.bucket(self.bucket_name.clone())
.set_prefix(prefix.clone())
.set_key_marker(key_marker.clone())
.set_version_id_marker(version_id_marker.clone())
.send()
.await
.map_err(|e| TimeTravelError::Other(e.into()))
},
is_permanent,
warn_threshold,
max_retries,
"listing object versions for time_travel_recover",
backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
)
.await?;
tracing::trace!(
" Got List response version_id_marker={:?}, key_marker={:?}",
response.version_id_marker,
response.key_marker
);
let versions = response
.versions
.unwrap_or_default()
.into_iter()
.map(VerOrDelete::from_version);
let deletes = response
.delete_markers
.unwrap_or_default()
.into_iter()
.map(VerOrDelete::from_delete_marker);
itertools::process_results(versions.chain(deletes), |n_vds| {
versions_and_deletes.extend(n_vds)
})
.map_err(TimeTravelError::Other)?;
fn none_if_empty(v: Option<String>) -> Option<String> {
v.filter(|v| !v.is_empty())
}
version_id_marker = none_if_empty(response.next_version_id_marker);
key_marker = none_if_empty(response.next_key_marker);
if version_id_marker.is_none() {
// The final response is not supposed to be truncated
if response.is_truncated.unwrap_or_default() {
return Err(TimeTravelError::Other(anyhow::anyhow!(
"Received truncated ListObjectVersions response for prefix={prefix:?}"
)));
}
break;
}
// Limit the number of versions/deletions we collect, mostly so that we
// don't keep requesting forever if the list is too long, since we hold
// the whole list in RAM.
// Building a 100k-entry list that reaches the limit takes roughly
// 40 seconds, and roughly corresponds to a tenant of 2 TiB physical size.
const COMPLEXITY_LIMIT: usize = 100_000;
if versions_and_deletes.len() >= COMPLEXITY_LIMIT {
return Err(TimeTravelError::TooManyVersions);
}
}
let mut versions_deletes = list
.versions()
.iter()
.map(VerOrDelete::Version)
.chain(list.delete_markers().iter().map(VerOrDelete::DeleteMarker))
.collect::<Vec<_>>();
tracing::info!(
"Built list for time travel with {} versions and deletions",
versions_and_deletes.len()
);
versions_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified()));
// Work on the list of references instead of the objects directly,
// otherwise we get lifetime errors in the sort_by_key call below.
let mut versions_and_deletes = versions_and_deletes.iter().collect::<Vec<_>>();
versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified));
let mut vds_for_key = HashMap::<_, Vec<_>>::new();
for vd in versions_deletes {
let last_modified = vd.last_modified();
let version_id = vd.version_id();
let key = vd.key();
let (Some(last_modified), Some(version_id), Some(key)) =
(last_modified, version_id, key)
else {
anyhow::bail!(
"One (or more) of last_modified, key, and id is None. \
Is versioning enabled in the bucket? last_modified={:?} key={:?} version_id={:?}",
last_modified, key, version_id,
);
};
for vd in &versions_and_deletes {
let VerOrDelete {
version_id, key, ..
} = &vd;
if version_id == "null" {
anyhow::bail!("Received ListVersions response for key={key} with version_id='null', \
indicating either disabled versioning, or legacy objects with null version id values");
return Err(TimeTravelError::Other(anyhow!("Received ListVersions response for key={key} with version_id='null', \
indicating either disabled versioning, or legacy objects with null version id values")));
}
tracing::trace!(
"Parsing version key={key} version_id={version_id} is_delete={}",
matches!(vd, VerOrDelete::DeleteMarker(_))
"Parsing version key={key} version_id={version_id} kind={:?}",
vd.kind
);
vds_for_key
.entry(key)
.or_default()
.push((vd, last_modified, version_id));
vds_for_key.entry(key).or_default().push(vd);
}
for (key, versions) in vds_for_key {
let (last_vd, last_last_modified, _version_id) = versions.last().unwrap();
if last_last_modified > &&done_if_after {
let last_vd = versions.last().unwrap();
if last_vd.last_modified > done_if_after {
tracing::trace!("Key {key} has version later than done_if_after, skipping");
continue;
}
// the version we want to restore to.
let version_to_restore_to =
match versions.binary_search_by_key(&timestamp, |tpl| *tpl.1) {
match versions.binary_search_by_key(&timestamp, |tpl| tpl.last_modified) {
Ok(v) => v,
Err(e) => e,
};
@@ -743,7 +780,11 @@ impl RemoteStorage for S3Bucket {
do_delete = true;
} else {
match &versions[version_to_restore_to - 1] {
(VerOrDelete::Version(_), _last_modified, version_id) => {
VerOrDelete {
kind: VerOrDeleteKind::Version,
version_id,
..
} => {
tracing::trace!("Copying old version {version_id} for {key}...");
// Restore the state to the last version by copying
let source_id =
@@ -751,37 +792,46 @@ impl RemoteStorage for S3Bucket {
backoff::retry(
|| async {
Ok(self
.client
self.client
.copy_object()
.bucket(self.bucket_name.clone())
.key(key)
.copy_source(&source_id)
.send()
.await?)
.await
.map_err(|e| TimeTravelError::Other(e.into()))
},
is_permanent,
warn_threshold,
max_retries,
"listing object versions for time_travel_recover",
backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
"copying object version for time_travel_recover",
backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
)
.await?;
tracing::info!(%version_id, %key, "Copied old version in S3");
}
(VerOrDelete::DeleteMarker(_), _last_modified, _version_id) => {
VerOrDelete {
kind: VerOrDeleteKind::DeleteMarker,
..
} => {
do_delete = true;
}
}
};
if do_delete {
if matches!(last_vd, VerOrDelete::DeleteMarker(_)) {
if matches!(last_vd.kind, VerOrDeleteKind::DeleteMarker) {
// Key has since been deleted (but there was some history), no need to do anything
tracing::trace!("Key {key} already deleted, skipping.");
} else {
tracing::trace!("Deleting {key}...");
let oid = ObjectIdentifier::builder().key(key.to_owned()).build()?;
self.delete_oids(kind, &[oid]).await?;
let oid = ObjectIdentifier::builder()
.key(key.to_owned())
.build()
.map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
self.delete_oids(kind, &[oid])
.await
.map_err(TimeTravelError::Other)?;
}
}
}
@@ -811,29 +861,59 @@ fn start_measuring_requests(
})
}
enum VerOrDelete<'a> {
Version(&'a ObjectVersion),
DeleteMarker(&'a DeleteMarkerEntry),
// Save RAM by storing only the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
struct VerOrDelete {
kind: VerOrDeleteKind,
last_modified: DateTime,
version_id: String,
key: String,
}
impl<'a> VerOrDelete<'a> {
fn last_modified(&self) -> Option<&'a DateTime> {
match self {
VerOrDelete::Version(v) => v.last_modified(),
VerOrDelete::DeleteMarker(v) => v.last_modified(),
}
#[derive(Debug)]
enum VerOrDeleteKind {
Version,
DeleteMarker,
}
impl VerOrDelete {
fn with_kind(
kind: VerOrDeleteKind,
last_modified: Option<DateTime>,
version_id: Option<String>,
key: Option<String>,
) -> anyhow::Result<Self> {
let lvk = (last_modified, version_id, key);
let (Some(last_modified), Some(version_id), Some(key)) = lvk else {
anyhow::bail!(
"One (or more) of last_modified, key, and id is None. \
Is versioning enabled in the bucket? last_modified={:?}, version_id={:?}, key={:?}",
lvk.0,
lvk.1,
lvk.2,
);
};
Ok(Self {
kind,
last_modified,
version_id,
key,
})
}
fn version_id(&self) -> Option<&'a str> {
match self {
VerOrDelete::Version(v) => v.version_id(),
VerOrDelete::DeleteMarker(v) => v.version_id(),
}
fn from_version(v: ObjectVersion) -> anyhow::Result<Self> {
Self::with_kind(
VerOrDeleteKind::Version,
v.last_modified,
v.version_id,
v.key,
)
}
fn key(&self) -> Option<&'a str> {
match self {
VerOrDelete::Version(v) => v.key(),
VerOrDelete::DeleteMarker(v) => v.key(),
}
fn from_delete_marker(v: DeleteMarkerEntry) -> anyhow::Result<Self> {
Self::with_kind(
VerOrDeleteKind::DeleteMarker,
v.last_modified,
v.version_id,
v.key,
)
}
}

View File

@@ -11,7 +11,7 @@ use tokio_util::sync::CancellationToken;
use crate::{
Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
StorageMetadata,
StorageMetadata, TimeTravelError,
};
pub struct UnreliableWrapper {
@@ -191,8 +191,9 @@ impl RemoteStorage for UnreliableWrapper {
timestamp: SystemTime,
done_if_after: SystemTime,
cancel: CancellationToken,
) -> anyhow::Result<()> {
self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))?;
) -> Result<(), TimeTravelError> {
self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))
.map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
self.inner
.time_travel_recover(prefix, timestamp, done_if_after, cancel)
.await

View File

@@ -112,6 +112,55 @@ pub async fn fsync_async(path: impl AsRef<Utf8Path>) -> Result<(), std::io::Erro
tokio::fs::File::open(path.as_ref()).await?.sync_all().await
}
pub async fn fsync_async_opt(
path: impl AsRef<Utf8Path>,
do_fsync: bool,
) -> Result<(), std::io::Error> {
if do_fsync {
fsync_async(path.as_ref()).await?;
}
Ok(())
}
/// Like postgres' durable_rename, renames the file, issuing fsyncs to make it
/// durable. After return, both the file and the rename are guaranteed to be persisted.
///
/// Unlike postgres, it only fsyncs 1) the file to be renamed, to make its
/// contents durable; 2) its directory entry, to make the rename durable; 3) the
/// already-renamed file again, which is not required by standards but postgres
/// does it, so let's stick to that. Postgres additionally fsyncs newpath *before*
/// the rename if it exists, to ensure that at least one of the files survives, but
/// current callers don't need that.
///
/// virtual_file.rs has similar code, but it doesn't use vfs.
///
/// Useful links: <https://lwn.net/Articles/457667/>
/// <https://www.postgresql.org/message-id/flat/56583BDD.9060302%402ndquadrant.com>
/// <https://thunk.org/tytso/blog/2009/03/15/dont-fear-the-fsync/>
pub async fn durable_rename(
old_path: impl AsRef<Utf8Path>,
new_path: impl AsRef<Utf8Path>,
do_fsync: bool,
) -> io::Result<()> {
// first fsync the file
fsync_async_opt(old_path.as_ref(), do_fsync).await?;
// Time to do the real deal.
tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?;
// Postgres'ish fsync of renamed file.
fsync_async_opt(new_path.as_ref(), do_fsync).await?;
// Now fsync the parent
let parent = match new_path.as_ref().parent() {
Some(p) => p,
None => Utf8Path::new("./"), // assume current dir if there is no parent
};
fsync_async_opt(parent, do_fsync).await?;
Ok(())
}
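A hypothetical usage sketch of `durable_rename` (assuming it is in scope from the module above): write the data to a temporary path first, then durably publish it under the final name, so a crash can never expose a torn destination file:

```rust
use camino::Utf8Path;

async fn write_and_publish(
    tmp: &Utf8Path,
    dst: &Utf8Path,
    bytes: &[u8],
) -> std::io::Result<()> {
    tokio::fs::write(tmp, bytes).await?;
    // fsyncs the file, renames it, fsyncs the renamed file, then the parent dir
    durable_rename(tmp, dst, /* do_fsync */ true).await
}
```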
#[cfg(test)]
mod tests {

View File

@@ -1,4 +1,10 @@
use std::{sync::Arc, time::Duration};
use std::{
sync::{
atomic::{AtomicBool, Ordering},
Arc,
},
time::Duration,
};
/// Gates are a concurrency helper, primarily used for implementing safe shutdown.
///
@@ -6,62 +12,70 @@ use std::{sync::Arc, time::Duration};
/// the resource calls `close()` when they want to ensure that all holders of guards
/// have released them, and that no future guards will be issued.
pub struct Gate {
/// Each caller of enter() takes one unit from the semaphore. In close(), we
/// take all the units to ensure all GateGuards are destroyed.
sem: Arc<tokio::sync::Semaphore>,
/// For observability only: a name that will be used to log warnings if a particular
/// gate is holding up shutdown
name: String,
inner: Arc<GateInner>,
}
impl std::fmt::Debug for Gate {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Gate<{}>", self.name)
f.debug_struct("Gate")
// use this for identification
.field("ptr", &Arc::as_ptr(&self.inner))
.field("inner", &self.inner)
.finish()
}
}
struct GateInner {
sem: tokio::sync::Semaphore,
closing: std::sync::atomic::AtomicBool,
}
impl std::fmt::Debug for GateInner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let avail = self.sem.available_permits();
let guards = u32::try_from(avail)
.ok()
// the sem only supports a 32-bit-ish amount, but let's play it safe
.and_then(|x| Gate::MAX_UNITS.checked_sub(x));
let closing = self.closing.load(Ordering::Relaxed);
if let Some(guards) = guards {
f.debug_struct("Gate")
.field("remaining_guards", &guards)
.field("closing", &closing)
.finish()
} else {
f.debug_struct("Gate")
.field("avail_permits", &avail)
.field("closing", &closing)
.finish()
}
}
}
/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
/// not complete.
#[derive(Debug)]
pub struct GateGuard(tokio::sync::OwnedSemaphorePermit);
pub struct GateGuard {
// Record the span where the gate was entered, so that we can identify who was blocking Gate::close
span_at_enter: tracing::Span,
gate: Arc<GateInner>,
}
/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate
async fn warn_if_stuck<Fut: std::future::Future>(
fut: Fut,
name: &str,
warn_period: std::time::Duration,
) -> <Fut as std::future::Future>::Output {
let started = std::time::Instant::now();
let mut fut = std::pin::pin!(fut);
let mut warned = false;
let ret = loop {
match tokio::time::timeout(warn_period, &mut fut).await {
Ok(ret) => break ret,
Err(_) => {
tracing::warn!(
gate = name,
elapsed_ms = started.elapsed().as_millis(),
"still waiting, taking longer than expected..."
);
warned = true;
}
impl Drop for GateGuard {
fn drop(&mut self) {
if self.gate.closing.load(Ordering::Relaxed) {
self.span_at_enter.in_scope(
|| tracing::info!(gate = ?Arc::as_ptr(&self.gate), "kept the gate from closing"),
);
}
};
// If we emitted a warning for slowness, also emit a message when we complete, so that
// someone debugging a shutdown can know for sure whether we have moved past this operation.
if warned {
tracing::info!(
gate = name,
elapsed_ms = started.elapsed().as_millis(),
"completed, after taking longer than expected"
)
// when the permit was acquired, it was forgotten to allow us to manage its lifecycle
// manually, so "return" the permit now.
self.gate.sem.add_permits(1);
}
ret
}
#[derive(Debug)]
@@ -69,15 +83,19 @@ pub enum GateError {
GateClosed,
}
impl Gate {
const MAX_UNITS: u32 = u32::MAX;
pub fn new(name: String) -> Self {
impl Default for Gate {
fn default() -> Self {
Self {
sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)),
name,
inner: Arc::new(GateInner {
sem: tokio::sync::Semaphore::new(Self::MAX_UNITS as usize),
closing: AtomicBool::new(false),
}),
}
}
}
impl Gate {
const MAX_UNITS: u32 = u32::MAX;
/// Acquire a guard that will prevent close() calls from completing. If close()
/// was already called, this will return an error which should be interpreted
@@ -88,11 +106,23 @@ impl Gate {
/// to avoid blocking close() indefinitely: typically types that contain a Gate will
/// also contain a CancellationToken.
pub fn enter(&self) -> Result<GateGuard, GateError> {
self.sem
.clone()
.try_acquire_owned()
.map(GateGuard)
.map_err(|_| GateError::GateClosed)
let permit = self
.inner
.sem
.try_acquire()
.map_err(|_| GateError::GateClosed)?;
// we now have the permit, let's disable the normal raii functionality and leave
// "returning" the permit to our GateGuard::drop.
//
// this is done to avoid the need for multiple Arcs (one for the semaphore, another
// for the other fields).
permit.forget();
Ok(GateGuard {
span_at_enter: tracing::Span::current(),
gate: self.inner.clone(),
})
}
/// Types with a shutdown() method and a gate should call this method at the
@@ -102,48 +132,88 @@ impl Gate {
/// important that the holders of such guards are respecting a CancellationToken which has
/// been cancelled before entering this function.
pub async fn close(&self) {
warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
let started_at = std::time::Instant::now();
let mut do_close = std::pin::pin!(self.do_close());
let nag_after = Duration::from_secs(1);
let Err(_timeout) = tokio::time::timeout(nag_after, &mut do_close).await else {
return;
};
tracing::info!(
gate = ?self.as_ptr(),
elapsed_ms = started_at.elapsed().as_millis(),
"closing is taking longer than expected"
);
// the close operation is not trying to be cancellation safe, as the pageserver does not need it.
//
// note: "closing" is not checked in Gate::enter -- it exists just for observability;
// any GateGuard dropped after this point will log who was holding the gate open.
self.inner.closing.store(true, Ordering::Relaxed);
do_close.await;
tracing::info!(
gate = ?self.as_ptr(),
elapsed_ms = started_at.elapsed().as_millis(),
"close completed"
);
}
/// Used as the identity of a gate. When the gate is actually closed, this identity is
/// resolved to something useful within a hopefully sensible `tracing::Span`, which will
/// describe it further.
///
/// `GateGuard::drop` also logs this pointer when it realizes it has been keeping the gate
/// open for too long.
fn as_ptr(&self) -> *const GateInner {
Arc::as_ptr(&self.inner)
}
/// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish. This
/// is usually analogous to "Did shutdown finish?" for types that include a Gate, whereas checking
/// the CancellationToken on such types is analogous to "Did shutdown start?"
pub fn close_complete(&self) -> bool {
self.sem.is_closed()
self.inner.sem.is_closed()
}
#[tracing::instrument(level = tracing::Level::DEBUG, skip_all, fields(gate = ?self.as_ptr()))]
async fn do_close(&self) {
tracing::debug!(gate = self.name, "Closing Gate...");
match self.sem.acquire_many(Self::MAX_UNITS).await {
Ok(_units) => {
tracing::debug!("Closing Gate...");
match self.inner.sem.acquire_many(Self::MAX_UNITS).await {
Ok(_permit) => {
// While holding all units, close the semaphore. All subsequent calls to enter() will fail.
self.sem.close();
self.inner.sem.close();
}
Err(_) => {
Err(_closed) => {
// Semaphore closed: we are the only function that can do this, so it indicates a double-call.
// This is legal. Timeline::shutdown for example is not protected from being called more than
// once.
tracing::debug!(gate = self.name, "Double close")
tracing::debug!("Double close")
}
}
tracing::debug!(gate = self.name, "Closed Gate.")
tracing::debug!("Closed Gate.")
}
}
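A sketch of the intended usage pattern (hypothetical `Service` type, assuming tokio with macros plus tokio-util): the CancellationToken signals "shutdown started" so guard holders exit promptly, and `Gate::close` then waits for "shutdown finished":

```rust
struct Service {
    gate: Gate,
    cancel: tokio_util::sync::CancellationToken,
}

impl Service {
    async fn handle_request(&self) -> Result<(), GateError> {
        // fails with GateClosed once close() has begun
        let _guard = self.gate.enter()?;
        tokio::select! {
            _ = self.cancel.cancelled() => {} // respect shutdown promptly
            _ = do_work() => {}
        }
        Ok(()) // dropping _guard lets close() make progress
    }

    async fn shutdown(&self) {
        self.cancel.cancel(); // "did shutdown start?" becomes true
        self.gate.close().await; // "did shutdown finish?" becomes true
    }
}

async fn do_work() {}
```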
#[cfg(test)]
mod tests {
use futures::FutureExt;
use super::*;
#[tokio::test]
async fn test_idle_gate() {
// Having taken no gates, we should not be blocked in close
let gate = Gate::new("test".to_string());
async fn close_unused() {
// Having taken no guards, we should not be blocked in close
let gate = Gate::default();
gate.close().await;
}
#[tokio::test]
async fn close_idle() {
// If a guard is dropped before entering, close should not be blocked
let gate = Gate::new("test".to_string());
let gate = Gate::default();
let guard = gate.enter().unwrap();
drop(guard);
gate.close().await;
@@ -152,25 +222,30 @@ mod tests {
gate.enter().expect_err("enter should fail after close");
}
#[tokio::test]
async fn test_busy_gate() {
let gate = Gate::new("test".to_string());
#[tokio::test(start_paused = true)]
async fn close_busy_gate() {
let gate = Gate::default();
let forever = Duration::from_secs(24 * 7 * 365);
let guard = gate.enter().unwrap();
let guard =
tracing::info_span!("i am holding back the gate").in_scope(|| gate.enter().unwrap());
let mut close_fut = std::pin::pin!(gate.close());
// Close should be blocked
assert!(close_fut.as_mut().now_or_never().is_none());
// Close should be waiting for guards to drop
tokio::time::timeout(forever, &mut close_fut)
.await
.unwrap_err();
// Attempting to enter() should fail, even though close isn't done yet.
gate.enter()
.expect_err("enter should fail after entering close");
// this will now log, which we cannot verify except manually
drop(guard);
// Guard is gone, close should finish
assert!(close_fut.as_mut().now_or_never().is_some());
close_fut.await;
// Attempting to enter() is still forbidden
gate.enter().expect_err("enter should fail finishing close");

View File

@@ -21,7 +21,6 @@ camino.workspace = true
camino-tempfile.workspace = true
chrono = { workspace = true, features = ["serde"] }
clap = { workspace = true, features = ["string"] }
close_fds.workspace = true
const_format.workspace = true
consumption_metrics.workspace = true
crc32c.workspace = true

View File

@@ -69,6 +69,25 @@ impl Client {
resp.json().await.map_err(Error::ReceiveBody)
}
/// Get an arbitrary path, returning a streaming Response. This function is suitable
/// for pass-through/proxy use cases where we don't care what the response content looks
/// like.
///
/// Use/add one of the properly typed methods below if you know you aren't proxying and
/// know what kind of response you expect.
pub async fn get_raw(&self, path: String) -> Result<reqwest::Response> {
debug_assert!(path.starts_with('/'));
let uri = format!("{}{}", self.mgmt_api_endpoint, path);
let req = self.client.request(Method::GET, uri);
let req = if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value)
} else {
req
};
req.send().await.map_err(Error::ReceiveBody)
}
pub async fn tenant_details(
&self,
tenant_shard_id: TenantShardId,
@@ -171,6 +190,25 @@ impl Client {
.map_err(Error::ReceiveBody)
}
/// The tenant deletion API can return 202 if deletion is incomplete, or
/// 404 if it is complete. Callers are responsible for checking the status
/// code and retrying. Error codes other than 404 will return Err().
pub async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result<StatusCode> {
let uri = format!("{}/v1/tenant/{tenant_shard_id}", self.mgmt_api_endpoint);
match self.request(Method::DELETE, &uri, ()).await {
Err(Error::ApiError(status_code, msg)) => {
if status_code == StatusCode::NOT_FOUND {
Ok(StatusCode::NOT_FOUND)
} else {
Err(Error::ApiError(status_code, msg))
}
}
Err(e) => Err(e),
Ok(response) => Ok(response.status()),
}
}
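A hypothetical caller-side sketch of the retry contract described above (assuming this module's `Client`, `Result`, and `StatusCode` are in scope; the polling interval is arbitrary):

```rust
async fn delete_tenant_and_wait(
    client: &Client,
    tenant_shard_id: TenantShardId,
) -> Result<()> {
    loop {
        // 404 means deletion has completed; other success codes (e.g. 202)
        // mean it is still in progress.
        if client.tenant_delete(tenant_shard_id).await? == StatusCode::NOT_FOUND {
            return Ok(());
        }
        tokio::time::sleep(std::time::Duration::from_millis(500)).await;
    }
}
```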
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
self.request(Method::PUT, &uri, req).await?;
@@ -234,6 +272,32 @@ impl Client {
.map_err(Error::ReceiveBody)
}
/// The timeline deletion API can return 202 if deletion is incomplete, or
/// 404 if it is complete. Callers are responsible for checking the status
/// code and retrying. Error codes other than 404 will return Err().
pub async fn timeline_delete(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Result<StatusCode> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
self.mgmt_api_endpoint
);
match self.request(Method::DELETE, &uri, ()).await {
Err(Error::ApiError(status_code, msg)) => {
if status_code == StatusCode::NOT_FOUND {
Ok(StatusCode::NOT_FOUND)
} else {
Err(Error::ApiError(status_code, msg))
}
}
Err(e) => Err(e),
Ok(response) => Ok(response.status()),
}
}
pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{}/reset",

View File

@@ -156,7 +156,8 @@ impl PagestreamClient {
PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
PagestreamBeMessage::Exists(_)
| PagestreamBeMessage::Nblocks(_)
| PagestreamBeMessage::DbSize(_) => {
| PagestreamBeMessage::DbSize(_)
| PagestreamBeMessage::GetSlruSegment(_) => {
anyhow::bail!(
"unexpected be message kind in response to getpage request: {}",
msg.kind()

View File

@@ -66,13 +66,10 @@ impl serde::Serialize for LatencyPercentiles {
{
use serde::ser::SerializeMap;
let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?;
for p in LATENCY_PERCENTILES {
for (p, v) in LATENCY_PERCENTILES.iter().zip(&self.latency_percentiles) {
ser.serialize_entry(
&format!("p{p}"),
&format!(
"{}",
&humantime::format_duration(self.latency_percentiles[0])
),
&format!("{}", humantime::format_duration(*v)),
)?;
}
ser.end()

View File

@@ -222,6 +222,8 @@ where
async fn send_tarball(mut self) -> anyhow::Result<()> {
// TODO include checksum
let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;
// Create pgdata subdirs structure
for dir in PGDATA_SUBDIRS.iter() {
let header = new_tar_header_dir(dir)?;
@@ -248,29 +250,29 @@ where
.context("could not add config file to basebackup tarball")?;
}
}
// Gather non-relational files from object storage pages.
let slru_partitions = self
.timeline
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
.await?
.partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
for part in slru_partitions.parts {
let blocks = self
if !lazy_slru_download {
// Gather non-relational files from object storage pages.
let slru_partitions = self
.timeline
.get_vectored(&part.ranges, self.lsn, self.ctx)
.await?;
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
.await?
.partition(Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64);
for (key, block) in blocks {
slru_builder.add_block(&key, block?).await?;
let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
for part in slru_partitions.parts {
let blocks = self
.timeline
.get_vectored(&part.ranges, self.lsn, self.ctx)
.await?;
for (key, block) in blocks {
slru_builder.add_block(&key, block?).await?;
}
}
slru_builder.finish().await?;
}
slru_builder.finish().await?;
let mut min_restart_lsn: Lsn = Lsn::MAX;
// Create tablespace directories
for ((spcnode, dbnode), has_relmap_file) in

View File

@@ -33,12 +33,10 @@ use pageserver::{
use postgres_backend::AuthType;
use utils::failpoint_support;
use utils::logging::TracingErrorLayerEnablement;
use utils::signals::ShutdownSignals;
use utils::{
auth::{JwtAuth, SwappableJwtAuth},
logging, project_build_tag, project_git_version,
sentry_init::init_sentry,
signals::Signal,
tcp_listener,
};
@@ -656,34 +654,42 @@ fn start_pageserver(
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
// All started up! Now just sit and wait for shutdown signal.
ShutdownSignals::handle(|signal| match signal {
Signal::Quit => {
info!(
"Got {}. Terminating in immediate shutdown mode",
signal.name()
);
std::process::exit(111);
}
{
use signal_hook::consts::*;
let signal_handler = BACKGROUND_RUNTIME.spawn_blocking(move || {
let mut signals =
signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
return signals
.forever()
.next()
.expect("forever() never returns None unless explicitly closed");
});
let signal = BACKGROUND_RUNTIME
.block_on(signal_handler)
.expect("join error");
match signal {
SIGQUIT => {
info!("Got signal {signal}. Terminating in immediate shutdown mode",);
std::process::exit(111);
}
SIGINT | SIGTERM => {
info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",);
Signal::Interrupt | Signal::Terminate => {
info!(
"Got {}. Terminating gracefully in fast shutdown mode",
signal.name()
);
// This cancels the `shutdown_pageserver` cancellation tree.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
bg_remote_storage.map(|_| bg_deletion_queue),
0,
));
unreachable!()
// This cancels the `shutdown_pageserver` cancellation tree.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
bg_remote_storage.map(|_| bg_deletion_queue),
0,
));
unreachable!()
}
_ => unreachable!(),
}
})
}
}
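Untangled from the diff above, the new signal handling reduces to a small blocking task (sketch, assuming the signal-hook crate and a tokio runtime handle named `runtime`):

```rust
use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM};

fn wait_for_signal(runtime: &tokio::runtime::Runtime) -> i32 {
    let handler = runtime.spawn_blocking(|| {
        let mut signals =
            signal_hook::iterator::Signals::new([SIGINT, SIGTERM, SIGQUIT]).unwrap();
        signals
            .forever()
            .next()
            .expect("forever() never returns None unless explicitly closed")
    });
    // SIGQUIT => immediate exit; SIGINT/SIGTERM => graceful shutdown
    runtime.block_on(handler).expect("join error")
}
```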
fn create_remote_storage_client(

View File

@@ -97,23 +97,86 @@ pub enum EvictionOrder {
/// Order the layers to be evicted by how recently they have been accessed relatively within
/// the set of resident layers of a tenant.
///
/// This strategy will evict layers more fairly but is untested.
RelativeAccessed {
#[serde(default)]
/// Determines if the tenant with the most layers should lose first.
///
/// Having this enabled is currently the only reasonable option, because the order in which
/// we read tenants is deterministic. If we find the need to set this to `false`, we need
/// to ensure nondeterminism by adding a random number to break the
/// `relative_last_activity==0.0` ties.
#[serde(default = "default_highest_layer_count_loses_first")]
highest_layer_count_loses_first: bool,
},
}
fn default_highest_layer_count_loses_first() -> bool {
true
}
impl EvictionOrder {
/// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer
/// counts should be the first ones to have their layers evicted.
fn highest_layer_count_loses_first(&self) -> bool {
fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) {
use EvictionOrder::*;
match self {
EvictionOrder::AbsoluteAccessed => false,
EvictionOrder::RelativeAccessed {
AbsoluteAccessed => {
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.last_activity_ts)
});
}
RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.relative_last_activity)
}),
}
}
/// Called to fill in the [`EvictionCandidate::relative_last_activity`] while iterating tenants
/// layers in **most** recently used order.
fn relative_last_activity(&self, total: usize, index: usize) -> finite_f32::FiniteF32 {
use EvictionOrder::*;
match self {
AbsoluteAccessed => finite_f32::FiniteF32::ZERO,
RelativeAccessed {
highest_layer_count_loses_first,
} => *highest_layer_count_loses_first,
} => {
// whether we keep the -1 decides if every tenant should lose its least recently
// accessed layer, OR if this should happen in order of the highest layer count:
let fudge = if *highest_layer_count_loses_first {
// relative_last_activity vs. tenant layer count:
// - 0.1..=1.0 (10 layers)
// - 0.01..=1.0 (100 layers)
// - 0.001..=1.0 (1000 layers)
//
// leading to evicting less of the smallest tenants.
0
} else {
// use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
// layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
// be that less than 10k layer evictions is enough, so we would not need to evict from
// all tenants.
//
// as the tenant ordering is now deterministic, this could hit the same tenants
// disproportionately on multiple invocations. an alternative could be to remember how
// many layers we evicted from this tenant last time, and inject that as an additional
// fudge here.
1
};
let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1);
let divider = total as f32;
// most recently used is always (total - 0) / divider == 1.0
// least recently used depends on the fudge:
// - (total - 1) - (total - 1) / total => 0 / total
// - total - (total - 1) / total => 1 / total
let distance = (total - index) as f32;
finite_f32::FiniteF32::try_from_normalized(distance / divider)
.unwrap_or_else(|val| {
tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={index}, total={total}: {val}");
finite_f32::FiniteF32::ZERO
})
}
}
}
}
@@ -389,52 +452,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
let selection = select_victims(&candidates, usage_pre);
let mut candidates = candidates;
let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) {
// we currently have the layers ordered by AbsoluteAccessed so that we can get the summary
// for comparison here. this is a temporary measure to develop alternatives.
use std::fmt::Write;
let mut summary_buf = String::with_capacity(256);
{
let absolute_summary = candidates
.iter()
.take(selection.amount)
.map(|(_, candidate)| candidate)
.collect::<summary::EvictionSummary>();
write!(summary_buf, "{absolute_summary}").expect("string grows");
info!("absolute accessed selection summary: {summary_buf}");
}
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.relative_last_activity)
});
let selection = select_victims(&candidates, usage_pre);
{
summary_buf.clear();
let relative_summary = candidates
.iter()
.take(selection.amount)
.map(|(_, candidate)| candidate)
.collect::<summary::EvictionSummary>();
write!(summary_buf, "{relative_summary}").expect("string grows");
info!("relative accessed selection summary: {summary_buf}");
}
selection
} else {
selection
};
let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
// phase2: evict layers
@@ -835,54 +852,12 @@ async fn collect_eviction_candidates(
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
let mut cumsum: i128 = 0;
// keeping the -1 or not decides if every tenant should lose their least recently accessed
// layer OR if this should happen in the order of having highest layer count:
let fudge = if eviction_order.highest_layer_count_loses_first() {
// relative_age vs. tenant layer count:
// - 0.1..=1.0 (10 layers)
// - 0.01..=1.0 (100 layers)
// - 0.001..=1.0 (1000 layers)
//
// leading to evicting less of the smallest tenants.
0
} else {
// use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
// layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
// be that less than 10k layer evictions is enough, so we would not need to evict from
// all tenants.
//
// as the tenant ordering is now deterministic, this could hit the same tenants
// disproportionately on multiple invocations. an alternative could be to remember how
// many layers we evicted from this tenant last time, and inject that as an additional
// fudge here.
1
};
let total = tenant_candidates
.len()
.checked_sub(fudge)
.filter(|&x| x > 0)
// support 0 or 1 resident layer tenants as well
.unwrap_or(1);
let divider = total as f32;
let total = tenant_candidates.len();
for (i, mut candidate) in tenant_candidates.into_iter().enumerate() {
// as we iterate this reverse sorted list, the most recently accessed layer will always
// be 1.0; this is for us to evict it last.
candidate.relative_last_activity = if matches!(
eviction_order,
EvictionOrder::RelativeAccessed { .. }
) {
// another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or
// similarly for u16. unsure how it would help.
finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider)
.unwrap_or_else(|val| {
tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}");
finite_f32::FiniteF32::ZERO
})
} else {
finite_f32::FiniteF32::ZERO
};
candidate.relative_last_activity = eviction_order.relative_last_activity(total, i);
let partition = if cumsum > min_resident_size as i128 {
MinResidentSizePartition::Above
@@ -927,10 +902,7 @@ async fn collect_eviction_candidates(
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
// always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we
// will sort later by candidate.relative_last_activity to compare evictions.
candidates
.sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
eviction_order.sort(&mut candidates);
Ok(EvictionCandidates::Finished(candidates))
}
@@ -1070,6 +1042,12 @@ pub(crate) mod finite_f32 {
}
}
impl From<FiniteF32> for f32 {
fn from(value: FiniteF32) -> f32 {
value.0
}
}
impl FiniteF32 {
pub const ZERO: FiniteF32 = FiniteF32(0.0);
@@ -1082,136 +1060,9 @@ pub(crate) mod finite_f32 {
Err(value)
}
}
}
}
mod summary {
use super::finite_f32::FiniteF32;
use super::{EvictionCandidate, LayerCount};
use pageserver_api::shard::TenantShardId;
use std::collections::{BTreeMap, HashMap};
use std::time::SystemTime;
#[derive(Debug, Default)]
pub(super) struct EvictionSummary {
evicted_per_tenant: HashMap<TenantShardId, LayerCount>,
total: LayerCount,
last_absolute: Option<SystemTime>,
last_relative: Option<FiniteF32>,
}
impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary {
fn from_iter<T: IntoIterator<Item = &'a EvictionCandidate>>(iter: T) -> Self {
let mut summary = EvictionSummary::default();
for item in iter {
let counts = summary
.evicted_per_tenant
.entry(*item.layer.get_tenant_shard_id())
.or_default();
let sz = item.layer.get_file_size();
counts.file_sizes += sz;
counts.count += 1;
summary.total.file_sizes += sz;
summary.total.count += 1;
summary.last_absolute = Some(item.last_activity_ts);
summary.last_relative = Some(item.relative_last_activity);
}
summary
}
}
struct SiBytesAmount(u64);
impl std::fmt::Display for SiBytesAmount {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if self.0 < 1024 {
return write!(f, "{}B", self.0);
}
let mut tmp = self.0;
let mut ch = 0;
let suffixes = b"KMGTPE";
while tmp > 1024 * 1024 && ch < suffixes.len() - 1 {
tmp /= 1024;
ch += 1;
}
let ch = suffixes[ch] as char;
write!(f, "{:.1}{ch}iB", tmp as f64 / 1024.0)
}
}
impl std::fmt::Display for EvictionSummary {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// wasteful, but it's for testing
let mut sorted: BTreeMap<usize, Vec<(TenantShardId, u64)>> = BTreeMap::new();
for (tenant_shard_id, count) in &self.evicted_per_tenant {
sorted
.entry(count.count)
.or_default()
.push((*tenant_shard_id, count.file_sizes));
}
let total_file_sizes = SiBytesAmount(self.total.file_sizes);
writeln!(
f,
"selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):",
self.total.count, self.last_absolute, self.last_relative,
)?;
for (count, per_tenant) in sorted.iter().rev().take(10) {
write!(f, "- {count} layers: ")?;
if per_tenant.len() < 3 {
for (i, (tenant_shard_id, bytes)) in per_tenant.iter().enumerate() {
if i > 0 {
write!(f, ", ")?;
}
let bytes = SiBytesAmount(*bytes);
write!(f, "{tenant_shard_id} ({bytes})")?;
}
} else {
let num_tenants = per_tenant.len();
let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>();
let total_bytes = SiBytesAmount(total_bytes);
let layers = num_tenants * count;
write!(
f,
"{num_tenants} tenants {total_bytes} in total {layers} layers",
)?;
}
writeln!(f)?;
}
if sorted.len() > 10 {
let (rem_count, rem_bytes) = sorted
.iter()
.rev()
.map(|(count, per_tenant)| {
(
count,
per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>(),
)
})
.fold((0, 0), |acc, next| (acc.0 + next.0, acc.1 + next.1));
let rem_bytes = SiBytesAmount(rem_bytes);
writeln!(f, "- rest of tenants ({}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", sorted.len() - 10, 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?;
}
Ok(())
pub fn into_inner(self) -> f32 {
self.into()
}
}
}
@@ -1336,3 +1187,40 @@ mod filesystem_level_usage {
assert!(!usage.has_pressure());
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn relative_equal_bounds() {
let order = EvictionOrder::RelativeAccessed {
highest_layer_count_loses_first: false,
};
let len = 10;
let v = (0..len)
.map(|i| order.relative_last_activity(len, i).into_inner())
.collect::<Vec<_>>();
assert_eq!(v.first(), Some(&1.0));
assert_eq!(v.last(), Some(&0.0));
assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
}
#[test]
fn relative_spare_bounds() {
let order = EvictionOrder::RelativeAccessed {
highest_layer_count_loses_first: true,
};
let len = 10;
let v = (0..len)
.map(|i| order.relative_last_activity(len, i).into_inner())
.collect::<Vec<_>>();
assert_eq!(v.first(), Some(&1.0));
assert_eq!(v.last(), Some(&0.1));
assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
}
}

View File

@@ -178,6 +178,64 @@ paths:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/time_travel_remote_storage:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
- name: travel_to
in: query
required: true
schema:
type: string
format: date-time
- name: done_if_after
in: query
required: true
schema:
type: string
format: date-time
put:
description: Time travel the tenant's remote storage
responses:
"200":
description: OK
content:
application/json:
schema:
type: string
"400":
description: Error when no tenant id found in path or invalid timestamp
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/timeline:
parameters:
@@ -419,12 +477,6 @@ paths:
type: string
format: date-time
description: A timestamp to get the LSN
- name: version
in: query
required: false
schema:
type: integer
description: The version of the endpoint to use
responses:
"200":
description: OK
@@ -674,6 +726,10 @@ paths:
responses:
"200":
description: Tenant is now in requested state
content:
application/json:
schema:
$ref: "#/components/schemas/TenantLocationConfigResponse"
"503":
description: Tenant's state cannot be changed right now. Wait a few seconds and retry.
content:
@@ -1426,6 +1482,28 @@ components:
$ref: '#/components/schemas/SecondaryConfig'
tenant_conf:
$ref: '#/components/schemas/TenantConfig'
TenantLocationConfigResponse:
type: object
required:
- shards
properties:
shards:
description: Pageservers where this tenant's shards are attached. Not populated for secondary locations.
type: array
items:
$ref: "#/components/schemas/TenantShardLocation"
TenantShardLocation:
type: object
required:
- node_id
- shard_id
properties:
node_id:
description: Pageserver node ID where this shard is attached
type: integer
shard_id:
description: Tenant shard ID of the shard
type: string
SecondaryConfig:
type: object
properties:

View File

@@ -17,6 +17,8 @@ use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::LocationConfigListResponse;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantLocationConfigResponse;
use pageserver_api::models::TenantShardLocation;
use pageserver_api::models::TenantState;
use pageserver_api::models::{
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
@@ -24,6 +26,7 @@ use pageserver_api::models::{
};
use pageserver_api::shard::TenantShardId;
use remote_storage::GenericRemoteStorage;
use remote_storage::TimeTravelError;
use tenant_size_model::{SizeResult, StorageModel};
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -45,6 +48,7 @@ use crate::tenant::mgr::{
TenantSlotError, TenantSlotUpsertError, TenantStateError,
};
use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
use crate::tenant::remote_timeline_client;
use crate::tenant::secondary::SecondaryController;
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
@@ -75,8 +79,14 @@ use utils::{
// For APIs that require an Active tenant, how long should we block waiting for that state?
// This is not functionally necessary (clients will retry), but avoids generating a lot of
// failed API calls while tenants are activating.
#[cfg(not(feature = "testing"))]
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
// Tests run on slow/oversubscribed nodes, and may need to wait much longer for tenants to
// finish attaching, if calls to remote storage are slow.
#[cfg(feature = "testing")]
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
pub struct State {
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
@@ -957,6 +967,7 @@ async fn tenant_status(
attachment_status: state.attachment_status(),
generation: tenant.generation().into(),
},
walredo: tenant.wal_redo_manager_status(),
timelines: tenant.list_timeline_ids(),
})
}
@@ -1356,7 +1367,7 @@ async fn put_tenant_location_config_handler(
let location_conf =
LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
state
let attached = state
.tenant_manager
.upsert_location(
tenant_shard_id,
@@ -1365,7 +1376,8 @@ async fn put_tenant_location_config_handler(
tenant::SpawnMode::Normal,
&ctx,
)
.await?;
.await?
.is_some();
if let Some(_flush_ms) = flush {
match state
@@ -1384,7 +1396,18 @@ async fn put_tenant_location_config_handler(
tracing::info!("No flush requested when configuring");
}
json_response(StatusCode::OK, ())
// This API returns a vector of pageservers where the tenant is attached: this is
// primarily for use in the sharding service. For compatibility, we also return this
// when called directly on a pageserver, but the payload is always zero or one shards.
let mut response = TenantLocationConfigResponse { shards: Vec::new() };
if attached {
response.shards.push(TenantShardLocation {
shard_id: tenant_shard_id,
node_id: state.conf.id,
})
}
json_response(StatusCode::OK, response)
}
async fn list_location_config_handler(
@@ -1409,6 +1432,79 @@ async fn list_location_config_handler(
json_response(StatusCode::OK, result)
}
// Do a time travel recovery on the given tenant/tenant shard. Tenant needs to be detached
// (from all pageservers) as it invalidates consistency assumptions.
async fn tenant_time_travel_remote_storage_handler(
request: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let timestamp_raw = must_get_query_param(&request, "travel_to")?;
let timestamp = humantime::parse_rfc3339(&timestamp_raw)
.with_context(|| format!("Invalid time for travel_to: {timestamp_raw:?}"))
.map_err(ApiError::BadRequest)?;
let done_if_after_raw = must_get_query_param(&request, "done_if_after")?;
let done_if_after = humantime::parse_rfc3339(&done_if_after_raw)
.with_context(|| format!("Invalid time for done_if_after: {done_if_after_raw:?}"))
.map_err(ApiError::BadRequest)?;
// This is just a sanity check to fend off naive, incorrect usages of the API:
// the tenant needs to be detached *everywhere*
let state = get_state(&request);
let we_manage_tenant = state.tenant_manager.manages_tenant_shard(tenant_shard_id);
if we_manage_tenant {
return Err(ApiError::BadRequest(anyhow!(
"Tenant {tenant_shard_id} is already attached at this pageserver"
)));
}
let Some(storage) = state.remote_storage.as_ref() else {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"remote storage not configured, cannot run time travel"
)));
};
if timestamp > done_if_after {
return Err(ApiError::BadRequest(anyhow!(
"The done_if_after timestamp comes before the timestamp to recover to"
)));
}
tracing::info!("Issuing time travel request internally. timestamp={timestamp_raw}, done_if_after={done_if_after_raw}");
remote_timeline_client::upload::time_travel_recover_tenant(
storage,
&tenant_shard_id,
timestamp,
done_if_after,
&cancel,
)
.await
.map_err(|e| match e {
TimeTravelError::BadInput(e) => {
warn!("bad input error: {e}");
ApiError::BadRequest(anyhow!("bad input error"))
}
TimeTravelError::Unimplemented => {
ApiError::BadRequest(anyhow!("unimplemented for the configured remote storage"))
}
TimeTravelError::Cancelled => ApiError::InternalServerError(anyhow!("cancelled")),
TimeTravelError::TooManyVersions => {
ApiError::InternalServerError(anyhow!("too many versions in remote storage"))
}
TimeTravelError::Other(e) => {
warn!("internal error: {e}");
ApiError::InternalServerError(anyhow!("internal error"))
}
})?;
json_response(StatusCode::OK, ())
}
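A hypothetical client sketch for the new endpoint using reqwest (names and base URL are illustrative): pass RFC 3339 timestamps, and leave a few seconds of slack around `travel_to`, since S3 timestamps have second granularity:

```rust
async fn time_travel_tenant(
    base: &str,          // e.g. "http://pageserver:9898" (illustrative)
    tenant_id: &str,
    travel_to: &str,     // RFC 3339 target state to recover
    done_if_after: &str, // RFC 3339 start of this recovery attempt
) -> reqwest::Result<()> {
    let url = format!("{base}/v1/tenant/{tenant_id}/time_travel_remote_storage");
    reqwest::Client::new()
        .put(url)
        .query(&[("travel_to", travel_to), ("done_if_after", done_if_after)])
        .send()
        .await?
        .error_for_status()?;
    Ok(())
}
```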
/// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
async fn handle_tenant_break(
r: Request<Body>,
@@ -1954,6 +2050,10 @@ pub fn make_router(
.get("/v1/location_config", |r| {
api_handler(r, list_location_config_handler)
})
.put(
"/v1/tenant/:tenant_shard_id/time_travel_remote_storage",
|r| api_handler(r, tenant_time_travel_remote_storage_handler),
)
.get("/v1/tenant/:tenant_shard_id/timeline", |r| {
api_handler(r, timeline_list_handler)
})

View File

@@ -1043,6 +1043,7 @@ pub enum SmgrQueryType {
GetRelSize,
GetPageAtLsn,
GetDbSize,
GetSlruSegment,
}
#[derive(Debug)]
@@ -1159,11 +1160,12 @@ mod smgr_query_time_tests {
#[test]
fn op_label_name() {
use super::SmgrQueryType::*;
let expect: [(super::SmgrQueryType, &'static str); 4] = [
let expect: [(super::SmgrQueryType, &'static str); 5] = [
(GetRelExists, "get_rel_exists"),
(GetRelSize, "get_rel_size"),
(GetPageAtLsn, "get_page_at_lsn"),
(GetDbSize, "get_db_size"),
(GetSlruSegment, "get_slru_segment"),
];
for (op, expect) in expect {
let actual: &'static str = op.into();
@@ -1649,11 +1651,18 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
.unwrap()
});
#[rustfmt::skip]
pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_process_launch_duration",
"Histogram of the duration of successful WalRedoProcess::launch calls",
redo_histogram_time_buckets!(),
vec![
0.0002, 0.0004, 0.0006, 0.0008, 0.0010,
0.0020, 0.0040, 0.0060, 0.0080, 0.0100,
0.0200, 0.0400, 0.0600, 0.0800, 0.1000,
0.2000, 0.4000, 0.6000, 0.8000, 1.0000,
1.5000, 2.0000, 2.5000, 3.0000, 4.0000, 10.0000
],
)
.expect("failed to define a metric")
});

View File

@@ -22,7 +22,8 @@ use pageserver_api::models::{
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
PagestreamNblocksRequest, PagestreamNblocksResponse,
PagestreamGetSlruSegmentRequest, PagestreamGetSlruSegmentResponse, PagestreamNblocksRequest,
PagestreamNblocksResponse,
};
use pageserver_api::shard::ShardIndex;
use pageserver_api::shard::{ShardCount, ShardNumber};
@@ -74,8 +75,8 @@ use crate::tenant::GetTimelineError;
use crate::tenant::PageReconstructError;
use crate::tenant::Timeline;
use crate::trace::Tracer;
use pageserver_api::key::rel_block_to_key;
use pageserver_api::reltag::SlruKind;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
@@ -368,6 +369,16 @@ impl From<WaitLsnError> for PageStreamError {
}
}
impl From<WaitLsnError> for QueryError {
fn from(value: WaitLsnError) -> Self {
match value {
e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)),
WaitLsnError::Shutdown => Self::Shutdown,
WaitLsnError::BadState => Self::Reconnect,
}
}
}
impl PageServerHandler {
pub fn new(
conf: &'static PageServerConf,
@@ -637,6 +648,15 @@ impl PageServerHandler {
span,
)
}
PagestreamFeMessage::GetSlruSegment(req) => {
let span = tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.lsn);
(
self.handle_get_slru_segment_request(tenant_id, timeline_id, &req, &ctx)
.instrument(span.clone())
.await,
span,
)
}
};
match response {
@@ -1127,6 +1147,33 @@ impl PageServerHandler {
}))
}
async fn handle_get_slru_segment_request(
&mut self,
tenant_id: TenantId,
timeline_id: TimelineId,
req: &PagestreamGetSlruSegmentRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetSlruSegment);
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let kind = SlruKind::from_repr(req.kind)
.ok_or(PageStreamError::BadRequest("invalid SLRU kind".into()))?;
let segment = timeline.get_slru_segment(kind, req.segno, lsn, ctx).await?;
Ok(PagestreamBeMessage::GetSlruSegment(
PagestreamGetSlruSegmentResponse { segment },
))
}
#[allow(clippy::too_many_arguments)]
#[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))]
async fn handle_basebackup_request<IO>(
@@ -1139,7 +1186,7 @@ impl PageServerHandler {
full_backup: bool,
gzip: bool,
ctx: RequestContext,
) -> anyhow::Result<()>
) -> Result<(), QueryError>
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
@@ -1404,7 +1451,7 @@ where
)
.await?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
anyhow::Ok(())
Result::<(), QueryError>::Ok(())
},
)
.await?;
@@ -1678,6 +1725,7 @@ impl From<GetActiveTenantError> for QueryError {
| GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
QueryError::Shutdown
}
e @ GetActiveTenantError::NotFound(_) => QueryError::NotFound(format!("{e}").into()),
e => QueryError::Other(anyhow::anyhow!(e)),
}
}

View File

@@ -12,7 +12,7 @@ use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::repository::*;
use crate::walrecord::NeonWalRecord;
use anyhow::{ensure, Context};
use bytes::{Buf, Bytes};
use bytes::{Buf, Bytes, BytesMut};
use pageserver_api::key::{
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
@@ -321,6 +321,27 @@ impl Timeline {
}
}
/// Get the whole SLRU segment
pub(crate) async fn get_slru_segment(
&self,
kind: SlruKind,
segno: u32,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
let n_blocks = self
.get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
.await?;
let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
for blkno in 0..n_blocks {
let block = self
.get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx)
.await?;
segment.extend_from_slice(&block[..BLCKSZ as usize]);
}
Ok(segment.freeze())
}
/// Look up given SLRU page version.
pub(crate) async fn get_slru_page_at_lsn(
&self,

View File

@@ -20,6 +20,7 @@ use futures::FutureExt;
use futures::StreamExt;
use pageserver_api::models;
use pageserver_api::models::TimelineState;
use pageserver_api::models::WalRedoManagerStatus;
use pageserver_api::shard::ShardIdentity;
use pageserver_api::shard::TenantShardId;
use remote_storage::DownloadError;
@@ -364,6 +365,14 @@ impl WalRedoManager {
}
}
}
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
match self {
WalRedoManager::Prod(m) => m.status(),
#[cfg(test)]
WalRedoManager::Test(_) => None,
}
}
}
#[derive(Debug, thiserror::Error, PartialEq, Eq)]
@@ -1020,6 +1029,7 @@ impl Tenant {
Some(remote_timeline_client),
self.deletion_queue_client.clone(),
)
.instrument(tracing::info_span!("timeline_delete", %timeline_id))
.await
.context("resume_deletion")
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
@@ -1956,6 +1966,10 @@ impl Tenant {
self.generation
}
pub(crate) fn wal_redo_manager_status(&self) -> Option<WalRedoManagerStatus> {
self.walredo_mgr.status()
}
/// Changes tenant status to active, unless shutdown was already requested.
///
/// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup
@@ -2093,7 +2107,10 @@ impl Tenant {
let timelines = self.timelines.lock().unwrap();
timelines.values().for_each(|timeline| {
let timeline = Arc::clone(timeline);
let span = Span::current();
let timeline_id = timeline.timeline_id;
let span =
tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush);
js.spawn(async move {
if freeze_and_flush {
timeline.flush_and_shutdown().instrument(span).await
@@ -2693,7 +2710,7 @@ impl Tenant {
activate_now_sem: tokio::sync::Semaphore::new(0),
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
cancel: CancellationToken::default(),
gate: Gate::new(format!("Tenant<{tenant_shard_id}>")),
gate: Gate::default(),
}
}
@@ -3778,6 +3795,11 @@ async fn run_initdb(
.env_clear()
.env("LD_LIBRARY_PATH", &initdb_lib_dir)
.env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
.stdin(std::process::Stdio::null())
// initdb prints the same output to stdout every time; we don't need it
.stdout(std::process::Stdio::null())
// we would be interested in the stderr output, if there was any
.stderr(std::process::Stdio::piped())
.spawn()?;
// Ideally we'd select here with the cancellation token, but the problem is that
@@ -3898,6 +3920,7 @@ pub(crate) mod harness {
),
gc_feedback: Some(tenant_conf.gc_feedback),
heatmap_period: Some(tenant_conf.heatmap_period),
lazy_slru_download: Some(tenant_conf.lazy_slru_download),
}
}
}
@@ -5220,7 +5243,7 @@ mod tests {
let raw_tline = tline.raw_timeline().unwrap();
raw_tline
.shutdown()
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id))
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, timeline_id=%TIMELINE_ID))
.await;
std::mem::forget(tline);
}

View File

@@ -345,6 +345,9 @@ pub struct TenantConf {
/// may be disabled if a Tenant will not have secondary locations: only secondary
/// locations will use the heatmap uploaded by attached locations.
pub heatmap_period: Duration,
/// If true, SLRU segments are downloaded on demand; if false, SLRU segments are included in the basebackup
pub lazy_slru_download: bool,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -430,6 +433,10 @@ pub struct TenantConfOpt {
#[serde(with = "humantime_serde")]
#[serde(default)]
pub heatmap_period: Option<Duration>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub lazy_slru_download: Option<bool>,
}
impl TenantConfOpt {
@@ -475,6 +482,9 @@ impl TenantConfOpt {
.unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period),
lazy_slru_download: self
.lazy_slru_download
.unwrap_or(global_conf.lazy_slru_download),
}
}
}
@@ -513,6 +523,7 @@ impl Default for TenantConf {
.expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
gc_feedback: false,
heatmap_period: Duration::ZERO,
lazy_slru_download: false,
}
}
}
@@ -584,6 +595,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
.map(humantime),
gc_feedback: value.gc_feedback,
heatmap_period: value.heatmap_period.map(humantime),
lazy_slru_download: value.lazy_slru_download,
}
}
}
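The merge above follows one pattern throughout: each per-tenant option overrides the global default only when set. A minimal sketch of that pattern, with illustrative types rather than the real TenantConf/TenantConfOpt:

struct GlobalConf {
    lazy_slru_download: bool,
}

struct TenantOverrides {
    lazy_slru_download: Option<bool>,
}

fn effective_lazy_slru_download(t: &TenantOverrides, g: &GlobalConf) -> bool {
    // The per-tenant value wins when present; otherwise fall back to the global.
    t.lazy_slru_download.unwrap_or(g.lazy_slru_download)
}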

View File

@@ -136,7 +136,11 @@ async fn schedule_ordered_timeline_deletions(
let mut already_running_deletions = vec![];
for (timeline_id, _) in sorted.into_iter().rev() {
if let Err(e) = DeleteTimelineFlow::run(tenant, timeline_id, true).await {
let span = tracing::info_span!("timeline_delete", %timeline_id);
let res = DeleteTimelineFlow::run(tenant, timeline_id, true)
.instrument(span)
.await;
if let Err(e) = res {
match e {
DeleteTimelineError::NotFound => {
// Timeline deletion finished after call to clone above but before call

View File

@@ -51,7 +51,10 @@ use crate::keyspace::KeyPartitioning;
use crate::repository::Key;
use crate::tenant::storage_layer::InMemoryLayer;
use anyhow::Result;
use std::collections::VecDeque;
use pageserver_api::keyspace::KeySpaceAccum;
use std::cmp::Ordering;
use std::collections::{BTreeMap, VecDeque};
use std::iter::Peekable;
use std::ops::Range;
use std::sync::Arc;
use utils::lsn::Lsn;
@@ -144,11 +147,221 @@ impl Drop for BatchedUpdates<'_> {
}
/// Return value of LayerMap::search
#[derive(Eq, PartialEq, Debug)]
pub struct SearchResult {
pub layer: Arc<PersistentLayerDesc>,
pub lsn_floor: Lsn,
}
pub struct OrderedSearchResult(SearchResult);
impl Ord for OrderedSearchResult {
fn cmp(&self, other: &Self) -> Ordering {
self.0.lsn_floor.cmp(&other.0.lsn_floor)
}
}
impl PartialOrd for OrderedSearchResult {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for OrderedSearchResult {
fn eq(&self, other: &Self) -> bool {
self.0.lsn_floor == other.0.lsn_floor
}
}
impl Eq for OrderedSearchResult {}
pub struct RangeSearchResult {
pub found: BTreeMap<OrderedSearchResult, KeySpaceAccum>,
pub not_found: KeySpaceAccum,
}
impl RangeSearchResult {
fn new() -> Self {
Self {
found: BTreeMap::new(),
not_found: KeySpaceAccum::new(),
}
}
}
/// Collector for results of range search queries on the LayerMap.
/// It should be provided with two iterators for the delta and image coverage
/// that contain all the changes for layers which intersect the range.
struct RangeSearchCollector<Iter>
where
Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
{
delta_coverage: Peekable<Iter>,
image_coverage: Peekable<Iter>,
key_range: Range<Key>,
end_lsn: Lsn,
current_delta: Option<Arc<PersistentLayerDesc>>,
current_image: Option<Arc<PersistentLayerDesc>>,
result: RangeSearchResult,
}
#[derive(Debug)]
enum NextLayerType {
Delta(i128),
Image(i128),
Both(i128),
}
impl NextLayerType {
fn next_change_at_key(&self) -> Key {
match self {
NextLayerType::Delta(at) => Key::from_i128(*at),
NextLayerType::Image(at) => Key::from_i128(*at),
NextLayerType::Both(at) => Key::from_i128(*at),
}
}
}
impl<Iter> RangeSearchCollector<Iter>
where
Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
{
fn new(
key_range: Range<Key>,
end_lsn: Lsn,
delta_coverage: Iter,
image_coverage: Iter,
) -> Self {
Self {
delta_coverage: delta_coverage.peekable(),
image_coverage: image_coverage.peekable(),
key_range,
end_lsn,
current_delta: None,
current_image: None,
result: RangeSearchResult::new(),
}
}
/// Run the collector. Collection is implemented via a two-pointer algorithm.
/// One pointer tracks the start of the current range and the other tracks
/// the beginning of the next range, which begins at the next change
/// in coverage across both image and delta.
fn collect(mut self) -> RangeSearchResult {
let next_layer_type = self.choose_next_layer_type();
let mut current_range_start = match next_layer_type {
None => {
// No changes for the range
self.pad_range(self.key_range.clone());
return self.result;
}
Some(layer_type) if self.key_range.end <= layer_type.next_change_at_key() => {
// Changes only after the end of the range
self.pad_range(self.key_range.clone());
return self.result;
}
Some(layer_type) => {
// Changes for the range exist. Record anything before the first
// coverage change as not found.
let coverage_start = layer_type.next_change_at_key();
let range_before = self.key_range.start..coverage_start;
self.pad_range(range_before);
self.advance(&layer_type);
coverage_start
}
};
while current_range_start < self.key_range.end {
let next_layer_type = self.choose_next_layer_type();
match next_layer_type {
Some(t) => {
let current_range_end = t.next_change_at_key();
self.add_range(current_range_start..current_range_end);
current_range_start = current_range_end;
self.advance(&t);
}
None => {
self.add_range(current_range_start..self.key_range.end);
current_range_start = self.key_range.end;
}
}
}
self.result
}
/// Mark a range as not found (i.e. no layers intersect it)
fn pad_range(&mut self, key_range: Range<Key>) {
if !key_range.is_empty() {
self.result.not_found.add_range(key_range);
}
}
/// Select the appropriate layer for the given range and update
/// the collector.
fn add_range(&mut self, covered_range: Range<Key>) {
let selected = LayerMap::select_layer(
self.current_delta.clone(),
self.current_image.clone(),
self.end_lsn,
);
match selected {
Some(search_result) => self
.result
.found
.entry(OrderedSearchResult(search_result))
.or_default()
.add_range(covered_range),
None => self.pad_range(covered_range),
}
}
/// Move to the next coverage change.
fn advance(&mut self, layer_type: &NextLayerType) {
match layer_type {
NextLayerType::Delta(_) => {
let (_, layer) = self.delta_coverage.next().unwrap();
self.current_delta = layer;
}
NextLayerType::Image(_) => {
let (_, layer) = self.image_coverage.next().unwrap();
self.current_image = layer;
}
NextLayerType::Both(_) => {
let (_, image_layer) = self.image_coverage.next().unwrap();
let (_, delta_layer) = self.delta_coverage.next().unwrap();
self.current_image = image_layer;
self.current_delta = delta_layer;
}
}
}
/// Pick the next coverage change: the one at the lesser key, or both if they're aligned.
fn choose_next_layer_type(&mut self) -> Option<NextLayerType> {
let next_delta_at = self.delta_coverage.peek().map(|(key, _)| key);
let next_image_at = self.image_coverage.peek().map(|(key, _)| key);
match (next_delta_at, next_image_at) {
(None, None) => None,
(Some(next_delta_at), None) => Some(NextLayerType::Delta(*next_delta_at)),
(None, Some(next_image_at)) => Some(NextLayerType::Image(*next_image_at)),
(Some(next_delta_at), Some(next_image_at)) if next_image_at < next_delta_at => {
Some(NextLayerType::Image(*next_image_at))
}
(Some(next_delta_at), Some(next_image_at)) if next_delta_at < next_image_at => {
Some(NextLayerType::Delta(*next_delta_at))
}
(Some(next_delta_at), Some(_)) => Some(NextLayerType::Both(*next_delta_at)),
}
}
}
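A standalone model of the two-pointer sweep implemented by `collect`, under simplifying assumptions: the coverage iterators are replaced by sorted slices of `(key, layer id)` changes whose keys fall within the queried range, and the output pairs each homogeneous sub-range with the (delta, image) coverage in effect there; `(None, None)` plays the role of `pad_range`. All names and types here are illustrative, not the real collector.

fn sweep(
    range: std::ops::Range<i128>,
    delta_changes: &[(i128, Option<u32>)],
    image_changes: &[(i128, Option<u32>)],
) -> Vec<(std::ops::Range<i128>, (Option<u32>, Option<u32>))> {
    let mut delta = delta_changes.iter().peekable();
    let mut image = image_changes.iter().peekable();
    let (mut cur_delta, mut cur_image): (Option<u32>, Option<u32>) = (None, None);
    let mut out = Vec::new();
    let mut start = range.start;
    loop {
        // The next coverage change across both lists, if any.
        let next = match (delta.peek(), image.peek()) {
            (None, None) => None,
            (Some((d, _)), None) => Some(*d),
            (None, Some((i, _))) => Some(*i),
            (Some((d, _)), Some((i, _))) => Some((*d).min(*i)),
        };
        let end = next.filter(|k| *k < range.end).unwrap_or(range.end);
        if start < end {
            // One homogeneous sub-range: coverage is constant on start..end.
            out.push((start..end, (cur_delta, cur_image)));
        }
        if end == range.end {
            return out;
        }
        // Advance whichever side changes at `end` (possibly both, the
        // equivalent of NextLayerType::Both).
        if delta.peek().map_or(false, |(k, _)| *k == end) {
            cur_delta = delta.next().unwrap().1;
        }
        if image.peek().map_or(false, |(k, _)| *k == end) {
            cur_image = image.next().unwrap().1;
        }
        start = end;
    }
}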
impl LayerMap {
///
/// Find the latest layer (by lsn.end) that covers the given
@@ -186,7 +399,18 @@ impl LayerMap {
let latest_delta = version.delta_coverage.query(key.to_i128());
let latest_image = version.image_coverage.query(key.to_i128());
match (latest_delta, latest_image) {
Self::select_layer(latest_delta, latest_image, end_lsn)
}
fn select_layer(
delta_layer: Option<Arc<PersistentLayerDesc>>,
image_layer: Option<Arc<PersistentLayerDesc>>,
end_lsn: Lsn,
) -> Option<SearchResult> {
assert!(delta_layer.as_ref().map_or(true, |l| l.is_delta()));
assert!(image_layer.as_ref().map_or(true, |l| !l.is_delta()));
match (delta_layer, image_layer) {
(None, None) => None,
(None, Some(image)) => {
let lsn_floor = image.get_lsn_range().start;
@@ -223,6 +447,17 @@ impl LayerMap {
}
}
pub fn range_search(&self, key_range: Range<Key>, end_lsn: Lsn) -> Option<RangeSearchResult> {
let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
let raw_range = key_range.start.to_i128()..key_range.end.to_i128();
let delta_changes = version.delta_coverage.range_overlaps(&raw_range);
let image_changes = version.image_coverage.range_overlaps(&raw_range);
let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes);
Some(collector.collect())
}
/// Start a batch of updates, applied on drop
pub fn batch_update(&mut self) -> BatchedUpdates<'_> {
BatchedUpdates { layer_map: self }
@@ -631,3 +866,126 @@ impl LayerMap {
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
#[derive(Clone)]
struct LayerDesc {
key_range: Range<Key>,
lsn_range: Range<Lsn>,
is_delta: bool,
}
fn create_layer_map(layers: Vec<LayerDesc>) -> LayerMap {
let mut layer_map = LayerMap::default();
for layer in layers {
layer_map.insert_historic_noflush(PersistentLayerDesc::new_test(
layer.key_range,
layer.lsn_range,
layer.is_delta,
));
}
layer_map.flush_updates();
layer_map
}
fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) {
assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace());
let lhs: Vec<_> = lhs
.found
.into_iter()
.map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
.collect();
let rhs: Vec<_> = rhs
.found
.into_iter()
.map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
.collect();
assert_eq!(lhs, rhs);
}
fn brute_force_range_search(
layer_map: &LayerMap,
key_range: Range<Key>,
end_lsn: Lsn,
) -> RangeSearchResult {
let mut range_search_result = RangeSearchResult::new();
let mut key = key_range.start;
while key != key_range.end {
let res = layer_map.search(key, end_lsn);
match res {
Some(res) => {
range_search_result
.found
.entry(OrderedSearchResult(res))
.or_default()
.add_key(key);
}
None => {
range_search_result.not_found.add_key(key);
}
}
key = key.next();
}
range_search_result
}
#[test]
fn ranged_search_on_empty_layer_map() {
let layer_map = LayerMap::default();
let range = Key::from_i128(100)..Key::from_i128(200);
let res = layer_map.range_search(range, Lsn(100));
assert!(res.is_none());
}
#[test]
fn ranged_search() {
let layers = vec![
LayerDesc {
key_range: Key::from_i128(15)..Key::from_i128(50),
lsn_range: Lsn(0)..Lsn(5),
is_delta: false,
},
LayerDesc {
key_range: Key::from_i128(10)..Key::from_i128(20),
lsn_range: Lsn(5)..Lsn(20),
is_delta: true,
},
LayerDesc {
key_range: Key::from_i128(15)..Key::from_i128(25),
lsn_range: Lsn(20)..Lsn(30),
is_delta: true,
},
LayerDesc {
key_range: Key::from_i128(35)..Key::from_i128(40),
lsn_range: Lsn(25)..Lsn(35),
is_delta: true,
},
LayerDesc {
key_range: Key::from_i128(35)..Key::from_i128(40),
lsn_range: Lsn(35)..Lsn(40),
is_delta: false,
},
];
let layer_map = create_layer_map(layers.clone());
for start in 0..60 {
for end in (start + 1)..60 {
let range = Key::from_i128(start)..Key::from_i128(end);
let result = layer_map.range_search(range.clone(), Lsn(100)).unwrap();
let expected = brute_force_range_search(&layer_map, range, Lsn(100));
assert_range_search_result_eq(result, expected);
}
}
}
}

View File

@@ -129,6 +129,42 @@ impl<Value: Clone> LayerCoverage<Value> {
.map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone())))
}
/// Returns an iterator which includes all coverage changes for layers that intersect
/// with the provided range.
pub fn range_overlaps(
&self,
key_range: &Range<i128>,
) -> impl Iterator<Item = (i128, Option<Value>)> + '_
where
Value: Eq,
{
let first_change = self.query(key_range.start);
match first_change {
Some(change) => {
// If the start of the range is covered, we have to deal with two cases:
// 1. Start of the range is aligned with the start of a layer.
// In this case the return of `self.range` will contain the layer which aligns with the start of the key range.
// We advance said iterator to avoid duplicating the first change.
// 2. Start of the range is not aligned with the start of a layer.
// In this case `self.range` will not include that layer, so the point-query result is prepended below.
let range = key_range.start..key_range.end;
let mut range_coverage = self.range(range).peekable();
if range_coverage
.peek()
.is_some_and(|c| c.1.as_ref() == Some(&change))
{
range_coverage.next();
}
itertools::Either::Left(
std::iter::once((key_range.start, Some(change))).chain(range_coverage),
)
}
None => {
let range = key_range.start..key_range.end;
let coverage = self.range(range);
itertools::Either::Right(coverage)
}
}
}
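An illustrative model of the two cases handled above, using a plain BTreeMap from change key to the value in effect from that key onward, instead of the real coverage structure (names and types here are not from the codebase):

use std::collections::BTreeMap;

fn range_overlaps_model(
    cov: &BTreeMap<i128, Option<u32>>,
    range: std::ops::Range<i128>,
) -> Vec<(i128, Option<u32>)> {
    let start = range.start;
    // The layer in effect at `start`: the last change at or before it.
    let first = cov.range(..=start).next_back().and_then(|(_, v)| *v);
    let mut out = Vec::new();
    if first.is_some() {
        // Case 2: prepend the point-query result so the caller sees the
        // layer already covering the start of the range.
        out.push((start, first));
    }
    for (k, v) in cov.range(range) {
        // Case 1: a change exactly at `start` was already emitted above.
        if first.is_some() && *k == start {
            continue;
        }
        out.push((*k, *v));
    }
    out
}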
/// O(1) clone
pub fn clone(&self) -> Self {
Self {

View File

@@ -898,6 +898,17 @@ impl TenantManager {
}
}
/// Whether the `TenantManager` is responsible for the tenant shard
pub(crate) fn manages_tenant_shard(&self, tenant_shard_id: TenantShardId) -> bool {
let locked = self.tenants.read().unwrap();
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
.ok()
.flatten();
peek_slot.is_some()
}
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
pub(crate) async fn upsert_location(
&self,
@@ -1311,6 +1322,7 @@ impl TenantManager {
tenant_shard_id: TenantShardId,
activation_timeout: Duration,
) -> Result<(), DeleteTenantError> {
super::span::debug_assert_current_span_has_tenant_id();
// We acquire a SlotGuard during this function to protect against concurrent
// changes while the ::prepare phase of DeleteTenantFlow executes, but then
// have to return the Tenant to the map while the background deletion runs.

View File

@@ -1719,6 +1719,11 @@ pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
RemotePath::from_string(&path).expect("Failed to construct path")
}
fn remote_timelines_path_unsharded(tenant_id: &TenantId) -> RemotePath {
let path = format!("tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}");
RemotePath::from_string(&path).expect("Failed to construct path")
}
pub fn remote_timeline_path(
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,

View File

@@ -5,9 +5,11 @@ use camino::Utf8Path;
use fail::fail_point;
use pageserver_api::shard::TenantShardId;
use std::io::{ErrorKind, SeekFrom};
use std::time::SystemTime;
use tokio::fs::{self, File};
use tokio::io::AsyncSeekExt;
use tokio_util::sync::CancellationToken;
use utils::backoff;
use super::Generation;
use crate::{
@@ -17,7 +19,7 @@ use crate::{
remote_initdb_preserved_archive_path, remote_path, upload_cancellable,
},
};
use remote_storage::GenericRemoteStorage;
use remote_storage::{GenericRemoteStorage, TimeTravelError};
use utils::id::{TenantId, TimelineId};
use super::index::LayerFileMetadata;
@@ -157,3 +159,45 @@ pub(crate) async fn preserve_initdb_archive(
.await
.with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'"))
}
pub(crate) async fn time_travel_recover_tenant(
storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
timestamp: SystemTime,
done_if_after: SystemTime,
cancel: &CancellationToken,
) -> Result<(), TimeTravelError> {
let warn_after = 3;
let max_attempts = 10;
let mut prefixes = Vec::with_capacity(2);
if tenant_shard_id.is_zero() {
// Also recover the unsharded prefix for shard zero:
// - if the tenant is totally unsharded, the unsharded prefix contains all the data
// - if the tenant is sharded, we still want to recover the initdb data, but we only
// want to do it once, so let's do it on the 0 shard
let timelines_path_unsharded =
super::remote_timelines_path_unsharded(&tenant_shard_id.tenant_id);
prefixes.push(timelines_path_unsharded);
}
if !tenant_shard_id.is_unsharded() {
// If the tenant is sharded, we need to recover the sharded prefix
let timelines_path = super::remote_timelines_path(tenant_shard_id);
prefixes.push(timelines_path);
}
for prefix in &prefixes {
backoff::retry(
|| async {
storage
.time_travel_recover(Some(prefix), timestamp, done_if_after, cancel.clone())
.await
},
|e| !matches!(e, TimeTravelError::Other(_)),
warn_after,
max_attempts,
"time travel recovery of tenant prefix",
backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
)
.await?;
}
Ok(())
}
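For reference, the prefix selection above resolves as follows (the shard-suffixed path format is an assumption for illustration):

// unsharded tenant            -> ["tenants/<tenant_id>/timelines"]
// shard 0 of a sharded tenant -> ["tenants/<tenant_id>/timelines",  // initdb data
//                                 "tenants/<tenant_id>-<shard>/timelines"]
// shard N > 0                 -> ["tenants/<tenant_id>-<shard>/timelines"]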

View File

@@ -112,7 +112,7 @@ impl SecondaryTenant {
// on shutdown we walk the tenants and fire their
// individual cancellations?
cancel: CancellationToken::new(),
gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")),
gate: Gate::default(),
shard_identity,
tenant_conf: std::sync::Mutex::new(tenant_conf),

View File

@@ -884,7 +884,7 @@ impl DeltaLayerInner {
let keys = self.load_keys(ctx).await?;
async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
let val = Value::des(&buf)?;
let desc = match val {
@@ -906,13 +906,32 @@ impl DeltaLayerInner {
for entry in keys {
let DeltaEntry { key, lsn, val, .. } = entry;
let desc = match dump_blob(val, ctx).await {
let desc = match dump_blob(&val, ctx).await {
Ok(desc) => desc,
Err(err) => {
format!("ERROR: {err}")
}
};
println!(" key {key} at {lsn}: {desc}");
// Print more details about CHECKPOINT records. Would be nice to print details
// of many other record types too, but these are particularly interesting, as
// walingest.rs has a lot of special processing for them.
use pageserver_api::key::CHECKPOINT_KEY;
use postgres_ffi::CheckPoint;
if key == CHECKPOINT_KEY {
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
let val = Value::des(&buf)?;
match val {
Value::Image(img) => {
let checkpoint = CheckPoint::decode(&img)?;
println!(" CHECKPOINT: {:?}", checkpoint);
}
Value::WalRecord(_rec) => {
println!(" unexpected walrecord value for checkpoint key");
}
}
}
}
Ok(())

View File

@@ -55,13 +55,13 @@ impl PersistentLayerDesc {
}
#[cfg(test)]
pub fn new_test(key_range: Range<Key>) -> Self {
pub fn new_test(key_range: Range<Key>, lsn_range: Range<Lsn>, is_delta: bool) -> Self {
Self {
tenant_shard_id: TenantShardId::unsharded(TenantId::generate()),
timeline_id: TimelineId::generate(),
key_range,
lsn_range: Lsn(0)..Lsn(1),
is_delta: false,
lsn_range,
is_delta,
file_size: 0,
}
}

View File

@@ -124,7 +124,7 @@ pub(super) enum FlushLoopState {
/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Hole {
pub(crate) struct Hole {
key_range: Range<Key>,
coverage_size: usize,
}
@@ -457,6 +457,21 @@ pub(crate) enum GetVectoredError {
InvalidLsn(Lsn),
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum GetReadyAncestorError {
#[error("ancestor timeline {0} is being stopped")]
AncestorStopping(TimelineId),
#[error("Ancestor LSN wait error: {0}")]
AncestorLsnTimeout(#[from] WaitLsnError),
#[error("Cancelled")]
Cancelled,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
#[derive(Clone, Copy)]
pub enum LogicalSizeCalculationCause {
Initial,
@@ -535,22 +550,34 @@ impl From<GetVectoredError> for CreateImageLayersError {
}
}
impl From<GetReadyAncestorError> for PageReconstructError {
fn from(e: GetReadyAncestorError) -> Self {
use GetReadyAncestorError::*;
match e {
AncestorStopping(tid) => PageReconstructError::AncestorStopping(tid),
AncestorLsnTimeout(wait_err) => PageReconstructError::AncestorLsnTimeout(wait_err),
Cancelled => PageReconstructError::Cancelled,
Other(other) => PageReconstructError::Other(other),
}
}
}
/// Public interface functions
impl Timeline {
/// Get the LSN where this branch was created
pub fn get_ancestor_lsn(&self) -> Lsn {
pub(crate) fn get_ancestor_lsn(&self) -> Lsn {
self.ancestor_lsn
}
/// Get the ancestor's timeline id
pub fn get_ancestor_timeline_id(&self) -> Option<TimelineId> {
pub(crate) fn get_ancestor_timeline_id(&self) -> Option<TimelineId> {
self.ancestor_timeline
.as_ref()
.map(|ancestor| ancestor.timeline_id)
}
/// Lock and get timeline's GC cutoff
pub fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
self.latest_gc_cutoff_lsn.read()
}
@@ -706,27 +733,27 @@ impl Timeline {
}
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
pub fn get_last_record_lsn(&self) -> Lsn {
pub(crate) fn get_last_record_lsn(&self) -> Lsn {
self.last_record_lsn.load().last
}
pub fn get_prev_record_lsn(&self) -> Lsn {
pub(crate) fn get_prev_record_lsn(&self) -> Lsn {
self.last_record_lsn.load().prev
}
/// Atomically get both last and prev.
pub fn get_last_record_rlsn(&self) -> RecordLsn {
pub(crate) fn get_last_record_rlsn(&self) -> RecordLsn {
self.last_record_lsn.load()
}
pub fn get_disk_consistent_lsn(&self) -> Lsn {
pub(crate) fn get_disk_consistent_lsn(&self) -> Lsn {
self.disk_consistent_lsn.load()
}
/// remote_consistent_lsn from the perspective of the tenant's current generation,
/// not validated with control plane yet.
/// See [`Self::get_remote_consistent_lsn_visible`].
pub fn get_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
pub(crate) fn get_remote_consistent_lsn_projected(&self) -> Option<Lsn> {
if let Some(remote_client) = &self.remote_client {
remote_client.remote_consistent_lsn_projected()
} else {
@@ -737,7 +764,7 @@ impl Timeline {
/// remote_consistent_lsn which the tenant is guaranteed not to go backward from,
/// i.e. a value of remote_consistent_lsn_projected which has undergone
/// generation validation in the deletion queue.
pub fn get_remote_consistent_lsn_visible(&self) -> Option<Lsn> {
pub(crate) fn get_remote_consistent_lsn_visible(&self) -> Option<Lsn> {
if let Some(remote_client) = &self.remote_client {
remote_client.remote_consistent_lsn_visible()
} else {
@@ -748,7 +775,7 @@ impl Timeline {
/// The sum of the file size of all historic layers in the layer map.
/// This method makes no distinction between local and remote layers.
/// Hence, the result **does not represent local filesystem usage**.
pub async fn layer_size_sum(&self) -> u64 {
pub(crate) async fn layer_size_sum(&self) -> u64 {
let guard = self.layers.read().await;
let layer_map = guard.layer_map();
let mut size = 0;
@@ -758,7 +785,7 @@ impl Timeline {
size
}
pub fn resident_physical_size(&self) -> u64 {
pub(crate) fn resident_physical_size(&self) -> u64 {
self.metrics.resident_physical_size_get()
}
@@ -834,7 +861,7 @@ impl Timeline {
}
/// Check that it is valid to request operations with that lsn.
pub fn check_lsn_is_in_scope(
pub(crate) fn check_lsn_is_in_scope(
&self,
lsn: Lsn,
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
@@ -850,7 +877,7 @@ impl Timeline {
/// Flush to disk all data that was written with the put_* functions
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
pub async fn freeze_and_flush(&self) -> anyhow::Result<()> {
pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> {
self.freeze_inmem_layer(false).await;
self.flush_frozen_layers_and_wait().await
}
@@ -994,7 +1021,7 @@ impl Timeline {
}
/// Mutate the timeline with a [`TimelineWriter`].
pub async fn writer(&self) -> TimelineWriter<'_> {
pub(crate) async fn writer(&self) -> TimelineWriter<'_> {
TimelineWriter {
tl: self,
_write_guard: self.write_lock.lock().await,
@@ -1006,7 +1033,7 @@ impl Timeline {
///
/// Also flush after a period of time without new data -- it helps
/// safekeepers to regard pageserver as caught up and suspend activity.
pub async fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
pub(crate) async fn check_checkpoint_distance(self: &Arc<Timeline>) -> anyhow::Result<()> {
let last_lsn = self.get_last_record_lsn();
let open_layer_size = {
let guard = self.layers.read().await;
@@ -1044,13 +1071,16 @@ impl Timeline {
Ok(())
}
pub fn activate(
pub(crate) fn activate(
self: &Arc<Self>,
broker_client: BrokerClientChannel,
background_jobs_can_start: Option<&completion::Barrier>,
ctx: &RequestContext,
) {
self.spawn_initial_logical_size_computation_task(ctx);
if self.tenant_shard_id.is_zero() {
// Logical size is only maintained accurately on shard zero.
self.spawn_initial_logical_size_computation_task(ctx);
}
self.launch_wal_receiver(ctx, broker_client);
self.set_state(TimelineState::Active);
self.launch_eviction_task(background_jobs_can_start);
@@ -1060,7 +1090,6 @@ impl Timeline {
/// also to remote storage. This method can easily take multiple seconds for a busy timeline.
///
/// While we are flushing, we continue to accept read I/O.
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
pub(crate) async fn flush_and_shutdown(&self) {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -1109,6 +1138,8 @@ impl Timeline {
/// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of
/// the graceful [`Timeline::flush_and_shutdown`] function.
pub(crate) async fn shutdown(&self) {
span::debug_assert_current_span_has_tenant_and_timeline_id();
// Signal any subscribers to our cancellation token to drop out
tracing::debug!("Cancelling CancellationToken");
self.cancel.cancel();
@@ -1144,7 +1175,7 @@ impl Timeline {
self.gate.close().await;
}
pub fn set_state(&self, new_state: TimelineState) {
pub(crate) fn set_state(&self, new_state: TimelineState) {
match (self.current_state(), new_state) {
(equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
info!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
@@ -1164,7 +1195,7 @@ impl Timeline {
}
}
pub fn set_broken(&self, reason: String) {
pub(crate) fn set_broken(&self, reason: String) {
let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture());
let broken_state = TimelineState::Broken {
reason,
@@ -1178,27 +1209,27 @@ impl Timeline {
self.cancel.cancel();
}
pub fn current_state(&self) -> TimelineState {
pub(crate) fn current_state(&self) -> TimelineState {
self.state.borrow().clone()
}
pub fn is_broken(&self) -> bool {
pub(crate) fn is_broken(&self) -> bool {
matches!(&*self.state.borrow(), TimelineState::Broken { .. })
}
pub fn is_active(&self) -> bool {
pub(crate) fn is_active(&self) -> bool {
self.current_state() == TimelineState::Active
}
pub fn is_stopping(&self) -> bool {
pub(crate) fn is_stopping(&self) -> bool {
self.current_state() == TimelineState::Stopping
}
pub fn subscribe_for_state_updates(&self) -> watch::Receiver<TimelineState> {
pub(crate) fn subscribe_for_state_updates(&self) -> watch::Receiver<TimelineState> {
self.state.subscribe()
}
pub async fn wait_to_become_active(
pub(crate) async fn wait_to_become_active(
&self,
_ctx: &RequestContext, // Prepare for use by cancellation
) -> Result<(), TimelineState> {
@@ -1223,7 +1254,7 @@ impl Timeline {
}
}
pub async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
pub(crate) async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
let guard = self.layers.read().await;
let layer_map = guard.layer_map();
let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
@@ -1247,7 +1278,10 @@ impl Timeline {
}
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
pub(crate) async fn download_layer(
&self,
layer_file_name: &str,
) -> anyhow::Result<Option<bool>> {
let Some(layer) = self.find_layer(layer_file_name).await else {
return Ok(None);
};
@@ -1264,7 +1298,7 @@ impl Timeline {
/// Evict just one layer.
///
/// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`.
pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
pub(crate) async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
let _gate = self
.gate
.enter()
@@ -1287,6 +1321,13 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
// Private functions
impl Timeline {
pub(crate) fn get_lazy_slru_download(&self) -> bool {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
tenant_conf
.lazy_slru_download
.unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
}
fn get_checkpoint_distance(&self) -> u64 {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
tenant_conf
@@ -1495,7 +1536,7 @@ impl Timeline {
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),
cancel,
gate: Gate::new(format!("Timeline<{tenant_shard_id}/{timeline_id}>")),
gate: Gate::default(),
compaction_lock: tokio::sync::Mutex::default(),
gc_lock: tokio::sync::Mutex::default(),
@@ -1817,6 +1858,12 @@ impl Timeline {
priority: GetLogicalSizePriority,
ctx: &RequestContext,
) -> logical_size::CurrentLogicalSize {
if !self.tenant_shard_id.is_zero() {
// Logical size is only accurately maintained on shard zero: when called on another shard,
// for example when the HTTP API is serving a GET for this timeline, return zero
return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero());
}
let current_size = self.current_logical_size.current_size();
debug!("Current size: {current_size:?}");
@@ -2059,7 +2106,7 @@ impl Timeline {
.expect("only this task sets it");
}
pub fn spawn_ondemand_logical_size_calculation(
pub(crate) fn spawn_ondemand_logical_size_calculation(
self: &Arc<Self>,
lsn: Lsn,
cause: LogicalSizeCalculationCause,
@@ -2105,6 +2152,9 @@ impl Timeline {
ctx: &RequestContext,
) -> Result<u64, CalculateLogicalSizeError> {
span::debug_assert_current_span_has_tenant_and_timeline_id();
// We should never be calculating logical sizes on shard !=0, because these shards do not have
// accurate relation sizes, and they do not emit consumption metrics.
debug_assert!(self.tenant_shard_id.is_zero());
let _guard = self.gate.enter();
@@ -2138,7 +2188,7 @@ impl Timeline {
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub async fn calculate_logical_size(
async fn calculate_logical_size(
&self,
up_to_lsn: Lsn,
cause: LogicalSizeCalculationCause,
@@ -2392,60 +2442,8 @@ impl Timeline {
timeline.ancestor_lsn,
cont_lsn
);
let ancestor = match timeline.get_ancestor_timeline() {
Ok(timeline) => timeline,
Err(e) => return Err(PageReconstructError::from(e)),
};
// It's possible that the ancestor timeline isn't active yet, or
// is active but hasn't yet caught up to the branch point. Wait
// for it.
//
// This cannot happen while the pageserver is running normally,
// because you cannot create a branch from a point that isn't
// present in the pageserver yet. However, we don't wait for the
// branch point to be uploaded to cloud storage before creating
// a branch. I.e., the branch LSN need not be remote consistent
// for the branching operation to succeed.
//
// Hence, if we try to load a tenant in such a state where
// 1. the existence of the branch was persisted (in IndexPart and/or locally)
// 2. but the ancestor state is behind branch_lsn because it was not yet persisted
// then we will need to wait for the ancestor timeline to
// re-stream WAL up to branch_lsn before we access it.
//
// How can a tenant get in such a state?
// - ungraceful pageserver process exit
// - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219
//
// NB: this could be avoided by requiring
// branch_lsn >= remote_consistent_lsn
// during branch creation.
match ancestor.wait_to_become_active(ctx).await {
Ok(()) => {}
Err(TimelineState::Stopping) => {
return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id));
}
Err(state) => {
return Err(PageReconstructError::Other(anyhow::anyhow!(
"Timeline {} will not become active. Current state: {:?}",
ancestor.timeline_id,
&state,
)));
}
}
ancestor
.wait_lsn(timeline.ancestor_lsn, ctx)
.await
.map_err(|e| match e {
e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e),
WaitLsnError::Shutdown => PageReconstructError::Cancelled,
e @ WaitLsnError::BadState => {
PageReconstructError::Other(anyhow::anyhow!(e))
}
})?;
timeline_owned = ancestor;
timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?;
timeline = &*timeline_owned;
prev_lsn = Lsn(u64::MAX);
continue 'outer;
@@ -2575,6 +2573,66 @@ impl Timeline {
Some((lsn, img))
}
async fn get_ready_ancestor_timeline(
&self,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, GetReadyAncestorError> {
let ancestor = match self.get_ancestor_timeline() {
Ok(timeline) => timeline,
Err(e) => return Err(GetReadyAncestorError::from(e)),
};
// It's possible that the ancestor timeline isn't active yet, or
// is active but hasn't yet caught up to the branch point. Wait
// for it.
//
// This cannot happen while the pageserver is running normally,
// because you cannot create a branch from a point that isn't
// present in the pageserver yet. However, we don't wait for the
// branch point to be uploaded to cloud storage before creating
// a branch. I.e., the branch LSN need not be remote consistent
// for the branching operation to succeed.
//
// Hence, if we try to load a tenant in such a state where
// 1. the existence of the branch was persisted (in IndexPart and/or locally)
// 2. but the ancestor state is behind branch_lsn because it was not yet persisted
// then we will need to wait for the ancestor timeline to
// re-stream WAL up to branch_lsn before we access it.
//
// How can a tenant get in such a state?
// - ungraceful pageserver process exit
// - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219
//
// NB: this could be avoided by requiring
// branch_lsn >= remote_consistent_lsn
// during branch creation.
match ancestor.wait_to_become_active(ctx).await {
Ok(()) => {}
Err(TimelineState::Stopping) => {
return Err(GetReadyAncestorError::AncestorStopping(
ancestor.timeline_id,
));
}
Err(state) => {
return Err(GetReadyAncestorError::Other(anyhow::anyhow!(
"Timeline {} will not become active. Current state: {:?}",
ancestor.timeline_id,
&state,
)));
}
}
ancestor
.wait_lsn(self.ancestor_lsn, ctx)
.await
.map_err(|e| match e {
e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e),
WaitLsnError::Shutdown => GetReadyAncestorError::Cancelled,
e @ WaitLsnError::BadState => GetReadyAncestorError::Other(anyhow::anyhow!(e)),
})?;
Ok(ancestor)
}
fn get_ancestor_timeline(&self) -> anyhow::Result<Arc<Timeline>> {
let ancestor = self.ancestor_timeline.as_ref().with_context(|| {
format!(
@@ -2785,12 +2843,12 @@ impl Timeline {
}
/// Flush one frozen in-memory layer to disk, as a new delta layer.
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id, layer=%frozen_layer))]
async fn flush_frozen_layer(
self: &Arc<Self>,
frozen_layer: Arc<InMemoryLayer>,
ctx: &RequestContext,
) -> Result<(), FlushLayerError> {
span::debug_assert_current_span_has_tenant_and_timeline_id();
// As a special case, when we have just imported an image into the repository,
// instead of writing out a L0 delta layer, we directly write out image layer
// files instead. This is possible as long as *all* the data imported into the
@@ -3379,7 +3437,7 @@ enum DurationRecorder {
}
impl DurationRecorder {
pub fn till_now(&self) -> DurationRecorder {
fn till_now(&self) -> DurationRecorder {
match self {
DurationRecorder::NotStarted => {
panic!("must only call on recorded measurements")
@@ -3390,7 +3448,7 @@ impl DurationRecorder {
}
}
}
pub fn into_recorded(self) -> Option<RecordedDuration> {
fn into_recorded(self) -> Option<RecordedDuration> {
match self {
DurationRecorder::NotStarted => None,
DurationRecorder::Recorded(recorded, _) => Some(recorded),
@@ -4590,7 +4648,9 @@ impl Timeline {
}
}
pub fn get_download_all_remote_layers_task_info(&self) -> Option<DownloadRemoteLayersTaskInfo> {
pub(crate) fn get_download_all_remote_layers_task_info(
&self,
) -> Option<DownloadRemoteLayersTaskInfo> {
self.download_all_remote_layers_task_info
.read()
.unwrap()
@@ -4686,7 +4746,7 @@ fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageRecon
// TODO Currently, Deref is used to allow easy access to read methods from this trait.
// This is probably considered a bad practice in Rust and should be fixed eventually,
// but will cause large code changes.
pub struct TimelineWriter<'a> {
pub(crate) struct TimelineWriter<'a> {
tl: &'a Timeline,
_write_guard: tokio::sync::MutexGuard<'a, ()>,
}
@@ -4704,7 +4764,7 @@ impl<'a> TimelineWriter<'a> {
///
/// This will implicitly extend the relation, if the page is beyond the
/// current end-of-file.
pub async fn put(
pub(crate) async fn put(
&self,
key: Key,
lsn: Lsn,

View File

@@ -356,12 +356,14 @@ impl DeleteTimelineFlow {
// NB: If this fails half-way through, and is retried, the retry will go through
// all the same steps again. Make sure the code here is idempotent, and don't
// error out if some of the shutdown tasks have already been completed!
#[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_shard_id.tenant_id, shard_id=%tenant.tenant_shard_id.shard_slug()))]
#[instrument(skip_all, fields(%inplace))]
pub async fn run(
tenant: &Arc<Tenant>,
timeline_id: TimelineId,
inplace: bool,
) -> Result<(), DeleteTimelineError> {
super::debug_assert_current_span_has_tenant_and_timeline_id();
let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
guard.mark_in_progress()?;

View File

@@ -319,6 +319,13 @@ impl Timeline {
cancel: &CancellationToken,
ctx: &RequestContext,
) -> ControlFlow<()> {
if !self.tenant_shard_id.is_zero() {
// Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size
// for consumption metrics (consumption metrics are only sent from shard 0). We may therefore
// skip imitating logical size accesses for eviction purposes.
return ControlFlow::Continue(());
}
let mut state = self.eviction_task_timeline_state.lock().await;
// Only do the imitate_layer accesses approximately as often as the threshold. A little

View File

@@ -101,6 +101,14 @@ impl From<&Exact> for u64 {
}
}
impl Approximate {
/// For use in situations where we don't have a sane logical size value but need
/// to return something, e.g. in HTTP API on shard >0 of a sharded tenant.
pub(crate) fn zero() -> Self {
Self(0)
}
}
impl CurrentLogicalSize {
pub(crate) fn size_dont_care_about_accuracy(&self) -> u64 {
match self {

View File

@@ -426,13 +426,21 @@ pub(super) async fn handle_walreceiver_connection(
// Send the replication feedback message.
// Regular standby_status_update fields are put into this message.
let current_timeline_size = timeline
.get_current_logical_size(
crate::tenant::timeline::GetLogicalSizePriority::User,
&ctx,
)
// FIXME: https://github.com/neondatabase/neon/issues/5963
.size_dont_care_about_accuracy();
let current_timeline_size = if timeline.tenant_shard_id.is_zero() {
timeline
.get_current_logical_size(
crate::tenant::timeline::GetLogicalSizePriority::User,
&ctx,
)
// FIXME: https://github.com/neondatabase/neon/issues/5963
.size_dont_care_about_accuracy()
} else {
// Non-zero shards send zero for logical size. The safekeeper will ignore
// this number. This is because in a sharded tenant, only shard zero maintains
// accurate logical size.
0
};
let status_update = PageserverFeedback {
current_timeline_size,
last_received_lsn,

View File

@@ -1363,16 +1363,22 @@ impl WalIngest {
self.checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
self.checkpoint_modified = true;
}
let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
if mbr.xid.wrapping_sub(acc) as i32 > 0 {
mbr.xid
let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| {
if let Some(max_xid) = acc {
if mbr.xid.wrapping_sub(max_xid) as i32 > 0 {
Some(mbr.xid)
} else {
acc
}
} else {
acc
Some(mbr.xid)
}
});
if self.checkpoint.update_next_xid(max_mbr_xid) {
self.checkpoint_modified = true;
if let Some(max_xid) = max_mbr_xid {
if self.checkpoint.update_next_xid(max_xid) {
self.checkpoint_modified = true;
}
}
Ok(())
}
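The `wrapping_sub ... as i32 > 0` comparison above is how 32-bit transaction ids are ordered under wraparound: `a` is newer than `b` when the modular distance from `b` to `a`, reinterpreted as signed, is positive. A tiny self-contained illustration:

fn xid_newer(a: u32, b: u32) -> bool {
    a.wrapping_sub(b) as i32 > 0
}

fn main() {
    // 10 lies 26 steps past 0xFFFF_FFF0 across the wrap, so it is "newer".
    assert!(xid_newer(10, 0xFFFF_FFF0));
    assert!(!xid_newer(0xFFFF_FFF0, 10));
}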

View File

@@ -22,6 +22,7 @@ use anyhow::Context;
use byteorder::{ByteOrder, LittleEndian};
use bytes::{BufMut, Bytes, BytesMut};
use nix::poll::*;
use pageserver_api::models::WalRedoManagerStatus;
use pageserver_api::shard::TenantShardId;
use serde::Serialize;
use std::collections::VecDeque;
@@ -29,7 +30,6 @@ use std::io;
use std::io::prelude::*;
use std::ops::{Deref, DerefMut};
use std::os::unix::io::AsRawFd;
use std::os::unix::prelude::CommandExt;
use std::process::Stdio;
use std::process::{Child, ChildStdin, ChildStdout, Command};
use std::sync::{Arc, Mutex, MutexGuard, RwLock};
@@ -179,6 +179,20 @@ impl PostgresRedoManager {
)
}
}
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
Some(WalRedoManagerStatus {
last_redo_at: {
let at = *self.last_redo_at.lock().unwrap();
at.and_then(|at| {
let age = at.elapsed();
// map any chrono errors silently to None here
chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
})
},
pid: self.redo_process.read().unwrap().as_ref().map(|p| p.id()),
})
}
}
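The `last_redo_at` conversion above turns a monotonic `Instant` into an approximate wall-clock time by subtracting the elapsed age from the current chrono time, mapping any out-of-range arithmetic to `None`. Isolated as a sketch:

fn approx_wall_clock(at: std::time::Instant) -> Option<chrono::DateTime<chrono::Utc>> {
    let age = at.elapsed();
    // `from_std` fails for durations outside chrono's range; `?` maps that,
    // and any subtraction underflow, to None.
    chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
}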
impl PostgresRedoManager {
@@ -243,8 +257,7 @@ impl PostgresRedoManager {
let mut proc_guard = self.redo_process.write().unwrap();
match &*proc_guard {
None => {
let timer =
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer();
let start = Instant::now();
let proc = Arc::new(
WalRedoProcess::launch(
self.conf,
@@ -253,7 +266,14 @@ impl PostgresRedoManager {
)
.context("launch walredo process")?,
);
timer.observe_duration();
let duration = start.elapsed();
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM
.observe(duration.as_secs_f64());
info!(
duration_ms = duration.as_millis(),
pid = proc.id(),
"launched walredo process"
);
*proc_guard = Some(Arc::clone(&proc));
proc
}
@@ -607,40 +627,6 @@ impl PostgresRedoManager {
}
}
///
/// Command with ability not to give all file descriptors to child process
///
trait CloseFileDescriptors: CommandExt {
///
/// Close file descriptors (other than stdin, stdout, stderr) in child process
///
fn close_fds(&mut self) -> &mut Command;
}
impl<C: CommandExt> CloseFileDescriptors for C {
fn close_fds(&mut self) -> &mut Command {
// SAFETY: Code executed inside pre_exec should have async-signal-safety,
// which means it should be safe to execute inside a signal handler.
// The precise meaning depends on platform. See `man signal-safety`
// for the linux definition.
//
// The set_fds_cloexec_threadsafe function is documented to be
// async-signal-safe.
//
// Aside from this function, the rest of the code is re-entrant and
// doesn't make any syscalls. We're just passing constants.
//
// NOTE: It's easy to indirectly cause a malloc or lock a mutex,
// which is not async-signal-safe. Be careful.
unsafe {
self.pre_exec(move || {
close_fds::set_fds_cloexec_threadsafe(3, &[]);
Ok(())
})
}
}
}
struct WalRedoProcess {
#[allow(dead_code)]
conf: &'static PageServerConf,
@@ -669,23 +655,25 @@ impl WalRedoProcess {
// Start postgres itself
let child = Command::new(pg_bin_dir_path.join("postgres"))
// the first arg must be --wal-redo so the child process enters into walredo mode
.arg("--wal-redo")
// the child doesn't process this arg, but having it in the argv helps identify the
// walredo process for a particular tenant when debugging a pageserver
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
.stdin(Stdio::piped())
.stderr(Stdio::piped())
.stdout(Stdio::piped())
.env_clear()
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
// The redo process is not trusted, and runs in seccomp mode that
// doesn't allow it to open any files. We have to also make sure it
// doesn't inherit any file descriptors from the pageserver, that
// would allow an attacker to read any files that happen to be open
// in the pageserver.
//
// The Rust standard library makes sure to mark any file descriptors with
// as close-on-exec by default, but that's not enough, since we use
// libraries that directly call libc open without setting that flag.
.close_fds()
// NB: The redo process is not trusted after we sent it the first
// walredo work. Before that, it is trusted. Specifically, we trust
// it to
// 1. close all file descriptors except stdin, stdout, stderr because
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
// the files it opens, and
// 2. use seccomp to sandbox itself before processing the first
// walredo request.
.spawn_no_leak_child(tenant_shard_id)
.context("spawn process")?;
WAL_REDO_PROCESS_COUNTERS.started.inc();

patches/pgvector.patch (new file, +60 lines)
View File

@@ -0,0 +1,60 @@
From de3dd0cd034d2bcc12b456171ce163bdc1f4cb65 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Thu, 1 Feb 2024 17:42:31 +0200
Subject: [PATCH 1/1] Make v0.6.0 work with Neon
Now that the WAL-logging happens as a separate step at the end of the
build, we need a few neon-specific hints to make it work.
---
src/hnswbuild.c | 28 ++++++++++++++++++++++++++++
1 file changed, 28 insertions(+)
diff --git a/src/hnswbuild.c b/src/hnswbuild.c
index 680789b..bfa657a 100644
--- a/src/hnswbuild.c
+++ b/src/hnswbuild.c
@@ -1089,13 +1089,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
SeedRandom(42);
#endif
+#ifdef NEON_SMGR
+ smgr_start_unlogged_build(RelationGetSmgr(index));
+#endif
+
InitBuildState(buildstate, heap, index, indexInfo, forkNum);
BuildGraph(buildstate, forkNum);
+#ifdef NEON_SMGR
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
+#endif
+
if (RelationNeedsWAL(index))
+ {
log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
+#ifdef NEON_SMGR
+ {
+#if PG_VERSION_NUM >= 160000
+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
+#else
+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
+#endif
+
+ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
+ }
+#endif
+ }
+
+#ifdef NEON_SMGR
+ smgr_end_unlogged_build(RelationGetSmgr(index));
+#endif
+
FreeBuildState(buildstate);
}
--
2.39.2

View File

@@ -328,18 +328,14 @@ pageserver_connect(shardno_t shard_no, int elevel)
now = GetCurrentTimestamp();
us_since_last_connect = now - last_connect_time;
if (us_since_last_connect < delay_us)
if (us_since_last_connect < MAX_RECONNECT_INTERVAL_USEC)
{
pg_usleep(delay_us - us_since_last_connect);
pg_usleep(delay_us);
delay_us *= 2;
if (delay_us > MAX_RECONNECT_INTERVAL_USEC)
delay_us = MAX_RECONNECT_INTERVAL_USEC;
last_connect_time = GetCurrentTimestamp();
}
else
{
delay_us = MIN_RECONNECT_INTERVAL_USEC;
last_connect_time = now;
}
/*
@@ -366,6 +362,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
values[n] = NULL;
n++;
conn = PQconnectdbParams(keywords, values, 1);
last_connect_time = GetCurrentTimestamp();
if (PQstatus(conn) == CONNECTION_BAD)
{

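In Rust terms, the reconnect pacing above might look like the following sketch (constants and names here are assumptions): back off exponentially while attempts come in quick succession, reset to the minimum delay once they are spaced beyond the cap, and record the attempt time only after the actual connect, matching the moved `last_connect_time` assignment.

use std::time::{Duration, Instant};

const MIN_RECONNECT_INTERVAL: Duration = Duration::from_millis(1);
const MAX_RECONNECT_INTERVAL: Duration = Duration::from_secs(1);

struct ReconnectPacer {
    delay: Duration,
    last_connect: Option<Instant>,
}

impl ReconnectPacer {
    fn pace_before_connect(&mut self) {
        if let Some(last) = self.last_connect {
            if last.elapsed() < MAX_RECONNECT_INTERVAL {
                // Attempts are coming fast: sleep, then double the delay.
                std::thread::sleep(self.delay);
                self.delay = (self.delay * 2).min(MAX_RECONNECT_INTERVAL);
            } else {
                self.delay = MIN_RECONNECT_INTERVAL;
            }
        }
    }

    fn record_connect(&mut self) {
        self.last_connect = Some(Instant::now());
    }
}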
View File

@@ -15,6 +15,7 @@
#include "neon_pgversioncompat.h"
#include "access/slru.h"
#include "access/xlogdefs.h"
#include RELFILEINFO_HDR
#include "lib/stringinfo.h"
@@ -34,6 +35,7 @@ typedef enum
T_NeonNblocksRequest,
T_NeonGetPageRequest,
T_NeonDbSizeRequest,
T_NeonGetSlruSegmentRequest,
/* pagestore -> pagestore_client */
T_NeonExistsResponse = 100,
@@ -41,6 +43,7 @@ typedef enum
T_NeonGetPageResponse,
T_NeonErrorResponse,
T_NeonDbSizeResponse,
T_NeonGetSlruSegmentResponse,
} NeonMessageTag;
/* base struct for c-style inheritance */
@@ -59,6 +62,13 @@ typedef struct
(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
/* SLRUs downloadable from page server */
typedef enum {
SLRU_CLOG,
SLRU_MULTIXACT_MEMBERS,
SLRU_MULTIXACT_OFFSETS
} SlruKind;
/*
* supertype of all the Neon*Request structs below
*
@@ -101,6 +111,13 @@ typedef struct
BlockNumber blkno;
} NeonGetPageRequest;
typedef struct
{
NeonRequest req;
SlruKind kind;
int segno;
} NeonGetSlruSegmentRequest;
/* supertype of all the Neon*Response structs below */
typedef struct
{
@@ -140,6 +157,14 @@ typedef struct
* message */
} NeonErrorResponse;
typedef struct
{
NeonMessageTag tag;
int n_blocks;
char data[BLCKSZ * SLRU_PAGES_PER_SEGMENT];
} NeonGetSlruSegmentResponse;
extern StringInfoData nm_pack_request(NeonRequest *msg);
extern NeonResponse *nm_unpack_response(StringInfo s);
extern char *nm_to_string(NeonMessage *msg);

View File

@@ -1043,12 +1043,25 @@ nm_pack_request(NeonRequest *msg)
break;
}
case T_NeonGetSlruSegmentRequest:
{
NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg;
pq_sendbyte(&s, msg_req->req.latest);
pq_sendint64(&s, msg_req->req.lsn);
pq_sendbyte(&s, msg_req->kind);
pq_sendint32(&s, msg_req->segno);
break;
}
/* pagestore -> pagestore_client. We never need to create these. */
case T_NeonExistsResponse:
case T_NeonNblocksResponse:
case T_NeonGetPageResponse:
case T_NeonErrorResponse:
case T_NeonDbSizeResponse:
case T_NeonGetSlruSegmentResponse:
default:
neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
break;
@@ -1135,6 +1148,20 @@ nm_unpack_response(StringInfo s)
break;
}
case T_NeonGetSlruSegmentResponse:
{
NeonGetSlruSegmentResponse *msg_resp;
int n_blocks = pq_getmsgint(s, 4);
msg_resp = palloc(sizeof(NeonGetSlruSegmentResponse));
msg_resp->tag = tag;
msg_resp->n_blocks = n_blocks;
memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ);
pq_getmsgend(s);
resp = (NeonResponse *) msg_resp;
break;
}
/*
* pagestore_client -> pagestore
*
@@ -1144,6 +1171,7 @@ nm_unpack_response(StringInfo s)
case T_NeonNblocksRequest:
case T_NeonGetPageRequest:
case T_NeonDbSizeRequest:
case T_NeonGetSlruSegmentRequest:
default:
neon_log(ERROR, "unexpected neon message tag 0x%02x", tag);
break;
@@ -1213,7 +1241,18 @@ nm_to_string(NeonMessage *msg)
appendStringInfoChar(&s, '}');
break;
}
case T_NeonGetSlruSegmentRequest:
{
NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg;
appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\"");
appendStringInfo(&s, ", \"kind\": %u", msg_req->kind);
appendStringInfo(&s, ", \"segno\": %u", msg_req->segno);
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
appendStringInfoChar(&s, '}');
break;
}
/* pagestore -> pagestore_client */
case T_NeonExistsResponse:
{
@@ -1267,6 +1306,17 @@ nm_to_string(NeonMessage *msg)
msg_resp->db_size);
appendStringInfoChar(&s, '}');
break;
}
case T_NeonGetSlruSegmentResponse:
{
NeonGetSlruSegmentResponse *msg_resp = (NeonGetSlruSegmentResponse *) msg;
appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentResponse\"");
appendStringInfo(&s, ", \"n_blocks\": %u}",
msg_resp->n_blocks);
appendStringInfoChar(&s, '}');
break;
}
@@ -2739,6 +2789,74 @@ neon_end_unlogged_build(SMgrRelation reln)
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
}
#define STRPREFIX(str, prefix) (strncmp(str, prefix, strlen(prefix)) == 0)
static int
neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer)
{
XLogRecPtr request_lsn;
/*
* GetRedoStartLsn() returns LSN of basebackup.
* We need to download SLRU segments only once after node startup,
* then SLRUs are maintained locally.
*/
request_lsn = GetRedoStartLsn();
request_lsn = nm_adjust_lsn(request_lsn);
SlruKind kind;
if (STRPREFIX(path, "pg_xact"))
kind = SLRU_CLOG;
else if (STRPREFIX(path, "pg_multixact/members"))
kind = SLRU_MULTIXACT_MEMBERS;
else if (STRPREFIX(path, "pg_multixact/offsets"))
kind = SLRU_MULTIXACT_OFFSETS;
else
return -1;
NeonResponse *resp;
NeonGetSlruSegmentRequest request = {
.req.tag = T_NeonGetSlruSegmentRequest,
.req.latest = false,
.req.lsn = request_lsn,
.kind = kind,
.segno = segno
};
int n_blocks;
shardno_t shard_no = 0; /* All SLRUs are at shard 0 */
do
{
while (!page_server->send(shard_no, &request.req) || !page_server->flush(shard_no));
consume_prefetch_responses();
resp = page_server->receive(shard_no);
} while (resp == NULL);
switch (resp->tag)
{
case T_NeonGetSlruSegmentResponse:
n_blocks = ((NeonGetSlruSegmentResponse *) resp)->n_blocks;
memcpy(buffer, ((NeonGetSlruSegmentResponse *) resp)->data, n_blocks*BLCKSZ);
break;
case T_NeonErrorResponse:
ereport(ERROR,
(errcode(ERRCODE_IO_ERROR),
errmsg(NEON_TAG "could not read SLRU %d segment %d at lsn %X/%08X",
kind,
segno,
LSN_FORMAT_ARGS(request_lsn)),
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
default:
neon_log(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
pfree(resp);
return n_blocks;
}
static void
AtEOXact_neon(XactEvent event, void *arg)
{
@@ -2797,6 +2915,8 @@ static const struct f_smgr neon_smgr =
.smgr_start_unlogged_build = neon_start_unlogged_build,
.smgr_finish_unlogged_build_phase_1 = neon_finish_unlogged_build_phase_1,
.smgr_end_unlogged_build = neon_end_unlogged_build,
.smgr_read_slru_segment = neon_read_slru_segment,
};
const f_smgr *

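Read together, the pack/unpack hunks above pin down the wire format of the new SLRU messages: a request is the tag byte, then latest (1 byte), lsn (8 bytes, network order), kind (1 byte) and segno (4 bytes); a response is the tag byte, n_blocks (4 bytes) and n_blocks * BLCKSZ bytes of raw segment data. A hedged sketch of that framing (the leading tag byte and its value are assumptions inferred from the NeonMessageTag enum order; the C above is authoritative):

// Assumed tag value, inferred from the enum order in the header diff.
const T_GET_SLRU_SEGMENT_REQUEST: u8 = 4;
const BLCKSZ: usize = 8192;

fn pack_get_slru_segment_request(latest: bool, lsn: u64, kind: u8, segno: u32) -> Vec<u8> {
    let mut buf = Vec::with_capacity(15);
    buf.push(T_GET_SLRU_SEGMENT_REQUEST);
    buf.push(latest as u8);
    buf.extend_from_slice(&lsn.to_be_bytes());   // pq_sendint64 is big-endian
    buf.push(kind);                              // SlruKind as a single byte
    buf.extend_from_slice(&segno.to_be_bytes()); // pq_sendint32
    buf
}

// `body` is the response payload after the tag byte.
fn unpack_get_slru_segment_response(body: &[u8]) -> Option<Vec<u8>> {
    let n_blocks = u32::from_be_bytes(body.get(..4)?.try_into().ok()?) as usize;
    Some(body.get(4..4 + n_blocks * BLCKSZ)?.to_vec())
}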

@@ -140,9 +140,42 @@ static XLogReaderState *reader_state;
#define TRACE DEBUG5
#ifdef HAVE_LIBSECCOMP
/*
* https://man7.org/linux/man-pages/man2/close_range.2.html
*
* The `close_range` syscall is available as of Linux 5.9.
*
* The `close_range` libc wrapper is only available in glibc >= 2.34.
* Debian Bullseye ships a libc package based on glibc 2.31.
* => write the wrapper ourselves, using the syscall number from the kernel headers.
*
* If the Linux uAPI headers don't define the system call number,
* fail the build deliberately rather than ifdef'ing it to ENOSYS.
* We prefer a compile time over a runtime error for walredo.
*/
#include <unistd.h>
#include <sys/syscall.h>
#include <errno.h>
int close_range(unsigned int start_fd, unsigned int count, unsigned int flags) {
return syscall(__NR_close_range, start_fd, count, flags);
}
static void
enter_seccomp_mode(void)
{
/*
* The pageserver process relies on us to close all the file descriptors
* it potentially leaked to us, _before_ we start processing potentially dangerous
* wal records. See the comment in the Rust code that launches this process.
*/
int err;
if (err = close_range(3, ~0U, 0)) {
ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not close files >= fd 3")));
}
PgSeccompRule syscalls[] =
{
/* Hard requirements */

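The comment above spells out the gap the wrapper fills: Linux has had close_range since 5.9, but Debian Bullseye's glibc 2.31 predates the 2.34 libc wrapper, so walredo invokes the syscall number directly and prefers a build failure to a runtime ENOSYS. The same call from Rust would look roughly like this (a sketch assuming libc::SYS_close_range is defined for the target):

#[cfg(target_os = "linux")]
fn close_fds_from(first_fd: libc::c_uint) -> std::io::Result<()> {
    // close_range(first_fd, ~0U, 0): close every descriptor >= first_fd.
    let rc = unsafe { libc::syscall(libc::SYS_close_range, first_fd, libc::c_uint::MAX, 0) };
    if rc == 0 {
        Ok(())
    } else {
        Err(std::io::Error::last_os_error())
    }
}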
poetry.lock

@@ -2,87 +2,87 @@
[[package]]
name = "aiohttp"
version = "3.9.0"
version = "3.9.2"
description = "Async http client/server framework (asyncio)"
optional = false
python-versions = ">=3.8"
files = [
{file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6896b8416be9ada4d22cd359d7cb98955576ce863eadad5596b7cdfbf3e17c6c"},
{file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1736d87dad8ef46a8ec9cddd349fa9f7bd3a064c47dd6469c0d6763d3d49a4fc"},
{file = "aiohttp-3.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8c9e5f4d7208cda1a2bb600e29069eecf857e6980d0ccc922ccf9d1372c16f4b"},
{file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8488519aa05e636c5997719fe543c8daf19f538f4fa044f3ce94bee608817cff"},
{file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ab16c254e2312efeb799bc3c06897f65a133b38b69682bf75d1f1ee1a9c43a9"},
{file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7a94bde005a8f926d0fa38b88092a03dea4b4875a61fbcd9ac6f4351df1b57cd"},
{file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b777c9286b6c6a94f50ddb3a6e730deec327e9e2256cb08b5530db0f7d40fd8"},
{file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:571760ad7736b34d05597a1fd38cbc7d47f7b65deb722cb8e86fd827404d1f6b"},
{file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:deac0a32aec29608eb25d730f4bc5a261a65b6c48ded1ed861d2a1852577c932"},
{file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:4ee1b4152bc3190cc40ddd6a14715e3004944263ea208229ab4c297712aa3075"},
{file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:3607375053df58ed6f23903aa10cf3112b1240e8c799d243bbad0f7be0666986"},
{file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:65b0a70a25456d329a5e1426702dde67be0fb7a4ead718005ba2ca582d023a94"},
{file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5a2eb5311a37fe105aa35f62f75a078537e1a9e4e1d78c86ec9893a3c97d7a30"},
{file = "aiohttp-3.9.0-cp310-cp310-win32.whl", hash = "sha256:2cbc14a13fb6b42d344e4f27746a4b03a2cb0c1c3c5b932b0d6ad8881aa390e3"},
{file = "aiohttp-3.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:ac9669990e2016d644ba8ae4758688534aabde8dbbc81f9af129c3f5f01ca9cd"},
{file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f8e05f5163528962ce1d1806fce763ab893b1c5b7ace0a3538cd81a90622f844"},
{file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4afa8f71dba3a5a2e1e1282a51cba7341ae76585345c43d8f0e624882b622218"},
{file = "aiohttp-3.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f929f4c9b9a00f3e6cc0587abb95ab9c05681f8b14e0fe1daecfa83ea90f8318"},
{file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28185e36a78d247c55e9fbea2332d16aefa14c5276a582ce7a896231c6b1c208"},
{file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a486ddf57ab98b6d19ad36458b9f09e6022de0381674fe00228ca7b741aacb2f"},
{file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70e851f596c00f40a2f00a46126c95c2e04e146015af05a9da3e4867cfc55911"},
{file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5b7bf8fe4d39886adc34311a233a2e01bc10eb4e842220235ed1de57541a896"},
{file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c67a51ea415192c2e53e4e048c78bab82d21955b4281d297f517707dc836bf3d"},
{file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:694df243f394629bcae2d8ed94c589a181e8ba8604159e6e45e7b22e58291113"},
{file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3dd8119752dd30dd7bca7d4bc2a92a59be6a003e4e5c2cf7e248b89751b8f4b7"},
{file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:eb6dfd52063186ac97b4caa25764cdbcdb4b10d97f5c5f66b0fa95052e744eb7"},
{file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:d97c3e286d0ac9af6223bc132dc4bad6540b37c8d6c0a15fe1e70fb34f9ec411"},
{file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:816f4db40555026e4cdda604a1088577c1fb957d02f3f1292e0221353403f192"},
{file = "aiohttp-3.9.0-cp311-cp311-win32.whl", hash = "sha256:3abf0551874fecf95f93b58f25ef4fc9a250669a2257753f38f8f592db85ddea"},
{file = "aiohttp-3.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:e18d92c3e9e22553a73e33784fcb0ed484c9874e9a3e96c16a8d6a1e74a0217b"},
{file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:99ae01fb13a618b9942376df77a1f50c20a281390dad3c56a6ec2942e266220d"},
{file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:05857848da443c8c12110d99285d499b4e84d59918a21132e45c3f0804876994"},
{file = "aiohttp-3.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:317719d7f824eba55857fe0729363af58e27c066c731bc62cd97bc9c3d9c7ea4"},
{file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1e3b3c107ccb0e537f309f719994a55621acd2c8fdf6d5ce5152aed788fb940"},
{file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:45820ddbb276113ead8d4907a7802adb77548087ff5465d5c554f9aa3928ae7d"},
{file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:05a183f1978802588711aed0dea31e697d760ce9055292db9dc1604daa9a8ded"},
{file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51a4cd44788ea0b5e6bb8fa704597af3a30be75503a7ed1098bc5b8ffdf6c982"},
{file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:673343fbc0c1ac44d0d2640addc56e97a052504beacd7ade0dc5e76d3a4c16e8"},
{file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7e8a3b79b6d186a9c99761fd4a5e8dd575a48d96021f220ac5b5fa856e5dd029"},
{file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6777a390e41e78e7c45dab43a4a0196c55c3b8c30eebe017b152939372a83253"},
{file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7ae5f99a32c53731c93ac3075abd3e1e5cfbe72fc3eaac4c27c9dd64ba3b19fe"},
{file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f1e4f254e9c35d8965d377e065c4a8a55d396fe87c8e7e8429bcfdeeb229bfb3"},
{file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:11ca808f9a6b63485059f5f6e164ef7ec826483c1212a44f268b3653c91237d8"},
{file = "aiohttp-3.9.0-cp312-cp312-win32.whl", hash = "sha256:de3cc86f4ea8b4c34a6e43a7306c40c1275e52bfa9748d869c6b7d54aa6dad80"},
{file = "aiohttp-3.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:ca4fddf84ac7d8a7d0866664936f93318ff01ee33e32381a115b19fb5a4d1202"},
{file = "aiohttp-3.9.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f09960b5bb1017d16c0f9e9f7fc42160a5a49fa1e87a175fd4a2b1a1833ea0af"},
{file = "aiohttp-3.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8303531e2c17b1a494ffaeba48f2da655fe932c4e9a2626c8718403c83e5dd2b"},
{file = "aiohttp-3.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4790e44f46a4aa07b64504089def5744d3b6780468c4ec3a1a36eb7f2cae9814"},
{file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1d7edf74a36de0e5ca50787e83a77cf352f5504eb0ffa3f07000a911ba353fb"},
{file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:94697c7293199c2a2551e3e3e18438b4cba293e79c6bc2319f5fd652fccb7456"},
{file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1b66dbb8a7d5f50e9e2ea3804b01e766308331d0cac76eb30c563ac89c95985"},
{file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9623cfd9e85b76b83ef88519d98326d4731f8d71869867e47a0b979ffec61c73"},
{file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f32c86dc967ab8c719fd229ce71917caad13cc1e8356ee997bf02c5b368799bf"},
{file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f50b4663c3e0262c3a361faf440761fbef60ccdde5fe8545689a4b3a3c149fb4"},
{file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dcf71c55ec853826cd70eadb2b6ac62ec577416442ca1e0a97ad875a1b3a0305"},
{file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:42fe4fd9f0dfcc7be4248c162d8056f1d51a04c60e53366b0098d1267c4c9da8"},
{file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:76a86a9989ebf82ee61e06e2bab408aec4ea367dc6da35145c3352b60a112d11"},
{file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f9e09a1c83521d770d170b3801eea19b89f41ccaa61d53026ed111cb6f088887"},
{file = "aiohttp-3.9.0-cp38-cp38-win32.whl", hash = "sha256:a00ce44c21612d185c5275c5cba4bab8d7c1590f248638b667ed8a782fa8cd6f"},
{file = "aiohttp-3.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:d5b9345ab92ebe6003ae11d8092ce822a0242146e6fa270889b9ba965457ca40"},
{file = "aiohttp-3.9.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:98d21092bf2637c5fa724a428a69e8f5955f2182bff61f8036827cf6ce1157bf"},
{file = "aiohttp-3.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:35a68cd63ca6aaef5707888f17a70c36efe62b099a4e853d33dc2e9872125be8"},
{file = "aiohttp-3.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3d7f6235c7475658acfc1769d968e07ab585c79f6ca438ddfecaa9a08006aee2"},
{file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db04d1de548f7a62d1dd7e7cdf7c22893ee168e22701895067a28a8ed51b3735"},
{file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:536b01513d67d10baf6f71c72decdf492fb7433c5f2f133e9a9087379d4b6f31"},
{file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c8b0a6487e8109427ccf638580865b54e2e3db4a6e0e11c02639231b41fc0f"},
{file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7276fe0017664414fdc3618fca411630405f1aaf0cc3be69def650eb50441787"},
{file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23170247ef89ffa842a02bbfdc425028574d9e010611659abeb24d890bc53bb8"},
{file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b1a2ea8252cacc7fd51df5a56d7a2bb1986ed39be9397b51a08015727dfb69bd"},
{file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2d71abc15ff7047412ef26bf812dfc8d0d1020d664617f4913df2df469f26b76"},
{file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:2d820162c8c2bdbe97d328cd4f417c955ca370027dce593345e437b2e9ffdc4d"},
{file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:2779f5e7c70f7b421915fd47db332c81de365678180a9f3ab404088f87ba5ff9"},
{file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:366bc870d7ac61726f32a489fbe3d1d8876e87506870be66b01aeb84389e967e"},
{file = "aiohttp-3.9.0-cp39-cp39-win32.whl", hash = "sha256:1df43596b826022b14998f0460926ce261544fedefe0d2f653e1b20f49e96454"},
{file = "aiohttp-3.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:9c196b30f1b1aa3363a69dd69079ae9bec96c2965c4707eaa6914ba099fb7d4f"},
{file = "aiohttp-3.9.0.tar.gz", hash = "sha256:09f23292d29135025e19e8ff4f0a68df078fe4ee013bca0105b2e803989de92d"},
{file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:772fbe371788e61c58d6d3d904268e48a594ba866804d08c995ad71b144f94cb"},
{file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:edd4f1af2253f227ae311ab3d403d0c506c9b4410c7fc8d9573dec6d9740369f"},
{file = "aiohttp-3.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cfee9287778399fdef6f8a11c9e425e1cb13cc9920fd3a3df8f122500978292b"},
{file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cc158466f6a980a6095ee55174d1de5730ad7dec251be655d9a6a9dd7ea1ff9"},
{file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54ec82f45d57c9a65a1ead3953b51c704f9587440e6682f689da97f3e8defa35"},
{file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abeb813a18eb387f0d835ef51f88568540ad0325807a77a6e501fed4610f864e"},
{file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc91d07280d7d169f3a0f9179d8babd0ee05c79d4d891447629ff0d7d8089ec2"},
{file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b65e861f4bebfb660f7f0f40fa3eb9f2ab9af10647d05dac824390e7af8f75b7"},
{file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:04fd8ffd2be73d42bcf55fd78cde7958eeee6d4d8f73c3846b7cba491ecdb570"},
{file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3d8d962b439a859b3ded9a1e111a4615357b01620a546bc601f25b0211f2da81"},
{file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:8ceb658afd12b27552597cf9a65d9807d58aef45adbb58616cdd5ad4c258c39e"},
{file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0e4ee4df741670560b1bc393672035418bf9063718fee05e1796bf867e995fad"},
{file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2dec87a556f300d3211decf018bfd263424f0690fcca00de94a837949fbcea02"},
{file = "aiohttp-3.9.2-cp310-cp310-win32.whl", hash = "sha256:3e1a800f988ce7c4917f34096f81585a73dbf65b5c39618b37926b1238cf9bc4"},
{file = "aiohttp-3.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:ea510718a41b95c236c992b89fdfc3d04cc7ca60281f93aaada497c2b4e05c46"},
{file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6aaa6f99256dd1b5756a50891a20f0d252bd7bdb0854c5d440edab4495c9f973"},
{file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a27d8c70ad87bcfce2e97488652075a9bdd5b70093f50b10ae051dfe5e6baf37"},
{file = "aiohttp-3.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:54287bcb74d21715ac8382e9de146d9442b5f133d9babb7e5d9e453faadd005e"},
{file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb3d05569aa83011fcb346b5266e00b04180105fcacc63743fc2e4a1862a891"},
{file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8534e7d69bb8e8d134fe2be9890d1b863518582f30c9874ed7ed12e48abe3c4"},
{file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4bd9d5b989d57b41e4ff56ab250c5ddf259f32db17159cce630fd543376bd96b"},
{file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa6904088e6642609981f919ba775838ebf7df7fe64998b1a954fb411ffb4663"},
{file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bda42eb410be91b349fb4ee3a23a30ee301c391e503996a638d05659d76ea4c2"},
{file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:193cc1ccd69d819562cc7f345c815a6fc51d223b2ef22f23c1a0f67a88de9a72"},
{file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b9f1cb839b621f84a5b006848e336cf1496688059d2408e617af33e3470ba204"},
{file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d22a0931848b8c7a023c695fa2057c6aaac19085f257d48baa24455e67df97ec"},
{file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4112d8ba61fbd0abd5d43a9cb312214565b446d926e282a6d7da3f5a5aa71d36"},
{file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c4ad4241b52bb2eb7a4d2bde060d31c2b255b8c6597dd8deac2f039168d14fd7"},
{file = "aiohttp-3.9.2-cp311-cp311-win32.whl", hash = "sha256:ee2661a3f5b529f4fc8a8ffee9f736ae054adfb353a0d2f78218be90617194b3"},
{file = "aiohttp-3.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:4deae2c165a5db1ed97df2868ef31ca3cc999988812e82386d22937d9d6fed52"},
{file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:6f4cdba12539215aaecf3c310ce9d067b0081a0795dd8a8805fdb67a65c0572a"},
{file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:84e843b33d5460a5c501c05539809ff3aee07436296ff9fbc4d327e32aa3a326"},
{file = "aiohttp-3.9.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8008d0f451d66140a5aa1c17e3eedc9d56e14207568cd42072c9d6b92bf19b52"},
{file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61c47ab8ef629793c086378b1df93d18438612d3ed60dca76c3422f4fbafa792"},
{file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc71f748e12284312f140eaa6599a520389273174b42c345d13c7e07792f4f57"},
{file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1c3a4d0ab2f75f22ec80bca62385db2e8810ee12efa8c9e92efea45c1849133"},
{file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a87aa0b13bbee025faa59fa58861303c2b064b9855d4c0e45ec70182bbeba1b"},
{file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2cc0d04688b9f4a7854c56c18aa7af9e5b0a87a28f934e2e596ba7e14783192"},
{file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1956e3ac376b1711c1533266dec4efd485f821d84c13ce1217d53e42c9e65f08"},
{file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:114da29f39eccd71b93a0fcacff178749a5c3559009b4a4498c2c173a6d74dff"},
{file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:3f17999ae3927d8a9a823a1283b201344a0627272f92d4f3e3a4efe276972fe8"},
{file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f31df6a32217a34ae2f813b152a6f348154f948c83213b690e59d9e84020925c"},
{file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7a75307ffe31329928a8d47eae0692192327c599113d41b278d4c12b54e1bd11"},
{file = "aiohttp-3.9.2-cp312-cp312-win32.whl", hash = "sha256:972b63d589ff8f305463593050a31b5ce91638918da38139b9d8deaba9e0fed7"},
{file = "aiohttp-3.9.2-cp312-cp312-win_amd64.whl", hash = "sha256:200dc0246f0cb5405c80d18ac905c8350179c063ea1587580e3335bfc243ba6a"},
{file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:158564d0d1020e0d3fe919a81d97aadad35171e13e7b425b244ad4337fc6793a"},
{file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:da1346cd0ccb395f0ed16b113ebb626fa43b7b07fd7344fce33e7a4f04a8897a"},
{file = "aiohttp-3.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:eaa9256de26ea0334ffa25f1913ae15a51e35c529a1ed9af8e6286dd44312554"},
{file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1543e7fb00214fb4ccead42e6a7d86f3bb7c34751ec7c605cca7388e525fd0b4"},
{file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:186e94570433a004e05f31f632726ae0f2c9dee4762a9ce915769ce9c0a23d89"},
{file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d52d20832ac1560f4510d68e7ba8befbc801a2b77df12bd0cd2bcf3b049e52a4"},
{file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c45e4e815ac6af3b72ca2bde9b608d2571737bb1e2d42299fc1ffdf60f6f9a1"},
{file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa906b9bdfd4a7972dd0628dbbd6413d2062df5b431194486a78f0d2ae87bd55"},
{file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:68bbee9e17d66f17bb0010aa15a22c6eb28583edcc8b3212e2b8e3f77f3ebe2a"},
{file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4c189b64bd6d9a403a1a3f86a3ab3acbc3dc41a68f73a268a4f683f89a4dec1f"},
{file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:8a7876f794523123bca6d44bfecd89c9fec9ec897a25f3dd202ee7fc5c6525b7"},
{file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d23fba734e3dd7b1d679b9473129cd52e4ec0e65a4512b488981a56420e708db"},
{file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b141753be581fab842a25cb319f79536d19c2a51995d7d8b29ee290169868eab"},
{file = "aiohttp-3.9.2-cp38-cp38-win32.whl", hash = "sha256:103daf41ff3b53ba6fa09ad410793e2e76c9d0269151812e5aba4b9dd674a7e8"},
{file = "aiohttp-3.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:328918a6c2835861ff7afa8c6d2c70c35fdaf996205d5932351bdd952f33fa2f"},
{file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5264d7327c9464786f74e4ec9342afbbb6ee70dfbb2ec9e3dfce7a54c8043aa3"},
{file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:07205ae0015e05c78b3288c1517afa000823a678a41594b3fdc870878d645305"},
{file = "aiohttp-3.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0a1e638cffc3ec4d4784b8b4fd1cf28968febc4bd2718ffa25b99b96a741bd"},
{file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d43302a30ba1166325974858e6ef31727a23bdd12db40e725bec0f759abce505"},
{file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16a967685907003765855999af11a79b24e70b34dc710f77a38d21cd9fc4f5fe"},
{file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6fa3ee92cd441d5c2d07ca88d7a9cef50f7ec975f0117cd0c62018022a184308"},
{file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b500c5ad9c07639d48615a770f49618130e61be36608fc9bc2d9bae31732b8f"},
{file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c07327b368745b1ce2393ae9e1aafed7073d9199e1dcba14e035cc646c7941bf"},
{file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:cc7d6502c23a0ec109687bf31909b3fb7b196faf198f8cff68c81b49eb316ea9"},
{file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:07be2be7071723c3509ab5c08108d3a74f2181d4964e869f2504aaab68f8d3e8"},
{file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:122468f6fee5fcbe67cb07014a08c195b3d4c41ff71e7b5160a7bcc41d585a5f"},
{file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:00a9abcea793c81e7f8778ca195a1714a64f6d7436c4c0bb168ad2a212627000"},
{file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7a9825fdd64ecac5c670234d80bb52bdcaa4139d1f839165f548208b3779c6c6"},
{file = "aiohttp-3.9.2-cp39-cp39-win32.whl", hash = "sha256:5422cd9a4a00f24c7244e1b15aa9b87935c85fb6a00c8ac9b2527b38627a9211"},
{file = "aiohttp-3.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:7d579dcd5d82a86a46f725458418458fa43686f6a7b252f2966d359033ffc8ab"},
{file = "aiohttp-3.9.2.tar.gz", hash = "sha256:b0ad0a5e86ce73f5368a164c10ada10504bf91869c05ab75d982c6048217fbf7"},
]
[package.dependencies]
@@ -2043,6 +2043,7 @@ files = [
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2668,4 +2669,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "9cf2734cafd5b6963165d398f1b24621193d5284d0bc7cc26a720a014f523860"
content-hash = "e99954cbbfef8dcc5e13cea7103c87657639a192f2372983bdb8c5d624c2e447"


@@ -62,6 +62,8 @@ socket2.workspace = true
sync_wrapper.workspace = true
task-local-extensions.workspace = true
thiserror.workspace = true
tikv-jemallocator.workspace = true
tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
tls-listener.workspace = true
tokio-postgres.workspace = true
tokio-rustls.workspace = true


@@ -9,11 +9,10 @@ use crate::auth::credentials::check_peer_addr_is_in_list;
use crate::auth::validate_password_and_exchange;
use crate::cache::Cached;
use crate::console::errors::GetAuthInfoError;
use crate::console::provider::ConsoleBackend;
use crate::console::provider::{CachedRoleSecret, ConsoleBackend};
use crate::console::AuthSecret;
use crate::context::RequestMonitoring;
use crate::proxy::connect_compute::handle_try_wake;
use crate::proxy::retry::retry_after;
use crate::proxy::wake_compute::wake_compute;
use crate::proxy::NeonOptions;
use crate::stream::Stream;
use crate::{
@@ -28,13 +27,26 @@ use crate::{
};
use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
use futures::TryFutureExt;
use std::borrow::Cow;
use std::ops::ControlFlow;
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{error, info, warn};
use tracing::info;
use super::IpPattern;
/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
pub enum MaybeOwned<'a, T> {
Owned(T),
Borrowed(&'a T),
}
impl<T> std::ops::Deref for MaybeOwned<'_, T> {
type Target = T;
fn deref(&self) -> &Self::Target {
match self {
MaybeOwned::Owned(t) => t,
MaybeOwned::Borrowed(t) => t,
}
}
}
/// This type serves two purposes:
///
@@ -46,17 +58,16 @@ use super::IpPattern;
/// backends which require them for the authentication process.
pub enum BackendType<'a, T> {
/// Cloud API (V2).
Console(Cow<'a, ConsoleBackend>, T),
Console(MaybeOwned<'a, ConsoleBackend>, T),
/// Authentication via a web browser.
Link(Cow<'a, url::ApiUrl>),
#[cfg(test)]
/// Test backend.
Test(&'a dyn TestBackend),
Link(MaybeOwned<'a, url::ApiUrl>),
}
pub trait TestBackend: Send + Sync + 'static {
fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
fn get_allowed_ips(&self) -> Result<Vec<IpPattern>, console::errors::GetAuthInfoError>;
fn get_allowed_ips_and_secret(
&self,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>;
}
impl std::fmt::Display for BackendType<'_, ()> {
@@ -67,14 +78,14 @@ impl std::fmt::Display for BackendType<'_, ()> {
ConsoleBackend::Console(endpoint) => {
fmt.debug_tuple("Console").field(&endpoint.url()).finish()
}
#[cfg(feature = "testing")]
#[cfg(any(test, feature = "testing"))]
ConsoleBackend::Postgres(endpoint) => {
fmt.debug_tuple("Postgres").field(&endpoint.url()).finish()
}
#[cfg(test)]
ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(),
},
Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
#[cfg(test)]
Test(_) => fmt.debug_tuple("Test").finish(),
}
}
}
@@ -85,10 +96,8 @@ impl<T> BackendType<'_, T> {
pub fn as_ref(&self) -> BackendType<'_, &T> {
use BackendType::*;
match self {
Console(c, x) => Console(Cow::Borrowed(c), x),
Link(c) => Link(Cow::Borrowed(c)),
#[cfg(test)]
Test(x) => Test(*x),
Console(c, x) => Console(MaybeOwned::Borrowed(c), x),
Link(c) => Link(MaybeOwned::Borrowed(c)),
}
}
}
@@ -102,8 +111,6 @@ impl<'a, T> BackendType<'a, T> {
match self {
Console(c, x) => Console(c, f(x)),
Link(c) => Link(c),
#[cfg(test)]
Test(x) => Test(x),
}
}
}
@@ -116,8 +123,6 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
match self {
Console(c, x) => x.map(|x| Console(c, x)),
Link(c) => Ok(Link(c)),
#[cfg(test)]
Test(x) => Ok(Test(x)),
}
}
}
@@ -147,7 +152,7 @@ impl ComputeUserInfo {
}
pub enum ComputeCredentialKeys {
#[cfg(feature = "testing")]
#[cfg(any(test, feature = "testing"))]
Password(Vec<u8>),
AuthKeys(AuthKeys),
}
@@ -190,20 +195,26 @@ async fn auth_quirks(
Err(info) => {
let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer)
.await?;
ctx.set_endpoint_id(Some(res.info.endpoint.clone()));
ctx.set_endpoint_id(res.info.endpoint.clone());
tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint));
(res.info, Some(res.keys))
}
Ok(info) => (info, None),
};
info!("fetching user's authentication info");
let allowed_ips = api.get_allowed_ips(ctx, &info).await?;
let (allowed_ips, maybe_secret) = api.get_allowed_ips_and_secret(ctx, &info).await?;
// check allowed list
if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
return Err(auth::AuthError::ip_address_not_allowed());
}
let cached_secret = api.get_role_secret(ctx, &info).await?;
let cached_secret = match maybe_secret {
Some(secret) => secret,
None => api.get_role_secret(ctx, &info).await?,
};
let secret = cached_secret.value.clone().unwrap_or_else(|| {
// If we don't have an authentication secret, we mock one to
@@ -271,49 +282,6 @@ async fn authenticate_with_secret(
classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await
}
/// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache)
/// only if authentication was successful.
async fn auth_and_wake_compute(
ctx: &mut RequestMonitoring,
api: &impl console::Api,
user_info: ComputeUserInfoMaybeEndpoint,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> {
let compute_credentials =
auth_quirks(ctx, api, user_info, client, allow_cleartext, config).await?;
let mut num_retries = 0;
let mut node = loop {
let wake_res = api.wake_compute(ctx, &compute_credentials.info).await;
match handle_try_wake(wake_res, num_retries) {
Err(e) => {
error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
return Err(e.into());
}
Ok(ControlFlow::Continue(e)) => {
warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
}
Ok(ControlFlow::Break(n)) => break n,
}
let wait_duration = retry_after(num_retries);
num_retries += 1;
tokio::time::sleep(wait_duration).await;
};
ctx.set_project(node.aux.clone());
match compute_credentials.keys {
#[cfg(feature = "testing")]
ComputeCredentialKeys::Password(password) => node.config.password(password),
ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
};
Ok((node, compute_credentials.info))
}
impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
/// Get compute endpoint name from the credentials.
pub fn get_endpoint(&self) -> Option<EndpointId> {
@@ -322,8 +290,6 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
match self {
Console(_, user_info) => user_info.endpoint_id.clone(),
Link(_) => Some("link".into()),
#[cfg(test)]
Test(_) => Some("test".into()),
}
}
@@ -334,8 +300,6 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
match self {
Console(_, user_info) => &user_info.user,
Link(_) => "link",
#[cfg(test)]
Test(_) => "test",
}
}
@@ -358,26 +322,34 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
"performing authentication using the console"
);
let (cache_info, user_info) =
auth_and_wake_compute(ctx, &*api, user_info, client, allow_cleartext, config)
.await?;
(cache_info, BackendType::Console(api, user_info))
let compute_credentials =
auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?;
let mut num_retries = 0;
let mut node =
wake_compute(&mut num_retries, ctx, &api, &compute_credentials.info).await?;
ctx.set_project(node.aux.clone());
match compute_credentials.keys {
#[cfg(any(test, feature = "testing"))]
ComputeCredentialKeys::Password(password) => node.config.password(password),
ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
};
(node, BackendType::Console(api, compute_credentials.info))
}
// NOTE: this auth backend doesn't use client credentials.
Link(url) => {
info!("performing link authentication");
let node_info = link::authenticate(&url, client).await?;
let node_info = link::authenticate(ctx, &url, client).await?;
(
CachedNodeInfo::new_uncached(node_info),
BackendType::Link(url),
)
}
#[cfg(test)]
Test(_) => {
unreachable!("this function should never be called in the test backend")
}
};
info!("user successfully authenticated");
@@ -386,16 +358,14 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
}
impl BackendType<'_, ComputeUserInfo> {
pub async fn get_allowed_ips(
pub async fn get_allowed_ips_and_secret(
&self,
ctx: &mut RequestMonitoring,
) -> Result<CachedAllowedIps, GetAuthInfoError> {
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
use BackendType::*;
match self {
Console(api, user_info) => api.get_allowed_ips(ctx, user_info).await,
Link(_) => Ok(Cached::new_uncached(Arc::new(vec![]))),
#[cfg(test)]
Test(x) => Ok(Cached::new_uncached(Arc::new(x.get_allowed_ips()?))),
Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
Link(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
}
}
@@ -410,8 +380,6 @@ impl BackendType<'_, ComputeUserInfo> {
match self {
Console(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await,
Link(_) => Ok(None),
#[cfg(test)]
Test(x) => x.wake_compute().map(Some),
}
}
}

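A note on the Cow-to-MaybeOwned swap running through this file: Cow<'a, T> requires T: ToOwned, which effectively means Clone, and that no longer holds once ConsoleBackend can carry a Box<dyn TestBackend>. MaybeOwned keeps the owned-or-borrowed storage while demanding nothing of T. A self-contained sketch (the enum and Deref impl restate the definitions from the diff; NotClone is a made-up type for illustration):

pub enum MaybeOwned<'a, T> {
    Owned(T),
    Borrowed(&'a T),
}

impl<T> std::ops::Deref for MaybeOwned<'_, T> {
    type Target = T;
    fn deref(&self) -> &T {
        match self {
            MaybeOwned::Owned(t) => t,
            MaybeOwned::Borrowed(t) => t,
        }
    }
}

// Works even when T is not Clone, which Cow::Borrowed cannot accept.
struct NotClone(u32);

fn main() {
    let owned = MaybeOwned::Owned(NotClone(1));
    let v = NotClone(2);
    let borrowed = MaybeOwned::Borrowed(&v);
    assert_eq!(owned.0 + borrowed.0, 3);
}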

@@ -20,7 +20,7 @@ pub(super) async fn authenticate(
) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
let flow = AuthFlow::new(client);
let scram_keys = match secret {
#[cfg(feature = "testing")]
#[cfg(any(test, feature = "testing"))]
AuthSecret::Md5(_) => {
info!("auth endpoint chooses MD5");
return Err(auth::AuthError::bad_auth_method("MD5"));


@@ -1,6 +1,7 @@
use crate::{
auth, compute,
console::{self, provider::NodeInfo},
context::RequestMonitoring,
error::UserFacingError,
stream::PqStream,
waiters,
@@ -54,6 +55,7 @@ pub fn new_psql_session_id() -> String {
}
pub(super) async fn authenticate(
ctx: &mut RequestMonitoring,
link_uri: &reqwest::Url,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<NodeInfo> {
@@ -94,6 +96,10 @@ pub(super) async fn authenticate(
.dbname(&db_info.dbname)
.user(&db_info.user);
ctx.set_user(db_info.user.into());
ctx.set_project(db_info.aux.clone());
tracing::Span::current().record("ep", &tracing::field::display(&db_info.aux.endpoint_id));
// Backwards compatibility. pg_sni_proxy uses "--" in domain names
// while direct connections do not. Once we migrate to pg_sni_proxy
// everywhere, we can remove this.


@@ -2,7 +2,8 @@
use crate::{
auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError,
metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, EndpointId, RoleName,
metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI,
EndpointId, RoleName,
};
use itertools::Itertools;
use pq_proto::StartupMessageParams;
@@ -54,10 +55,10 @@ impl ComputeUserInfoMaybeEndpoint {
}
}
pub fn endpoint_sni<'a>(
sni: &'a str,
pub fn endpoint_sni(
sni: &str,
common_names: &HashSet<String>,
) -> Result<&'a str, ComputeUserInfoParseError> {
) -> Result<Option<EndpointId>, ComputeUserInfoParseError> {
let Some((subdomain, common_name)) = sni.split_once('.') else {
return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() });
};
@@ -66,7 +67,10 @@ pub fn endpoint_sni<'a>(
cn: common_name.into(),
});
}
Ok(subdomain)
if subdomain == SERVERLESS_DRIVER_SNI {
return Ok(None);
}
Ok(Some(EndpointId::from(subdomain)))
}
impl ComputeUserInfoMaybeEndpoint {
@@ -79,13 +83,11 @@ impl ComputeUserInfoMaybeEndpoint {
use ComputeUserInfoParseError::*;
// Some parameters are stored in the startup message.
let get_param = |key| params.get(key).ok_or(MissingKey(key));
let user: RoleName = get_param("user")?.into();
let user: RoleName = params.user().ok_or(MissingKey("user"))?.into();
// record the values if we have them
ctx.set_application(params.get("application_name").map(SmolStr::from));
ctx.set_user(user.clone());
ctx.set_endpoint_id(sni.map(EndpointId::from));
// Project name might be passed via PG's command-line options.
let endpoint_option = params
@@ -103,7 +105,7 @@ impl ComputeUserInfoMaybeEndpoint {
let endpoint_from_domain = if let Some(sni_str) = sni {
if let Some(cn) = common_names {
Some(EndpointId::from(endpoint_sni(sni_str, cn)?))
endpoint_sni(sni_str, cn)?
} else {
None
}
@@ -117,13 +119,18 @@ impl ComputeUserInfoMaybeEndpoint {
Some(Err(InconsistentProjectNames { domain, option }))
}
// Invariant: project name may not contain certain characters.
(a, b) => a.or(b).map(|name| match project_name_valid(&name) {
(a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) {
false => Err(MalformedProjectName(name)),
true => Ok(name),
}),
}
.transpose()?;
if let Some(ep) = &endpoint {
ctx.set_endpoint_id(ep.clone());
tracing::Span::current().record("ep", &tracing::field::display(ep));
}
info!(%user, project = endpoint.as_deref(), "credentials");
if sni.is_some() {
info!("Connection with sni");
@@ -146,7 +153,7 @@ impl ComputeUserInfoMaybeEndpoint {
Ok(Self {
user,
endpoint_id: endpoint.map(EndpointId::from),
endpoint_id: endpoint,
options,
})
}

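The endpoint_sni signature change above encodes a new case: a certificate-valid SNI can legitimately name no endpoint, because the serverless driver connects under a reserved subdomain that now maps to Ok(None) instead of being mistaken for an endpoint id. A simplified, self-contained sketch of the rule (EndpointId replaced by String, the error type by a message, and "api" standing in for the real SERVERLESS_DRIVER_SNI constant):

use std::collections::HashSet;

const SERVERLESS_DRIVER_SNI: &str = "api"; // placeholder value

fn endpoint_sni(sni: &str, common_names: &HashSet<String>) -> Result<Option<String>, String> {
    let Some((subdomain, common_name)) = sni.split_once('.') else {
        return Err(format!("unknown common name: {sni}"));
    };
    if !common_names.contains(common_name) {
        return Err(format!("unknown common name: {common_name}"));
    }
    if subdomain == SERVERLESS_DRIVER_SNI {
        return Ok(None); // reserved SNI: no endpoint encoded
    }
    Ok(Some(subdomain.to_string()))
}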

@@ -172,7 +172,7 @@ pub(super) fn validate_password_and_exchange(
secret: AuthSecret,
) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
match secret {
#[cfg(feature = "testing")]
#[cfg(any(test, feature = "testing"))]
AuthSecret::Md5(_) => {
// test only
Ok(sasl::Outcome::Success(ComputeCredentialKeys::Password(


@@ -272,5 +272,5 @@ async fn handle_client(
let client = tokio::net::TcpStream::connect(destination).await?;
let metrics_aux: MetricsAuxInfo = Default::default();
proxy::proxy::proxy_pass(ctx, tls_stream, client, metrics_aux).await
proxy::proxy::passthrough::proxy_pass(ctx, tls_stream, client, metrics_aux).await
}


@@ -1,5 +1,6 @@
use futures::future::Either;
use proxy::auth;
use proxy::auth::backend::MaybeOwned;
use proxy::config::AuthenticationConfig;
use proxy::config::CacheOptions;
use proxy::config::HttpConfig;
@@ -17,9 +18,9 @@ use proxy::usage_metrics;
use anyhow::bail;
use proxy::config::{self, ProxyConfig};
use proxy::serverless;
use std::net::SocketAddr;
use std::pin::pin;
use std::sync::Arc;
use std::{borrow::Cow, net::SocketAddr};
use tokio::net::TcpListener;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
@@ -32,6 +33,9 @@ project_build_tag!(BUILD_TAG);
use clap::{Parser, ValueEnum};
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
#[derive(Clone, Debug, ValueEnum)]
enum AuthBackend {
Console,
@@ -187,6 +191,13 @@ async fn main() -> anyhow::Result<()> {
info!("Build_tag: {BUILD_TAG}");
::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG);
match proxy::jemalloc::MetricRecorder::new(prometheus::default_registry()) {
Ok(t) => {
t.start();
}
Err(e) => tracing::error!(error = ?e, "could not start jemalloc metrics loop"),
}
let args = ProxyCliArgs::parse();
let config = build_config(&args)?;
@@ -249,18 +260,13 @@ async fn main() -> anyhow::Result<()> {
}
if let auth::BackendType::Console(api, _) = &config.auth_backend {
match &**api {
proxy::console::provider::ConsoleBackend::Console(api) => {
let cache = api.caches.project_info.clone();
if let Some(url) = args.redis_notifications {
info!("Starting redis notifications listener ({url})");
maintenance_tasks
.spawn(notifications::task_main(url.to_owned(), cache.clone()));
}
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
if let proxy::console::provider::ConsoleBackend::Console(api) = &**api {
let cache = api.caches.project_info.clone();
if let Some(url) = args.redis_notifications {
info!("Starting redis notifications listener ({url})");
maintenance_tasks.spawn(notifications::task_main(url.to_owned(), cache.clone()));
}
#[cfg(feature = "testing")]
proxy::console::provider::ConsoleBackend::Postgres(_) => {}
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
}
}
@@ -359,18 +365,18 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
let api = console::provider::neon::Api::new(endpoint, caches, locks);
let api = console::provider::ConsoleBackend::Console(api);
auth::BackendType::Console(Cow::Owned(api), ())
auth::BackendType::Console(MaybeOwned::Owned(api), ())
}
#[cfg(feature = "testing")]
AuthBackend::Postgres => {
let url = args.auth_endpoint.parse()?;
let api = console::provider::mock::Api::new(url);
let api = console::provider::ConsoleBackend::Postgres(api);
auth::BackendType::Console(Cow::Owned(api), ())
auth::BackendType::Console(MaybeOwned::Owned(api), ())
}
AuthBackend::Link => {
let url = args.uri.parse()?;
auth::BackendType::Link(Cow::Owned(url))
auth::BackendType::Link(MaybeOwned::Owned(url))
}
};
let http_config = HttpConfig {


@@ -1,7 +1,7 @@
use anyhow::{bail, Context};
use anyhow::Context;
use dashmap::DashMap;
use pq_proto::CancelKeyData;
use std::net::SocketAddr;
use std::{net::SocketAddr, sync::Arc};
use tokio::net::TcpStream;
use tokio_postgres::{CancelToken, NoTls};
use tracing::info;
@@ -25,39 +25,31 @@ impl CancelMap {
}
/// Run async action within an ephemeral session identified by [`CancelKeyData`].
pub async fn with_session<'a, F, R, V>(&'a self, f: F) -> anyhow::Result<V>
where
F: FnOnce(Session<'a>) -> R,
R: std::future::Future<Output = anyhow::Result<V>>,
{
pub fn get_session(self: Arc<Self>) -> Session {
// HACK: We'd rather get the real backend_pid but tokio_postgres doesn't
// expose it and we don't want to do another roundtrip to query
// for it. The client will be able to notice that this is not the
// actual backend_pid, but backend_pid is not used for anything
// so it doesn't matter.
let key = rand::random();
let key = loop {
let key = rand::random();
// Random key collisions are unlikely to happen here, but they're still possible,
// which is why we have to take care not to rewrite an existing key.
match self.0.entry(key) {
dashmap::mapref::entry::Entry::Occupied(_) => {
bail!("query cancellation key already exists: {key}")
// Random key collisions are unlikely to happen here, but they're still possible,
// which is why we have to take care not to rewrite an existing key.
match self.0.entry(key) {
dashmap::mapref::entry::Entry::Occupied(_) => continue,
dashmap::mapref::entry::Entry::Vacant(e) => {
e.insert(None);
}
}
dashmap::mapref::entry::Entry::Vacant(e) => {
e.insert(None);
}
}
// This will guarantee that the session gets dropped
// as soon as the future is finished.
scopeguard::defer! {
self.0.remove(&key);
info!("dropped query cancellation key {key}");
}
break key;
};
info!("registered new query cancellation key {key}");
let session = Session::new(key, self);
f(session).await
Session {
key,
cancel_map: self,
}
}
#[cfg(test)]
@@ -98,23 +90,17 @@ impl CancelClosure {
}
/// Helper for registering query cancellation tokens.
pub struct Session<'a> {
pub struct Session {
/// The user-facing key identifying this session.
key: CancelKeyData,
/// The [`CancelMap`] this session belongs to.
cancel_map: &'a CancelMap,
cancel_map: Arc<CancelMap>,
}
impl<'a> Session<'a> {
fn new(key: CancelKeyData, cancel_map: &'a CancelMap) -> Self {
Self { key, cancel_map }
}
}
impl Session<'_> {
impl Session {
/// Store the cancel token for the given session.
/// This enables query cancellation in `crate::proxy::prepare_client_connection`.
pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData {
pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData {
info!("enabling query cancellation for this session");
self.cancel_map.0.insert(self.key, Some(cancel_closure));
@@ -122,37 +108,26 @@ impl Session<'_> {
}
}
impl Drop for Session {
fn drop(&mut self) {
self.cancel_map.0.remove(&self.key);
info!("dropped query cancellation key {}", &self.key);
}
}
#[cfg(test)]
mod tests {
use super::*;
use once_cell::sync::Lazy;
#[tokio::test]
async fn check_session_drop() -> anyhow::Result<()> {
static CANCEL_MAP: Lazy<CancelMap> = Lazy::new(Default::default);
let (tx, rx) = tokio::sync::oneshot::channel();
let task = tokio::spawn(CANCEL_MAP.with_session(|session| async move {
assert!(CANCEL_MAP.contains(&session));
tx.send(()).expect("failed to send");
futures::future::pending::<()>().await; // sleep forever
Ok(())
}));
// Wait until the task has been spawned.
rx.await.context("failed to hear from the task")?;
// Drop the session's entry by cancelling the task.
task.abort();
let error = task.await.expect_err("task should have failed");
if !error.is_cancelled() {
anyhow::bail!(error);
}
let cancel_map: Arc<CancelMap> = Default::default();
let session = cancel_map.clone().get_session();
assert!(cancel_map.contains(&session));
drop(session);
// Check that the session has been dropped.
assert!(CANCEL_MAP.is_empty());
assert!(cancel_map.is_empty());
Ok(())
}

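The cancellation refactor above trades the closure-scoped with_session API for a plain guard: get_session loops on the (unlikely) random-key collision instead of bailing, and the new Drop impl removes the key, doing exactly what scopeguard::defer! did before. The shape of that pattern as a generic sketch (KeyMap and the injected key generator are illustrative, not proxy types):

use std::collections::{hash_map::Entry, HashMap};
use std::sync::{Arc, Mutex};

#[derive(Default)]
struct KeyMap(Mutex<HashMap<u64, ()>>);

struct Session {
    key: u64,
    map: Arc<KeyMap>,
}

impl KeyMap {
    fn get_session(self: Arc<Self>, mut next_key: impl FnMut() -> u64) -> Session {
        let key = loop {
            let key = next_key();
            let mut m = self.0.lock().unwrap();
            // Retry on collision rather than erroring, as the patched code does.
            if let Entry::Vacant(e) = m.entry(key) {
                e.insert(());
                break key;
            }
        };
        Session { key, map: self }
    }
}

impl Drop for Session {
    // The guard cleans up after itself; no closure or defer! needed.
    fn drop(&mut self) {
        self.map.0.lock().unwrap().remove(&self.key);
    }
}

fn main() {
    let map: Arc<KeyMap> = Arc::new(KeyMap::default());
    let mut k = 0u64;
    let session = map.clone().get_session(|| { k += 1; k });
    assert!(map.0.lock().unwrap().contains_key(&session.key));
    drop(session);
    assert!(map.0.lock().unwrap().is_empty());
}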

@@ -89,13 +89,13 @@ impl ConnCfg {
pub fn set_startup_params(&mut self, params: &StartupMessageParams) {
// Only set `user` if it's not present in the config.
// Link auth flow takes username from the console's response.
if let (None, Some(user)) = (self.get_user(), params.get("user")) {
if let (None, Some(user)) = (self.get_user(), params.user()) {
self.user(user);
}
// Only set `dbname` if it's not present in the config.
// Link auth flow takes dbname from the console's response.
if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) {
if let (None, Some(dbname)) = (self.get_dbname(), params.database()) {
self.dbname(dbname);
}
@@ -110,7 +110,7 @@ impl ConnCfg {
}
// TODO: This is especially ugly...
if let Some(replication) = params.get("replication") {
if let Some(replication) = params.replication() {
use tokio_postgres::config::ReplicationMode;
match replication {
"true" | "on" | "yes" | "1" => {

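These hunks swap stringly-typed params.get("user") lookups for named accessors on StartupMessageParams. A hypothetical sketch of what such accessors look like, to show why the call sites get safer (the real methods live in pq_proto and may differ):

use std::collections::HashMap;

// Hypothetical stand-in for pq_proto::StartupMessageParams.
struct Params(HashMap<String, String>);

impl Params {
    fn get(&self, key: &str) -> Option<&str> {
        self.0.get(key).map(String::as_str)
    }
    // Named accessors keep the well-known keys in one place,
    // so call sites can't typo "user" or "database".
    fn user(&self) -> Option<&str> { self.get("user") }
    fn database(&self) -> Option<&str> { self.get("database") }
    fn replication(&self) -> Option<&str> { self.get("replication") }
}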

@@ -100,31 +100,6 @@ pub struct MetricsAuxInfo {
pub branch_id: BranchId,
}
impl MetricsAuxInfo {
/// Definitions of labels for traffic metric.
pub const TRAFFIC_LABELS: &'static [&'static str] = &[
// Received (rx) / sent (tx).
"direction",
// ID of a project.
"project_id",
// ID of an endpoint within a project.
"endpoint_id",
// ID of a branch within a project (snapshot).
"branch_id",
];
/// Values of labels for traffic metric.
// TODO: add more type safety (validate arity & positions).
pub fn traffic_labels(&self, direction: &'static str) -> [&str; 4] {
[
direction,
&self.project_id,
&self.endpoint_id,
&self.branch_id,
]
}
}
#[cfg(test)]
mod tests {
use super::*;


@@ -1,4 +1,4 @@
#[cfg(feature = "testing")]
#[cfg(any(test, feature = "testing"))]
pub mod mock;
pub mod neon;
@@ -199,7 +199,7 @@ pub mod errors {
/// Auth secret which is managed by the cloud.
#[derive(Clone, Eq, PartialEq, Debug)]
pub enum AuthSecret {
#[cfg(feature = "testing")]
#[cfg(any(test, feature = "testing"))]
/// Md5 hash of user's password.
Md5([u8; 16]),
@@ -250,11 +250,11 @@ pub trait Api {
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;
async fn get_allowed_ips(
async fn get_allowed_ips_and_secret(
&self,
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedAllowedIps, errors::GetAuthInfoError>;
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError>;
/// Wake up the compute node and return the corresponding connection info.
async fn wake_compute(
@@ -264,13 +264,16 @@ pub trait Api {
) -> Result<CachedNodeInfo, errors::WakeComputeError>;
}
#[derive(Clone)]
#[non_exhaustive]
pub enum ConsoleBackend {
/// Current Cloud API (V2).
Console(neon::Api),
/// Local mock of Cloud API (V2).
#[cfg(feature = "testing")]
#[cfg(any(test, feature = "testing"))]
Postgres(mock::Api),
/// Internal testing
#[cfg(test)]
Test(Box<dyn crate::auth::backend::TestBackend>),
}
#[async_trait]
@@ -283,21 +286,25 @@ impl Api for ConsoleBackend {
use ConsoleBackend::*;
match self {
Console(api) => api.get_role_secret(ctx, user_info).await,
#[cfg(feature = "testing")]
#[cfg(any(test, feature = "testing"))]
Postgres(api) => api.get_role_secret(ctx, user_info).await,
#[cfg(test)]
Test(_) => unreachable!("this function should never be called in the test backend"),
}
}
async fn get_allowed_ips(
async fn get_allowed_ips_and_secret(
&self,
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedAllowedIps, errors::GetAuthInfoError> {
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
use ConsoleBackend::*;
match self {
Console(api) => api.get_allowed_ips(ctx, user_info).await,
#[cfg(feature = "testing")]
Postgres(api) => api.get_allowed_ips(ctx, user_info).await,
Console(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Postgres(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
#[cfg(test)]
Test(api) => api.get_allowed_ips_and_secret(),
}
}
@@ -310,8 +317,10 @@ impl Api for ConsoleBackend {
match self {
Console(api) => api.wake_compute(ctx, user_info).await,
#[cfg(feature = "testing")]
#[cfg(any(test, feature = "testing"))]
Postgres(api) => api.wake_compute(ctx, user_info).await,
#[cfg(test)]
Test(api) => api.wake_compute(),
}
}
}

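Worth noting in the ConsoleBackend changes above: the test backend moves from BackendType into a #[cfg(test)] variant of ConsoleBackend itself, so production builds compile neither the variant nor its match arms. A minimal sketch of that cfg-gated-variant pattern (Backend and TestBackend here are stand-ins, not the proxy types):

trait TestBackend: Send + Sync {
    fn wake_compute(&self) -> u32;
}

enum Backend {
    Real(u32),
    // Compiled (and matched) only in unit tests.
    #[cfg(test)]
    Test(Box<dyn TestBackend>),
}

impl Backend {
    fn wake_compute(&self) -> u32 {
        match self {
            Backend::Real(n) => *n,
            #[cfg(test)]
            Backend::Test(t) => t.wake_compute(),
        }
    }
}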

@@ -157,14 +157,17 @@ impl super::Api for Api {
))
}
async fn get_allowed_ips(
async fn get_allowed_ips_and_secret(
&self,
_ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedAllowedIps, GetAuthInfoError> {
Ok(Cached::new_uncached(Arc::new(
self.do_get_auth_info(user_info).await?.allowed_ips,
)))
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
Ok((
Cached::new_uncached(Arc::new(
self.do_get_auth_info(user_info).await?.allowed_ips,
)),
None,
))
}
#[tracing::instrument(skip_all)]


@@ -19,7 +19,6 @@ use tokio::time::Instant;
use tokio_postgres::config::SslMode;
use tracing::{error, info, info_span, warn, Instrument};
#[derive(Clone)]
pub struct Api {
endpoint: http::Endpoint,
pub caches: &'static ApiCaches,
@@ -194,17 +193,17 @@ impl super::Api for Api {
Ok(Cached::new_uncached(auth_info.secret))
}
async fn get_allowed_ips(
async fn get_allowed_ips_and_secret(
&self,
ctx: &mut RequestMonitoring,
user_info: &ComputeUserInfo,
) -> Result<CachedAllowedIps, GetAuthInfoError> {
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
let ep = &user_info.endpoint;
if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(ep) {
ALLOWED_IPS_BY_CACHE_OUTCOME
.with_label_values(&["hit"])
.inc();
return Ok(allowed_ips);
return Ok((allowed_ips, None));
}
ALLOWED_IPS_BY_CACHE_OUTCOME
.with_label_values(&["miss"])
@@ -223,7 +222,10 @@ impl super::Api for Api {
.project_info
.insert_allowed_ips(&project_id, ep, allowed_ips.clone());
}
Ok(Cached::new_uncached(allowed_ips))
Ok((
Cached::new_uncached(allowed_ips),
Some(Cached::new_uncached(auth_info.secret)),
))
}
#[tracing::instrument(skip_all)]


@@ -89,8 +89,11 @@ impl RequestMonitoring {
self.project = Some(x.project_id);
}
pub fn set_endpoint_id(&mut self, endpoint_id: Option<EndpointId>) {
self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone());
pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
crate::metrics::CONNECTING_ENDPOINTS
.with_label_values(&[self.protocol])
.measure(&endpoint_id);
self.endpoint_id = Some(endpoint_id);
}
pub fn set_application(&mut self, app: Option<SmolStr>) {

proxy/src/jemalloc.rs (new file)

@@ -0,0 +1,100 @@
use std::time::Duration;
use metrics::IntGauge;
use prometheus::{register_int_gauge_with_registry, Registry};
use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version};
pub struct MetricRecorder {
epoch: epoch_mib,
active: stats::active_mib,
active_gauge: IntGauge,
allocated: stats::allocated_mib,
allocated_gauge: IntGauge,
mapped: stats::mapped_mib,
mapped_gauge: IntGauge,
metadata: stats::metadata_mib,
metadata_gauge: IntGauge,
resident: stats::resident_mib,
resident_gauge: IntGauge,
retained: stats::retained_mib,
retained_gauge: IntGauge,
}
impl MetricRecorder {
pub fn new(registry: &Registry) -> Result<Self, anyhow::Error> {
tracing::info!(
config = config::malloc_conf::read()?,
version = version::read()?,
"starting jemalloc recorder"
);
Ok(Self {
epoch: epoch::mib()?,
active: stats::active::mib()?,
active_gauge: register_int_gauge_with_registry!(
"jemalloc_active_bytes",
"Total number of bytes in active pages allocated by the process",
registry
)?,
allocated: stats::allocated::mib()?,
allocated_gauge: register_int_gauge_with_registry!(
"jemalloc_allocated_bytes",
"Total number of bytes allocated by the process",
registry
)?,
mapped: stats::mapped::mib()?,
mapped_gauge: register_int_gauge_with_registry!(
"jemalloc_mapped_bytes",
"Total number of bytes in active extents mapped by the allocator",
registry
)?,
metadata: stats::metadata::mib()?,
metadata_gauge: register_int_gauge_with_registry!(
"jemalloc_metadata_bytes",
"Total number of bytes dedicated to jemalloc metadata",
registry
)?,
resident: stats::resident::mib()?,
resident_gauge: register_int_gauge_with_registry!(
"jemalloc_resident_bytes",
"Total number of bytes in physically resident data pages mapped by the allocator",
registry
)?,
retained: stats::retained::mib()?,
retained_gauge: register_int_gauge_with_registry!(
"jemalloc_retained_bytes",
"Total number of bytes in virtual memory mappings that were retained rather than being returned to the operating system",
registry
)?,
})
}
fn _poll(&self) -> Result<(), anyhow::Error> {
self.epoch.advance()?;
self.active_gauge.set(self.active.read()? as i64);
self.allocated_gauge.set(self.allocated.read()? as i64);
self.mapped_gauge.set(self.mapped.read()? as i64);
self.metadata_gauge.set(self.metadata.read()? as i64);
self.resident_gauge.set(self.resident.read()? as i64);
self.retained_gauge.set(self.retained.read()? as i64);
Ok(())
}
#[inline]
pub fn poll(&self) {
if let Err(error) = self._poll() {
tracing::warn!(%error, "Failed to poll jemalloc stats");
}
}
pub fn start(self) -> tokio::task::JoinHandle<()> {
tokio::task::spawn(async move {
let mut interval = tokio::time::interval(Duration::from_secs(15));
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
loop {
self.poll();
interval.tick().await;
}
})
}
}
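A minimal wiring sketch, assuming the surrounding startup code (the registry creation and runtime setup are not shown in this diff); only `MetricRecorder::new` and `start` come from the file above:

```rust
// Assumed startup wiring: register the jemalloc gauges, then spawn the
// 15-second polling loop. `start` spawns a tokio task, so a runtime must
// already be running.
let registry = prometheus::Registry::new();
match crate::jemalloc::MetricRecorder::new(&registry) {
    Ok(recorder) => {
        let _handle = recorder.start(); // detached; polls for the process lifetime
    }
    Err(error) => tracing::warn!(%error, "failed to initialize jemalloc metrics"),
}
```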


@@ -16,6 +16,7 @@ pub mod console;
pub mod context;
pub mod error;
pub mod http;
pub mod jemalloc;
pub mod logging;
pub mod metrics;
pub mod parse;


@@ -1,10 +1,7 @@
use ::metrics::{
exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec,
IntCounterPairVec, IntCounterVec,
};
use prometheus::{
register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec,
IntGaugeVec,
exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec,
register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge_vec, Histogram,
HistogramVec, HyperLogLogVec, IntCounterPairVec, IntCounterVec, IntGaugeVec,
};
use once_cell::sync::Lazy;
@@ -211,15 +208,6 @@ pub static NUM_WAKEUP_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
.unwrap()
});
pub static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_io_bytes_per_client",
"Number of bytes sent/received between client and backend.",
crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS,
)
.unwrap()
});
pub static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_io_bytes",
@@ -236,3 +224,13 @@ pub const fn bool_to_str(x: bool) -> &'static str {
"false"
}
}
pub static CONNECTING_ENDPOINTS: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
register_hll_vec!(
32,
"proxy_connecting_endpoints",
"HLL approximate cardinality of endpoints that are connecting",
&["protocol"],
)
.unwrap()
});
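`register_hll_vec!` comes from the in-tree metrics crate, so its internals are not shown here. As a rough illustration of what a HyperLogLog sketch does (a toy, not the crate's implementation):

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

/// Toy HyperLogLog with 2^B registers, illustrating the idea behind
/// `proxy_connecting_endpoints`. Real implementations are lock-free and add
/// range corrections; this one is neither.
struct ToyHll<const B: u32> {
    registers: Vec<u8>,
}

impl<const B: u32> ToyHll<B> {
    fn new() -> Self {
        Self { registers: vec![0u8; 1usize << B] }
    }

    fn measure<T: Hash>(&mut self, value: &T) {
        let mut h = DefaultHasher::new();
        value.hash(&mut h);
        let x = h.finish();
        let idx = (x >> (64 - B)) as usize; // first B bits select a register
        // Rank = position of the first 1-bit among the remaining bits; the
        // `| 1` bounds leading_zeros so the rank always fits in a u8.
        let rank = ((x << B) | 1).leading_zeros() as u8 + 1;
        self.registers[idx] = self.registers[idx].max(rank);
    }

    fn estimate(&self) -> f64 {
        let m = self.registers.len() as f64;
        let z: f64 = self.registers.iter().map(|&r| 2f64.powi(-(r as i32))).sum();
        0.7213 / (1.0 + 1.079 / m) * m * m / z // raw estimator, no corrections
    }
}
```

A `ToyHll::<12>` uses 4096 single-byte registers for roughly ±1.6% relative error (≈1.04/√m), and memory stays fixed no matter how many distinct endpoints are measured, which is the point of using an HLL here instead of a per-endpoint label.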


@@ -2,37 +2,35 @@
mod tests;
pub mod connect_compute;
pub mod handshake;
pub mod passthrough;
pub mod retry;
pub mod wake_compute;
use crate::{
auth,
cancellation::{self, CancelMap},
compute,
config::{AuthenticationConfig, ProxyConfig, TlsConfig},
console::messages::MetricsAuxInfo,
config::{ProxyConfig, TlsConfig},
context::RequestMonitoring,
metrics::{
NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER,
NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE,
},
metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE},
protocol2::WithClientIp,
proxy::{handshake::handshake, passthrough::proxy_pass},
rate_limiter::EndpointRateLimiter,
stream::{PqStream, Stream},
usage_metrics::{Ids, USAGE_METRICS},
EndpointCacheKey,
};
use anyhow::{bail, Context};
use futures::TryFutureExt;
use itertools::Itertools;
use once_cell::sync::OnceCell;
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
use pq_proto::{BeMessage as Be, StartupMessageParams};
use regex::Regex;
use smol_str::{format_smolstr, SmolStr};
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
use tokio_util::sync::CancellationToken;
use tracing::{error, info, info_span, Instrument};
use utils::measured_stream::MeasuredStream;
use self::connect_compute::{connect_to_compute, TcpMechanism};
@@ -80,6 +78,13 @@ pub async fn task_main(
let cancel_map = Arc::clone(&cancel_map);
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
let session_span = info_span!(
"handle_client",
?session_id,
peer_addr = tracing::field::Empty,
ep = tracing::field::Empty,
);
connections.spawn(
async move {
info!("accepted postgres client connection");
@@ -103,22 +108,18 @@ pub async fn task_main(
handle_client(
config,
&mut ctx,
&cancel_map,
cancel_map,
socket,
ClientMode::Tcp,
endpoint_rate_limiter,
)
.await
}
.instrument(info_span!(
"handle_client",
?session_id,
peer_addr = tracing::field::Empty
))
.unwrap_or_else(move |e| {
// Acknowledge that the task has finished with an error.
error!(?session_id, "per-client task finished with an error: {e:#}");
}),
error!("per-client task finished with an error: {e:#}");
})
.instrument(session_span),
);
}
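The session span now declares `peer_addr` and `ep` as `tracing::field::Empty` so they can be filled in once known (the endpoint only becomes available after the startup packet is parsed). A small sketch of that tracing pattern; the address literal is a placeholder:

```rust
use tracing::field;

// Fields must be declared at span creation (even as Empty) to be recordable;
// they can then be filled in later, once the value is known.
let span = tracing::info_span!("handle_client", peer_addr = field::Empty);
// ... after the peer address has been read:
span.record("peer_addr", field::display("10.0.0.1")); // placeholder value
```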
@@ -171,7 +172,7 @@ impl ClientMode {
pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
config: &'static ProxyConfig,
ctx: &mut RequestMonitoring,
cancel_map: &CancelMap,
cancel_map: Arc<CancelMap>,
stream: S,
mode: ClientMode,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
@@ -192,138 +193,88 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let tls = config.tls_config.as_ref();
let pause = ctx.latency_timer.pause();
let do_handshake = handshake(stream, mode.handshake_tls(tls), cancel_map);
let do_handshake = handshake(stream, mode.handshake_tls(tls), &cancel_map);
let (mut stream, params) = match do_handshake.await? {
Some(x) => x,
None => return Ok(()), // it's a cancellation request
};
drop(pause);
let hostname = mode.hostname(stream.get_ref());
let common_names = tls.map(|tls| &tls.common_names);
// Extract credentials which we're going to use for auth.
let user_info = {
let hostname = mode.hostname(stream.get_ref());
let result = config
.auth_backend
.as_ref()
.map(|_| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, &params, hostname, common_names))
.transpose();
let common_names = tls.map(|tls| &tls.common_names);
let result = config
.auth_backend
.as_ref()
.map(|_| {
auth::ComputeUserInfoMaybeEndpoint::parse(ctx, &params, hostname, common_names)
})
.transpose();
let user_info = match result {
Ok(user_info) => user_info,
Err(e) => stream.throw_error(e).await?,
};
match result {
Ok(user_info) => user_info,
Err(e) => stream.throw_error(e).await?,
// check rate limit
if let Some(ep) = user_info.get_endpoint() {
if !endpoint_rate_limiter.check(ep) {
return stream
.throw_error(auth::AuthError::too_many_connections())
.await;
}
}
let user = user_info.get_user().to_owned();
let (mut node_info, user_info) = match user_info
.authenticate(
ctx,
&mut stream,
mode.allow_cleartext(),
&config.authentication_config,
)
.await
{
Ok(auth_result) => auth_result,
Err(e) => {
let db = params.database();
let app = params.get("application_name");
let params_span = tracing::info_span!("", ?user, ?db, ?app);
return stream.throw_error(e).instrument(params_span).await;
}
};
ctx.set_endpoint_id(user_info.get_endpoint());
node_info.allow_self_signed_compute = mode.allow_self_signed_compute(config);
let client = Client::new(
stream,
user_info,
&params,
mode.allow_self_signed_compute(config),
endpoint_rate_limiter,
);
cancel_map
.with_session(|session| {
client.connect_to_db(ctx, session, mode, &config.authentication_config)
})
.await
}
let aux = node_info.aux.clone();
let mut node = connect_to_compute(
ctx,
&TcpMechanism { params: &params },
node_info,
&user_info,
)
.or_else(|e| stream.throw_error(e))
.await?;
/// Establish a (most probably, secure) connection with the client.
/// For better testing experience, `stream` can be any object satisfying the traits.
/// It's easier to work with owned `stream` here as we need to upgrade it to TLS;
/// we also take extra care to propagate only select handshake errors to the client.
#[tracing::instrument(skip_all)]
async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
stream: S,
mut tls: Option<&TlsConfig>,
cancel_map: &CancelMap,
) -> anyhow::Result<Option<(PqStream<Stream<S>>, StartupMessageParams)>> {
// Client may try upgrading to each protocol only once
let (mut tried_ssl, mut tried_gss) = (false, false);
let session = cancel_map.get_session();
prepare_client_connection(&node, &session, &mut stream).await?;
let mut stream = PqStream::new(Stream::from_raw(stream));
loop {
let msg = stream.read_startup_packet().await?;
info!("received {msg:?}");
// Before proxy passing, forward to compute whatever data is left in the
// PqStream input buffer. Normally there is none, but our serverless npm
// driver in pipeline mode sends startup, password and first query
// immediately after opening the connection.
let (stream, read_buf) = stream.into_inner();
node.stream.write_all(&read_buf).await?;
use FeStartupPacket::*;
match msg {
SslRequest => match stream.get_ref() {
Stream::Raw { .. } if !tried_ssl => {
tried_ssl = true;
// We can't perform TLS handshake without a config
let enc = tls.is_some();
stream.write_message(&Be::EncryptionResponse(enc)).await?;
if let Some(tls) = tls.take() {
// Upgrade raw stream into a secure TLS-backed stream.
// NOTE: We've consumed `tls`; this fact will be used later.
let (raw, read_buf) = stream.into_inner();
// TODO: Normally, client doesn't send any data before
// server says the TLS handshake is OK and read_buf is empty.
// However, you could imagine pipelining of postgres
// SSLRequest + TLS ClientHello in one hunk similar to
// pipelining in our node js driver. We should probably
// support that by chaining read_buf with the stream.
if !read_buf.is_empty() {
bail!("data is sent before server replied with EncryptionResponse");
}
let tls_stream = raw.upgrade(tls.to_server_config()).await?;
let (_, tls_server_end_point) = tls
.cert_resolver
.resolve(tls_stream.get_ref().1.server_name())
.context("missing certificate")?;
stream = PqStream::new(Stream::Tls {
tls: Box::new(tls_stream),
tls_server_end_point,
});
}
}
_ => bail!(ERR_PROTO_VIOLATION),
},
GssEncRequest => match stream.get_ref() {
Stream::Raw { .. } if !tried_gss => {
tried_gss = true;
// Currently, we don't support GSSAPI
stream.write_message(&Be::EncryptionResponse(false)).await?;
}
_ => bail!(ERR_PROTO_VIOLATION),
},
StartupMessage { params, .. } => {
// Check that the config has been consumed during upgrade
// OR we didn't provide it at all (for dev purposes).
if tls.is_some() {
stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
}
info!(session_type = "normal", "successful handshake");
break Ok(Some((stream, params)));
}
CancelRequest(cancel_key_data) => {
cancel_map.cancel_session(cancel_key_data).await?;
info!(session_type = "cancellation", "successful handshake");
break Ok(None);
}
}
}
proxy_pass(ctx, stream, node.stream, aux).await
}
/// Finish client connection initialization: confirm auth success, send params, etc.
#[tracing::instrument(skip_all)]
async fn prepare_client_connection(
node: &compute::PostgresConnection,
session: cancellation::Session<'_>,
session: &cancellation::Session,
stream: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> anyhow::Result<()> {
// Register compute's query cancellation token and produce a new, unique one.
@@ -349,151 +300,6 @@ async fn prepare_client_connection(
Ok(())
}
/// Forward bytes in both directions (client <-> compute).
#[tracing::instrument(skip_all)]
pub async fn proxy_pass(
ctx: &mut RequestMonitoring,
client: impl AsyncRead + AsyncWrite + Unpin,
compute: impl AsyncRead + AsyncWrite + Unpin,
aux: MetricsAuxInfo,
) -> anyhow::Result<()> {
ctx.set_success();
ctx.log();
let usage = USAGE_METRICS.register(Ids {
endpoint_id: aux.endpoint_id.clone(),
branch_id: aux.branch_id.clone(),
});
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
let m_sent2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx"));
let mut client = MeasuredStream::new(
client,
|_| {},
|cnt| {
// Number of bytes we sent to the client (outbound).
m_sent.inc_by(cnt as u64);
m_sent2.inc_by(cnt as u64);
usage.record_egress(cnt as u64);
},
);
let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]);
let m_recv2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx"));
let mut compute = MeasuredStream::new(
compute,
|_| {},
|cnt| {
// Number of bytes the client sent to the compute node (inbound).
m_recv.inc_by(cnt as u64);
m_recv2.inc_by(cnt as u64);
},
);
// Starting from here we only proxy the client's traffic.
info!("performing the proxy pass...");
let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?;
Ok(())
}
/// Thin connection context.
struct Client<'a, S> {
/// The underlying libpq protocol stream.
stream: PqStream<Stream<S>>,
/// Client credentials that we care about.
user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>,
/// KV-dictionary with PostgreSQL connection params.
params: &'a StartupMessageParams,
/// Allow self-signed certificates (for testing).
allow_self_signed_compute: bool,
/// Rate limiter for endpoints
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
}
impl<'a, S> Client<'a, S> {
/// Construct a new connection context.
fn new(
stream: PqStream<Stream<S>>,
user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>,
params: &'a StartupMessageParams,
allow_self_signed_compute: bool,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> Self {
Self {
stream,
user_info,
params,
allow_self_signed_compute,
endpoint_rate_limiter,
}
}
}
impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
/// Let the client authenticate and connect to the designated compute node.
// Instrumentation logs the endpoint name everywhere. This doesn't work for link
// auth; strictly speaking, we don't know the endpoint name in that case.
#[tracing::instrument(name = "", fields(ep = %self.user_info.get_endpoint().unwrap_or_default()), skip_all)]
async fn connect_to_db(
self,
ctx: &mut RequestMonitoring,
session: cancellation::Session<'_>,
mode: ClientMode,
config: &'static AuthenticationConfig,
) -> anyhow::Result<()> {
let Self {
mut stream,
user_info,
params,
allow_self_signed_compute,
endpoint_rate_limiter,
} = self;
// check rate limit
if let Some(ep) = user_info.get_endpoint() {
if !endpoint_rate_limiter.check(ep) {
return stream
.throw_error(auth::AuthError::too_many_connections())
.await;
}
}
let user = user_info.get_user().to_owned();
let auth_result = match user_info
.authenticate(ctx, &mut stream, mode.allow_cleartext(), config)
.await
{
Ok(auth_result) => auth_result,
Err(e) => {
let db = params.get("database");
let app = params.get("application_name");
let params_span = tracing::info_span!("", ?user, ?db, ?app);
return stream.throw_error(e).instrument(params_span).await;
}
};
let (mut node_info, user_info) = auth_result;
node_info.allow_self_signed_compute = allow_self_signed_compute;
let aux = node_info.aux.clone();
let mut node = connect_to_compute(ctx, &TcpMechanism { params }, node_info, &user_info)
.or_else(|e| stream.throw_error(e))
.await?;
prepare_client_connection(&node, session, &mut stream).await?;
// Before proxy passing, forward to compute whatever data is left in the
// PqStream input buffer. Normally there is none, but our serverless npm
// driver in pipeline mode sends startup, password and first query
// immediately after opening the connection.
let (stream, read_buf) = stream.into_inner();
node.stream.write_all(&read_buf).await?;
proxy_pass(ctx, stream, node.stream, aux).await
}
}
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct NeonOptions(Vec<(SmolStr, SmolStr)>);


@@ -1,15 +1,16 @@
use crate::{
auth,
compute::{self, PostgresConnection},
console::{self, errors::WakeComputeError, Api},
console::{self, errors::WakeComputeError},
context::RequestMonitoring,
metrics::{bool_to_str, NUM_CONNECTION_FAILURES, NUM_WAKEUP_FAILURES},
proxy::retry::{retry_after, ShouldRetry},
metrics::NUM_CONNECTION_FAILURES,
proxy::{
retry::{retry_after, ShouldRetry},
wake_compute::wake_compute,
},
};
use async_trait::async_trait;
use hyper::StatusCode;
use pq_proto::StartupMessageParams;
use std::ops::ControlFlow;
use tokio::time;
use tracing::{error, info, warn};
@@ -88,39 +89,6 @@ impl ConnectMechanism for TcpMechanism<'_> {
}
}
fn report_error(e: &WakeComputeError, retry: bool) {
use crate::console::errors::ApiError;
let retry = bool_to_str(retry);
let kind = match e {
WakeComputeError::BadComputeAddress(_) => "bad_compute_address",
WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error",
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::LOCKED,
ref text,
}) if text.contains("written data quota exceeded")
|| text.contains("the limit for current plan reached") =>
{
"quota_exceeded"
}
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::LOCKED,
..
}) => "api_console_locked",
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::BAD_REQUEST,
..
}) => "api_console_bad_request",
WakeComputeError::ApiError(ApiError::Console { status, .. })
if status.is_server_error() =>
{
"api_console_other_server_error"
}
WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
WakeComputeError::TimeoutError => "timeout_error",
};
NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
}
/// Try to connect to the compute node, retrying if necessary.
/// This function might update `node_info`, so we take it by `&mut`.
#[tracing::instrument(skip_all)]
@@ -137,7 +105,7 @@ where
mechanism.update_connect_config(&mut node_info.config);
// try once
let (config, err) = match mechanism
let err = match mechanism
.connect_once(ctx, &node_info, CONNECT_TIMEOUT)
.await
{
@@ -145,51 +113,27 @@ where
ctx.latency_timer.success();
return Ok(res);
}
Err(e) => {
error!(error = ?e, "could not connect to compute node");
(invalidate_cache(node_info), e)
}
Err(e) => e,
};
ctx.latency_timer.cache_miss();
error!(error = ?err, "could not connect to compute node");
let mut num_retries = 1;
// if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
info!("compute node's state has likely changed; requesting a wake-up");
let node_info = loop {
let wake_res = match user_info {
auth::BackendType::Console(api, user_info) => api.wake_compute(ctx, user_info).await,
// nothing to do?
auth::BackendType::Link(_) => return Err(err.into()),
// test backend
#[cfg(test)]
auth::BackendType::Test(x) => x.wake_compute(),
};
match user_info {
auth::BackendType::Console(api, info) => {
// if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
info!("compute node's state has likely changed; requesting a wake-up");
match handle_try_wake(wake_res, num_retries) {
Err(e) => {
error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
report_error(&e, false);
return Err(e.into());
}
// failed to wake up but we can continue to retry
Ok(ControlFlow::Continue(e)) => {
report_error(&e, true);
warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
}
// successfully woke up a compute node and can break the wakeup loop
Ok(ControlFlow::Break(mut node_info)) => {
node_info.config.reuse_password(&config);
mechanism.update_connect_config(&mut node_info.config);
break node_info;
}
ctx.latency_timer.cache_miss();
let config = invalidate_cache(node_info);
node_info = wake_compute(&mut num_retries, ctx, api, info).await?;
node_info.config.reuse_password(&config);
mechanism.update_connect_config(&mut node_info.config);
}
let wait_duration = retry_after(num_retries);
num_retries += 1;
time::sleep(wait_duration).await;
// nothing to do?
auth::BackendType::Link(_) => {}
};
// now that we have a new node, try connect to it repeatedly.
@@ -221,23 +165,3 @@ where
time::sleep(wait_duration).await;
}
}
/// Attempts to wake up the compute node.
/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
/// * Returns Ok(Break(node)) if the wakeup succeeded
/// * Returns Err(e) if there was an error
pub fn handle_try_wake(
result: Result<console::CachedNodeInfo, WakeComputeError>,
num_retries: u32,
) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
match result {
Err(err) => match &err {
WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
Ok(ControlFlow::Continue(err))
}
_ => Err(err),
},
// Ready to try again.
Ok(new) => Ok(ControlFlow::Break(new)),
}
}


@@ -0,0 +1,96 @@
use anyhow::{bail, Context};
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::info;
use crate::{
cancellation::CancelMap,
config::TlsConfig,
proxy::{ERR_INSECURE_CONNECTION, ERR_PROTO_VIOLATION},
stream::{PqStream, Stream},
};
/// Establish a (most probably, secure) connection with the client.
/// For better testing experience, `stream` can be any object satisfying the traits.
/// It's easier to work with owned `stream` here as we need to upgrade it to TLS;
/// we also take extra care to propagate only select handshake errors to the client.
#[tracing::instrument(skip_all)]
pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
stream: S,
mut tls: Option<&TlsConfig>,
cancel_map: &CancelMap,
) -> anyhow::Result<Option<(PqStream<Stream<S>>, StartupMessageParams)>> {
// Client may try upgrading to each protocol only once
let (mut tried_ssl, mut tried_gss) = (false, false);
let mut stream = PqStream::new(Stream::from_raw(stream));
loop {
let msg = stream.read_startup_packet().await?;
info!("received {msg:?}");
use FeStartupPacket::*;
match msg {
SslRequest => match stream.get_ref() {
Stream::Raw { .. } if !tried_ssl => {
tried_ssl = true;
// We can't perform TLS handshake without a config
let enc = tls.is_some();
stream.write_message(&Be::EncryptionResponse(enc)).await?;
if let Some(tls) = tls.take() {
// Upgrade raw stream into a secure TLS-backed stream.
// NOTE: We've consumed `tls`; this fact will be used later.
let (raw, read_buf) = stream.into_inner();
// TODO: Normally, client doesn't send any data before
// server says the TLS handshake is OK and read_buf is empty.
// However, you could imagine pipelining of postgres
// SSLRequest + TLS ClientHello in one hunk similar to
// pipelining in our node js driver. We should probably
// support that by chaining read_buf with the stream.
if !read_buf.is_empty() {
bail!("data is sent before server replied with EncryptionResponse");
}
let tls_stream = raw.upgrade(tls.to_server_config()).await?;
let (_, tls_server_end_point) = tls
.cert_resolver
.resolve(tls_stream.get_ref().1.server_name())
.context("missing certificate")?;
stream = PqStream::new(Stream::Tls {
tls: Box::new(tls_stream),
tls_server_end_point,
});
}
}
_ => bail!(ERR_PROTO_VIOLATION),
},
GssEncRequest => match stream.get_ref() {
Stream::Raw { .. } if !tried_gss => {
tried_gss = true;
// Currently, we don't support GSSAPI
stream.write_message(&Be::EncryptionResponse(false)).await?;
}
_ => bail!(ERR_PROTO_VIOLATION),
},
StartupMessage { params, .. } => {
// Check that the config has been consumed during upgrade
// OR we didn't provide it at all (for dev purposes).
if tls.is_some() {
stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
}
info!(session_type = "normal", "successful handshake");
break Ok(Some((stream, params)));
}
CancelRequest(cancel_key_data) => {
cancel_map.cancel_session(cancel_key_data).await?;
info!(session_type = "cancellation", "successful handshake");
break Ok(None);
}
}
}
}
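For orientation, a hypothetical client-side counterpart to the `SslRequest` arm above, using the wire format from the Postgres protocol (the function itself is illustrative, not part of the proxy):

```rust
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};

// Send the 8-byte SSLRequest packet and check whether the server is willing
// to upgrade the connection to TLS.
async fn request_tls<S: AsyncRead + AsyncWrite + Unpin>(stream: &mut S) -> std::io::Result<bool> {
    stream.write_all(&8i32.to_be_bytes()).await?; // message length, including itself
    stream.write_all(&80877103i32.to_be_bytes()).await?; // SSLRequest magic code
    let mut reply = [0u8; 1];
    stream.read_exact(&mut reply).await?;
    Ok(reply[0] == b'S') // 'S' = proceed with TLS ClientHello, 'N' = stay plaintext
}
```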


@@ -0,0 +1,53 @@
use crate::{
console::messages::MetricsAuxInfo,
context::RequestMonitoring,
metrics::NUM_BYTES_PROXIED_COUNTER,
usage_metrics::{Ids, USAGE_METRICS},
};
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::info;
use utils::measured_stream::MeasuredStream;
/// Forward bytes in both directions (client <-> compute).
#[tracing::instrument(skip_all)]
pub async fn proxy_pass(
ctx: &mut RequestMonitoring,
client: impl AsyncRead + AsyncWrite + Unpin,
compute: impl AsyncRead + AsyncWrite + Unpin,
aux: MetricsAuxInfo,
) -> anyhow::Result<()> {
ctx.set_success();
ctx.log();
let usage = USAGE_METRICS.register(Ids {
endpoint_id: aux.endpoint_id.clone(),
branch_id: aux.branch_id.clone(),
});
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
let mut client = MeasuredStream::new(
client,
|_| {},
|cnt| {
// Number of bytes we sent to the client (outbound).
m_sent.inc_by(cnt as u64);
usage.record_egress(cnt as u64);
},
);
let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]);
let mut compute = MeasuredStream::new(
compute,
|_| {},
|cnt| {
// Number of bytes the client sent to the compute node (inbound).
m_recv.inc_by(cnt as u64);
},
);
// Starting from here we only proxy the client's traffic.
info!("performing the proxy pass...");
let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?;
Ok(())
}
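The byte counters ride on `MeasuredStream`'s callbacks, so the copy loop itself stays generic: it is tokio's `copy_bidirectional`, shown standalone below for clarity (the wrapper function is illustrative):

```rust
use tokio::io::copy_bidirectional;
use tokio::net::TcpStream;

// Pump bytes in both directions on a single task. Resolves with the
// (a->b, b->a) byte totals once both directions have reached EOF; when one
// side hits EOF, the other side's write half is shut down.
async fn pump(mut a: TcpStream, mut b: TcpStream) -> std::io::Result<(u64, u64)> {
    copy_bidirectional(&mut a, &mut b).await
}
```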


@@ -5,9 +5,9 @@ mod mitm;
use super::connect_compute::ConnectMechanism;
use super::retry::ShouldRetry;
use super::*;
use crate::auth::backend::{ComputeUserInfo, TestBackend};
use crate::auth::IpPattern;
use crate::auth::backend::{ComputeUserInfo, MaybeOwned, TestBackend};
use crate::config::CertResolver;
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
use crate::console::{self, CachedNodeInfo, NodeInfo};
use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
use crate::{auth, http, sasl, scram};
@@ -371,6 +371,7 @@ enum ConnectAction {
Fail,
}
#[derive(Clone)]
struct TestConnectMechanism {
counter: Arc<std::sync::Mutex<usize>>,
sequence: Vec<ConnectAction>,
@@ -471,7 +472,10 @@ impl TestBackend for TestConnectMechanism {
}
}
fn get_allowed_ips(&self) -> Result<Vec<IpPattern>, console::errors::GetAuthInfoError> {
fn get_allowed_ips_and_secret(
&self,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>
{
unimplemented!("not used in tests")
}
}
@@ -487,9 +491,16 @@ fn helper_create_cached_node_info() -> CachedNodeInfo {
fn helper_create_connect_info(
mechanism: &TestConnectMechanism,
) -> (CachedNodeInfo, auth::BackendType<'_, ComputeUserInfo>) {
) -> (CachedNodeInfo, auth::BackendType<'static, ComputeUserInfo>) {
let cache = helper_create_cached_node_info();
let user_info = auth::BackendType::Test(mechanism);
let user_info = auth::BackendType::Console(
MaybeOwned::Owned(ConsoleBackend::Test(Box::new(mechanism.clone()))),
ComputeUserInfo {
endpoint: "endpoint".into(),
user: "user".into(),
options: NeonOptions::parse_options_raw(""),
},
);
(cache, user_info)
}


@@ -0,0 +1,95 @@
use crate::auth::backend::ComputeUserInfo;
use crate::console::{
errors::WakeComputeError,
provider::{CachedNodeInfo, ConsoleBackend},
Api,
};
use crate::context::RequestMonitoring;
use crate::metrics::{bool_to_str, NUM_WAKEUP_FAILURES};
use crate::proxy::retry::retry_after;
use hyper::StatusCode;
use std::ops::ControlFlow;
use tracing::{error, warn};
use super::retry::ShouldRetry;
/// Wake a compute node (or retrieve an existing compute session from the cache).
pub async fn wake_compute(
num_retries: &mut u32,
ctx: &mut RequestMonitoring,
api: &ConsoleBackend,
info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, WakeComputeError> {
loop {
let wake_res = api.wake_compute(ctx, info).await;
match handle_try_wake(wake_res, *num_retries) {
Err(e) => {
error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
report_error(&e, false);
return Err(e);
}
Ok(ControlFlow::Continue(e)) => {
warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
report_error(&e, true);
}
Ok(ControlFlow::Break(n)) => return Ok(n),
}
let wait_duration = retry_after(*num_retries);
*num_retries += 1;
tokio::time::sleep(wait_duration).await;
}
}
/// Attempts to wake up the compute node.
/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
/// * Returns Ok(Break(node)) if the wakeup succeeded
/// * Returns Err(e) if there was an error
pub fn handle_try_wake(
result: Result<CachedNodeInfo, WakeComputeError>,
num_retries: u32,
) -> Result<ControlFlow<CachedNodeInfo, WakeComputeError>, WakeComputeError> {
match result {
Err(err) => match &err {
WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
Ok(ControlFlow::Continue(err))
}
_ => Err(err),
},
// Ready to try again.
Ok(new) => Ok(ControlFlow::Break(new)),
}
}
fn report_error(e: &WakeComputeError, retry: bool) {
use crate::console::errors::ApiError;
let retry = bool_to_str(retry);
let kind = match e {
WakeComputeError::BadComputeAddress(_) => "bad_compute_address",
WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error",
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::LOCKED,
ref text,
}) if text.contains("written data quota exceeded")
|| text.contains("the limit for current plan reached") =>
{
"quota_exceeded"
}
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::LOCKED,
..
}) => "api_console_locked",
WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::BAD_REQUEST,
..
}) => "api_console_bad_request",
WakeComputeError::ApiError(ApiError::Console { status, .. })
if status.is_server_error() =>
{
"api_console_other_server_error"
}
WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
WakeComputeError::TimeoutError => "timeout_error",
};
NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
}
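`retry_after` lives in `proxy::retry` and is not shown in this diff; as a loudly hypothetical sketch of the shape such a helper usually takes (exponential growth from a base delay, capped):

```rust
use std::time::Duration;

// NOT the crate's retry_after: an assumed exponential-backoff schedule,
// included only to make the retry loop above concrete.
fn retry_after_sketch(num_retries: u32) -> Duration {
    let exp = num_retries.saturating_sub(1).min(6); // bound the exponent
    Duration::from_millis(100u64 << exp).min(Duration::from_secs(5))
}
```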

Some files were not shown because too many files have changed in this diff.