Fix decoiding of HeapDelete/HeapLock commands

Bump postgres versions
Remove check for the reast of heap_multi_insert WAL ercord content
2026-01-24 13:50:37 +00:00 · 2023-08-16 15:41:49 +03:00 · 2023-08-16 08:52:15 +03:00 · 2023-08-16 08:52:15 +03:00 · 2023-08-16 08:52:15 +03:00 · 2023-08-16 08:52:15 +03:00
139 changed files with 6183 additions and 2537 deletions
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -1,7 +1,20 @@
 name: 'Create Allure report'
 description: 'Generate Allure report from uploaded by actions/allure-report-store tests results'

+inputs:
+  store-test-results-into-db:
+    description: 'Whether to store test results into the database. TEST_RESULT_CONNSTR/TEST_RESULT_CONNSTR_NEW should be set'
+    type: boolean
+    required: false
+    default: false
+
 outputs:
+  base-url:
+    description: 'Base URL for Allure report'
+    value: ${{ steps.generate-report.outputs.base-url }}
+  base-s3-url:
+    description: 'Base S3 URL for Allure report'
+    value: ${{ steps.generate-report.outputs.base-s3-url }}
  report-url:
    description: 'Allure report URL'
    value: ${{ steps.generate-report.outputs.report-url }}
@@ -63,8 +76,8 @@ runs:
          rm -f ${ALLURE_ZIP}
        fi
      env:
-        ALLURE_VERSION: 2.22.1
-        ALLURE_ZIP_SHA256: fdc7a62d94b14c5e0bf25198ae1feded6b005fdbed864b4d3cb4e5e901720b0b
+        ALLURE_VERSION: 2.23.1
+        ALLURE_ZIP_SHA256: 11141bfe727504b3fd80c0f9801eb317407fd0ac983ebb57e671f14bac4bcd86

    # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
    - name: Acquire lock
@@ -102,6 +115,11 @@ runs:
        REPORT_PREFIX=reports/${BRANCH_OR_PR}
        RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID}

+        BASE_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}
+        BASE_S3_URL=s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}
+        REPORT_URL=${BASE_URL}/index.html
+        REPORT_JSON_URL=${BASE_URL}/data/suites.json
+
        # Get previously uploaded data for this run
        ZSTD_NBTHREADS=0

@@ -110,10 +128,9 @@ runs:
          # There's no previously uploaded data for this $GITHUB_RUN_ID
          exit 0
        fi
-        for S3_FILEPATH in ${S3_FILEPATHS}; do
-          time aws s3 cp --only-show-errors "s3://${BUCKET}/${S3_FILEPATH}" "${WORKDIR}"

-          archive=${WORKDIR}/$(basename $S3_FILEPATH)
+        time aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${RAW_PREFIX}/" "${WORKDIR}/"
+        for archive in $(find ${WORKDIR} -name "*.tar.zst"); do
          mkdir -p ${archive%.tar.zst}
          time tar -xf ${archive} -C ${archive%.tar.zst}
          rm -f ${archive}
@@ -130,9 +147,10 @@ runs:

        # Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
-        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"

-        REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
+        # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
+        # and to keep files on the host to upload them to the database
+        time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"

        # Generate redirect
        cat <<EOF > ${WORKDIR}/index.html
@@ -144,8 +162,10 @@ runs:
        EOF
        time aws s3 cp --only-show-errors ${WORKDIR}/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"

-        echo "report-url=${REPORT_URL}"                                   >> $GITHUB_OUTPUT
-        echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
+        echo "base-url=${BASE_URL}"               >> $GITHUB_OUTPUT
+        echo "base-s3-url=${BASE_S3_URL}"         >> $GITHUB_OUTPUT
+        echo "report-url=${REPORT_URL}"           >> $GITHUB_OUTPUT
+        echo "report-json-url=${REPORT_JSON_URL}" >> $GITHUB_OUTPUT

        echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}

@@ -159,6 +179,41 @@ runs:
          aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
        fi

+    - name: Store Allure test stat in the DB
+      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
+      shell: bash -euxo pipefail {0}
+      env:
+        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+        REPORT_JSON_URL: ${{ steps.generate-report.outputs.report-json-url }}
+      run: |
+        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR}
+
+        ./scripts/pysync
+
+        poetry run python3 scripts/ingest_regress_test_result.py \
+          --revision ${COMMIT_SHA} \
+          --reference ${GITHUB_REF} \
+          --build-type unified \
+          --ingest ${WORKDIR}/report/data/suites.json
+
+    - name: Store Allure test stat in the DB (new)
+      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
+      shell: bash -euxo pipefail {0}
+      env:
+        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+        BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
+      run: |
+        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}
+
+        ./scripts/pysync
+
+        poetry run python3 scripts/ingest_regress_test_result-new-format.py \
+          --reference ${GITHUB_REF} \
+          --revision ${COMMIT_SHA} \
+          --run-id ${GITHUB_RUN_ID} \
+          --run-attempt ${GITHUB_RUN_ATTEMPT} \
+          --test-cases-dir ${WORKDIR}/report/data/test-cases
+
    - name: Cleanup
      if: always()
      shell: bash -euxo pipefail {0}
--- a/.github/actions/download/action.yml
+++ b/.github/actions/download/action.yml
@@ -31,7 +31,7 @@ runs:
        BUCKET=neon-github-public-dev
        FILENAME=$(basename $ARCHIVE)

-        S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
+        S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
        if [ -z "${S3_KEY}" ]; then
          if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then
            echo 'SKIPPED=true' >> $GITHUB_OUTPUT
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -432,6 +432,11 @@ jobs:
        if: ${{ !cancelled() }}
        id: create-allure-report
        uses: ./.github/actions/allure-report-generate
+        with:
+          store-test-results-into-db: true
+        env:
+          REGRESS_TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
+          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

      - uses: actions/github-script@v6
        if: ${{ !cancelled() }}
@@ -452,25 +457,6 @@ jobs:
              report,
            })

-      - name: Store Allure test stat in the DB
-        if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }}
-        env:
-          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-          REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }}
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
-        run: |
-          ./scripts/pysync
-
-          curl --fail --output suites.json "${REPORT_JSON_URL}"
-          export BUILD_TYPE=unified
-          export DATABASE_URL="$TEST_RESULT_CONNSTR"
-
-          poetry run python3 scripts/ingest_regress_test_result.py \
-            --revision ${COMMIT_SHA} \
-            --reference ${GITHUB_REF} \
-            --build-type ${BUILD_TYPE} \
-            --ingest suites.json
-
  coverage-report:
    runs-on: [ self-hosted, gen3, small ]
    container:
@@ -794,7 +780,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.13.1
+      VM_BUILDER_VERSION: v0.15.4

    steps:
      - name: Checkout
@@ -1067,7 +1053,7 @@ jobs:
            OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
            FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst

-            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
+            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
            if [ -z "${S3_KEY}" ]; then
              echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
              exit 1
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -740,6 +740,9 @@ name = "cc"
 version = "1.0.79"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
+dependencies = [
+ "jobserver",
+]

 [[package]]
 name = "cexpr"
@@ -907,12 +910,14 @@ dependencies = [
 "opentelemetry",
 "postgres",
 "regex",
+ "remote_storage",
 "reqwest",
 "serde",
 "serde_json",
 "tar",
 "tokio",
 "tokio-postgres",
+ "toml_edit",
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
@@ -920,6 +925,7 @@ dependencies = [
 "url",
 "utils",
 "workspace_hack",
+ "zstd",
 ]

 [[package]]
@@ -980,6 +986,7 @@ dependencies = [
 "tar",
 "thiserror",
 "toml",
+ "tracing",
 "url",
 "utils",
 "workspace_hack",
@@ -1972,6 +1979,15 @@ version = "1.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"

+[[package]]
+name = "jobserver"
+version = "0.1.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "js-sys"
 version = "0.3.63"
@@ -3237,6 +3253,7 @@ dependencies = [
 "metrics",
 "once_cell",
 "pin-project-lite",
+ "scopeguard",
 "serde",
 "serde_json",
 "tempfile",
@@ -5296,6 +5313,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "bytes",
+ "cc",
 "chrono",
 "clap",
 "clap_builder",
@@ -5396,3 +5414,33 @@ name = "zeroize"
 version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
+
+[[package]]
+name = "zstd"
+version = "0.12.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
+dependencies = [
+ "zstd-safe",
+]
+
+[[package]]
+name = "zstd-safe"
+version = "6.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
+dependencies = [
+ "libc",
+ "zstd-sys",
+]
+
+[[package]]
+name = "zstd-sys"
+version = "2.0.8+zstd.1.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c"
+dependencies = [
+ "cc",
+ "libc",
+ "pkg-config",
+]
--- a/2
+++ b/2
@@ -51,6 +51,7 @@ RUN set -e \
      --bin safekeeper  \
      --bin storage_broker  \
      --bin proxy  \
+      --bin neon_local \
      --locked --release \
    && cachepot -s

@@ -76,6 +77,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker         /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local               /usr/local/bin

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -551,8 +551,8 @@ FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.1.tar.gz -O pg_embedding.tar.gz && \
-    echo "c4ae84eef36fa8ec5868f6e061f39812f19ee5ba3604d428d40935685c7be512 pg_embedding.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \
+    echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \
    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -816,6 +816,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libxml2, libxslt1.1 for xml2
 # libzstd1 for zstd
 # libboost*, libfreetype6, and zlib1g for rdkit
+# ca-certificates for communicating with s3 by compute_ctl
 RUN apt update &&  \
    apt install --no-install-recommends -y \
        gdb \
@@ -839,7 +840,8 @@ RUN apt update &&  \
        libcurl4-openssl-dev \
        locales \
        procps \
-        zlib1g && \
+        zlib1g \
+        ca-certificates && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

--- a/README.md
+++ b/README.md
@@ -29,13 +29,13 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
 ```bash
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
 libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
-libcurl4-openssl-dev
+libcurl4-openssl-dev openssl python-poetry
 ```
 * On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
-  protobuf-devel libcurl-devel
+  protobuf-devel libcurl-devel openssl poetry
 ```
 * On Arch based systems, these packages are needed:
 ```bash
@@ -235,6 +235,13 @@ CARGO_BUILD_FLAGS="--features=testing" make
 ./scripts/pytest
 ```

+By default, this runs both debug and release modes, and all supported postgres versions. When
+testing locally, it is convenient to run just run one set of permutations, like this:
+
+```sh
+DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
+```
+
 ## Documentation

 [docs](/docs) Contains a top-level overview of all available markdown documentation.
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -32,3 +32,6 @@ url.workspace = true
 compute_api.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
+toml_edit.workspace = true
+remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
+zstd = "0.12.4"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -5,6 +5,8 @@
 //! - `compute_ctl` accepts cluster (compute node) specification as a JSON file.
 //! - Every start is a fresh start, so the data directory is removed and
 //!   initialized again on each run.
+//! - If remote_extension_config is provided, it will be used to fetch extensions list
+//!  and download `shared_preload_libraries` from the remote storage.
 //! - Next it will put configuration files into the `PGDATA` directory.
 //! - Sync safekeepers and get commit LSN.
 //! - Get `basebackup` from pageserver using the returned on the previous step LSN.
@@ -27,7 +29,8 @@
 //! compute_ctl -D /var/db/postgres/compute \
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
 //!             -S /var/db/postgres/specs/current.json \
-//!             -b /usr/local/bin/postgres
+//!             -b /usr/local/bin/postgres \
+//!             -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}
 //! ```
 //!
 use std::collections::HashMap;
@@ -35,7 +38,7 @@ use std::fs::File;
 use std::panic;
 use std::path::Path;
 use std::process::exit;
-use std::sync::{mpsc, Arc, Condvar, Mutex};
+use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock, RwLock};
 use std::{thread, time::Duration};

 use anyhow::{Context, Result};
@@ -48,22 +51,33 @@ use compute_api::responses::ComputeStatus;

 use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
+use compute_tools::extension_server::{get_pg_version, init_remote_storage};
 use compute_tools::http::api::launch_http_server;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;

-const BUILD_TAG_DEFAULT: &str = "local";
+// this is an arbitrary build tag. Fine as a default / for testing purposes
+// in-case of not-set environment var
+const BUILD_TAG_DEFAULT: &str = "5670669815";

 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

-    let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
-
+    let build_tag = option_env!("BUILD_TAG")
+        .unwrap_or(BUILD_TAG_DEFAULT)
+        .to_string();
    info!("build_tag: {build_tag}");

    let matches = cli().get_matches();
+    let pgbin_default = String::from("postgres");
+    let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
+
+    let remote_ext_config = matches.get_one::<String>("remote-ext-config");
+    let ext_remote_storage = remote_ext_config.map(|x| {
+        init_remote_storage(x).expect("cannot initialize remote extension storage from config")
+    });

    let http_port = *matches
        .get_one::<u16>("http-port")
@@ -128,9 +142,6 @@ fn main() -> Result<()> {
    let compute_id = matches.get_one::<String>("compute-id");
    let control_plane_uri = matches.get_one::<String>("control-plane-uri");

-    // Try to use just 'postgres' if no path is provided
-    let pgbin = matches.get_one::<String>("pgbin").unwrap();
-
    let spec;
    let mut live_config_allowed = false;
    match spec_json {
@@ -168,6 +179,7 @@ fn main() -> Result<()> {

    let mut new_state = ComputeState::new();
    let spec_set;
+
    if let Some(spec) = spec {
        let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
        new_state.pspec = Some(pspec);
@@ -179,27 +191,37 @@ fn main() -> Result<()> {
        connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
        pgdata: pgdata.to_string(),
        pgbin: pgbin.to_string(),
+        pgversion: get_pg_version(pgbin),
        live_config_allowed,
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
+        ext_remote_storage,
+        ext_remote_paths: OnceLock::new(),
+        ext_download_progress: RwLock::new(HashMap::new()),
+        library_index: OnceLock::new(),
+        build_tag,
    };
    let compute = Arc::new(compute_node);

+    // If this is a pooled VM, prewarm before starting HTTP server and becoming
+    // available for binding. Prewarming helps postgres start quicker later,
+    // because QEMU will already have it's memory allocated from the host, and
+    // the necessary binaries will alreaady be cached.
+    if !spec_set {
+        compute.prewarm_postgres()?;
+    }
+
    // Launch http service first, so we were able to serve control-plane
    // requests, while configuration is still in progress.
    let _http_handle =
        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");

+    let extension_server_port: u16 = http_port;
+
    if !spec_set {
        // No spec provided, hang waiting for it.
        info!("no compute spec provided, waiting");

-        // TODO this can stall startups in the unlikely event that we bind
-        //      this compute node while it's busy prewarming. It's not too
-        //      bad because it's just 100ms and unlikely, but it's an
-        //      avoidable problem.
-        compute.prewarm_postgres()?;
-
        let mut state = compute.state.lock().unwrap();
        while state.status != ComputeStatus::ConfigurationPending {
            state = compute.state_changed.wait(state).unwrap();
@@ -236,7 +258,7 @@ fn main() -> Result<()> {
    // Start Postgres
    let mut delay_exit = false;
    let mut exit_code = None;
-    let pg = match compute.start_compute() {
+    let pg = match compute.start_compute(extension_server_port) {
        Ok(pg) => Some(pg),
        Err(err) => {
            error!("could not start the compute node: {:?}", err);
@@ -365,6 +387,12 @@ fn cli() -> clap::Command {
                .long("control-plane-uri")
                .value_name("CONTROL_PLANE_API_BASE_URI"),
        )
+        .arg(
+            Arg::new("remote-ext-config")
+                .short('r')
+                .long("remote-ext-config")
+                .value_name("REMOTE_EXT_CONFIG"),
+        )
 }

 #[test]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,16 +1,21 @@
+use std::collections::HashMap;
 use std::fs;
 use std::io::BufRead;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
-use std::sync::{Condvar, Mutex};
+use std::sync::{Condvar, Mutex, OnceLock, RwLock};
+use std::time::Instant;

 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
+use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use postgres::{Client, NoTls};
+use regex::Regex;
+use tokio;
 use tokio_postgres;
 use tracing::{error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
@@ -20,10 +25,12 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
 use compute_api::spec::{ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

-use crate::config;
+use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
+
 use crate::pg_helpers::*;
 use crate::spec::*;
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
+use crate::{config, extension_server};

 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
@@ -31,6 +38,7 @@ pub struct ComputeNode {
    pub connstr: url::Url,
    pub pgdata: String,
    pub pgbin: String,
+    pub pgversion: String,
    /// We should only allow live re- / configuration of the compute node if
    /// it uses 'pull model', i.e. it can go to control-plane and fetch
    /// the latest configuration. Otherwise, there could be a case:
@@ -50,6 +58,24 @@ pub struct ComputeNode {
    pub state: Mutex<ComputeState>,
    /// `Condvar` to allow notifying waiters about state changes.
    pub state_changed: Condvar,
+    ///  the S3 bucket that we search for extensions in
+    pub ext_remote_storage: Option<GenericRemoteStorage>,
+    // (key: extension name, value: path to extension archive in remote storage)
+    pub ext_remote_paths: OnceLock<HashMap<String, RemotePath>>,
+    // (key: library name, value: name of extension containing this library)
+    pub library_index: OnceLock<HashMap<String, String>>,
+    // key: ext_archive_name, value: started download time, download_completed?
+    pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
+    pub build_tag: String,
+}
+
+// store some metrics about download size that might impact startup time
+#[derive(Clone, Debug)]
+pub struct RemoteExtensionMetrics {
+    num_ext_downloaded: u64,
+    largest_ext_size: u64,
+    total_ext_download_size: u64,
+    prep_extensions_ms: u64,
 }

 #[derive(Clone, Debug)]
@@ -260,7 +286,7 @@ impl ComputeNode {
    #[instrument(skip_all, fields(%lsn))]
    fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let spec = compute_state.pspec.as_ref().expect("spec must be set");
-        let start_time = Utc::now();
+        let start_time = Instant::now();

        let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;

@@ -273,7 +299,10 @@ impl ComputeNode {
            info!("Storage auth token not set");
        }

+        // Connect to pageserver
        let mut client = config.connect(NoTls)?;
+        let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
+
        let basebackup_cmd = match lsn {
            // HACK We don't use compression on first start (Lsn(0)) because there's no API for it
            Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id),
@@ -319,13 +348,10 @@ impl ComputeNode {
        };

        // Report metrics
-        self.state.lock().unwrap().metrics.basebackup_bytes =
-            measured_reader.get_byte_count() as u64;
-        self.state.lock().unwrap().metrics.basebackup_ms = Utc::now()
-            .signed_duration_since(start_time)
-            .to_std()
-            .unwrap()
-            .as_millis() as u64;
+        let mut state = self.state.lock().unwrap();
+        state.metrics.pageserver_connect_micros = pageserver_connect_micros;
+        state.metrics.basebackup_bytes = measured_reader.get_byte_count() as u64;
+        state.metrics.basebackup_ms = start_time.elapsed().as_millis() as u64;
        Ok(())
    }

@@ -473,14 +499,22 @@ impl ComputeNode {
    /// Do all the preparations like PGDATA directory creation, configuration,
    /// safekeepers sync, basebackup, etc.
    #[instrument(skip_all)]
-    pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
+    pub fn prepare_pgdata(
+        &self,
+        compute_state: &ComputeState,
+        extension_server_port: u16,
+    ) -> Result<()> {
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        let spec = &pspec.spec;
        let pgdata_path = Path::new(&self.pgdata);

        // Remove/create an empty pgdata directory and put configuration there.
        self.create_pgdata()?;
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;
+        config::write_postgres_conf(
+            &pgdata_path.join("postgresql.conf"),
+            &pspec.spec,
+            Some(extension_server_port),
+        )?;

        // Syncing safekeepers is only safe with primary nodes: if a primary
        // is already connected it will be kicked out, so a secondary (standby)
@@ -670,7 +704,7 @@ impl ComputeNode {

        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?;
+        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
        self.pg_reload_conf(&mut client)?;
@@ -700,7 +734,7 @@ impl ComputeNode {
    }

    #[instrument(skip_all)]
-    pub fn start_compute(&self) -> Result<std::process::Child> {
+    pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
        let compute_state = self.state.lock().unwrap().clone();
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
@@ -711,7 +745,31 @@ impl ComputeNode {
            pspec.timeline_id,
        );

-        self.prepare_pgdata(&compute_state)?;
+        // This part is sync, because we need to download
+        // remote shared_preload_libraries before postgres start (if any)
+        {
+            let library_load_start_time = Utc::now();
+            let remote_ext_metrics = self.prepare_preload_libraries(&compute_state)?;
+
+            let library_load_time = Utc::now()
+                .signed_duration_since(library_load_start_time)
+                .to_std()
+                .unwrap()
+                .as_millis() as u64;
+            let mut state = self.state.lock().unwrap();
+            state.metrics.load_ext_ms = library_load_time;
+            state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded;
+            state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size;
+            state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size;
+            state.metrics.prep_extensions_ms = remote_ext_metrics.prep_extensions_ms;
+            info!(
+                "Loading shared_preload_libraries took {:?}ms",
+                library_load_time
+            );
+            info!("{:?}", remote_ext_metrics);
+        }
+
+        self.prepare_pgdata(&compute_state, extension_server_port)?;

        let start_time = Utc::now();
        let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
@@ -859,4 +917,241 @@ LIMIT 100",
            "{{\"pg_stat_statements\": []}}".to_string()
        }
    }
+
+    // If remote extension storage is configured,
+    // download extension control files
+    pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> {
+        if let Some(ref ext_remote_storage) = self.ext_remote_storage {
+            let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+            let spec = &pspec.spec;
+            let custom_ext = spec.custom_extensions.clone().unwrap_or(Vec::new());
+            info!("custom extensions: {:?}", &custom_ext);
+
+            let (ext_remote_paths, library_index) = extension_server::get_available_extensions(
+                ext_remote_storage,
+                &self.pgbin,
+                &self.pgversion,
+                &custom_ext,
+                &self.build_tag,
+            )
+            .await?;
+            self.ext_remote_paths
+                .set(ext_remote_paths)
+                .expect("this is the only time we set ext_remote_paths");
+            self.library_index
+                .set(library_index)
+                .expect("this is the only time we set library_index");
+        }
+        Ok(())
+    }
+
+    // download an archive, unzip and place files in correct locations
+    pub async fn download_extension(
+        &self,
+        ext_name: &str,
+        is_library: bool,
+    ) -> Result<u64, DownloadError> {
+        let remote_storage = self
+            .ext_remote_storage
+            .as_ref()
+            .ok_or(DownloadError::BadInput(anyhow::anyhow!(
+                "Remote extensions storage is not configured",
+            )))?;
+
+        let mut real_ext_name = ext_name;
+        if is_library {
+            // sometimes library names might have a suffix like
+            // library.so or library.so.3. We strip this off
+            // because library_index is based on the name without the file extension
+            let strip_lib_suffix = Regex::new(r"\.so.*").unwrap();
+            let lib_raw_name = strip_lib_suffix.replace(real_ext_name, "").to_string();
+
+            real_ext_name = self
+                .library_index
+                .get()
+                .expect("must have already downloaded the library_index")
+                .get(&lib_raw_name)
+                .ok_or(DownloadError::BadInput(anyhow::anyhow!(
+                    "library {} is not found",
+                    lib_raw_name
+                )))?;
+        }
+
+        let ext_path = &self
+            .ext_remote_paths
+            .get()
+            .expect("error accessing ext_remote_paths")
+            .get(real_ext_name)
+            .ok_or(DownloadError::BadInput(anyhow::anyhow!(
+                "real_ext_name {} is not found",
+                real_ext_name
+            )))?;
+
+        let ext_archive_name = ext_path.object_name().expect("bad path");
+
+        let mut first_try = false;
+        if !self
+            .ext_download_progress
+            .read()
+            .expect("lock err")
+            .contains_key(ext_archive_name)
+        {
+            self.ext_download_progress
+                .write()
+                .expect("lock err")
+                .insert(ext_archive_name.to_string(), (Utc::now(), false));
+            first_try = true;
+        }
+        let (download_start, download_completed) =
+            self.ext_download_progress.read().expect("lock err")[ext_archive_name];
+        let start_time_delta = Utc::now()
+            .signed_duration_since(download_start)
+            .to_std()
+            .unwrap()
+            .as_millis() as u64;
+
+        // how long to wait for extension download if it was started by another process
+        const HANG_TIMEOUT: u64 = 3000; // milliseconds
+
+        if download_completed {
+            info!("extension already downloaded, skipping re-download");
+            return Ok(0);
+        } else if start_time_delta < HANG_TIMEOUT && !first_try {
+            info!("download {ext_archive_name} already started by another process, hanging untill completion or timeout");
+            let mut interval = tokio::time::interval(tokio::time::Duration::from_millis(500));
+            loop {
+                info!("waiting for download");
+                interval.tick().await;
+                let (_, download_completed_now) =
+                    self.ext_download_progress.read().expect("lock")[ext_archive_name];
+                if download_completed_now {
+                    info!("download finished by whoever else downloaded it");
+                    return Ok(0);
+                }
+            }
+            // NOTE: the above loop will get terminated
+            // based on the timeout of the download function
+        }
+
+        // if extension hasn't been downloaded before or the previous
+        // attempt to download was at least HANG_TIMEOUT ms ago
+        // then we try to download it here
+        info!("downloading new extension {ext_archive_name}");
+
+        let download_size = extension_server::download_extension(
+            real_ext_name,
+            ext_path,
+            remote_storage,
+            &self.pgbin,
+        )
+        .await
+        .map_err(DownloadError::Other);
+
+        self.ext_download_progress
+            .write()
+            .expect("bad lock")
+            .insert(ext_archive_name.to_string(), (download_start, true));
+
+        download_size
+    }
+
+    #[tokio::main]
+    pub async fn prepare_preload_libraries(
+        &self,
+        compute_state: &ComputeState,
+    ) -> Result<RemoteExtensionMetrics> {
+        if self.ext_remote_storage.is_none() {
+            return Ok(RemoteExtensionMetrics {
+                num_ext_downloaded: 0,
+                largest_ext_size: 0,
+                total_ext_download_size: 0,
+                prep_extensions_ms: 0,
+            });
+        }
+        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+        let spec = &pspec.spec;
+
+        info!("parse shared_preload_libraries from spec.cluster.settings");
+        let mut libs_vec = Vec::new();
+        if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
+            libs_vec = libs
+                .split(&[',', '\'', ' '])
+                .filter(|s| *s != "neon" && !s.is_empty())
+                .map(str::to_string)
+                .collect();
+        }
+        info!("parse shared_preload_libraries from provided postgresql.conf");
+        // that is used in neon_local and python tests
+        if let Some(conf) = &spec.cluster.postgresql_conf {
+            let conf_lines = conf.split('\n').collect::<Vec<&str>>();
+            let mut shared_preload_libraries_line = "";
+            for line in conf_lines {
+                if line.starts_with("shared_preload_libraries") {
+                    shared_preload_libraries_line = line;
+                }
+            }
+            let mut preload_libs_vec = Vec::new();
+            if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) {
+                preload_libs_vec = libs
+                    .split(&[',', '\'', ' '])
+                    .filter(|s| *s != "neon" && !s.is_empty())
+                    .map(str::to_string)
+                    .collect();
+            }
+            libs_vec.extend(preload_libs_vec);
+        }
+
+        info!("Download ext_index.json, find the extension paths");
+        let prep_ext_start_time = Utc::now();
+        self.prepare_external_extensions(compute_state).await?;
+        let prep_ext_time_delta = Utc::now()
+            .signed_duration_since(prep_ext_start_time)
+            .to_std()
+            .unwrap()
+            .as_millis() as u64;
+        info!("Prepare extensions took {prep_ext_time_delta}ms");
+
+        // Don't try to download libraries that are not in the index.
+        // Assume that they are already present locally.
+        libs_vec.retain(|lib| {
+            self.library_index
+                .get()
+                .expect("error accessing ext_remote_paths")
+                .contains_key(lib)
+        });
+
+        info!("Downloading to shared preload libraries: {:?}", &libs_vec);
+
+        let mut download_tasks = Vec::new();
+        for library in &libs_vec {
+            download_tasks.push(self.download_extension(library, true));
+        }
+        let results = join_all(download_tasks).await;
+
+        let mut remote_ext_metrics = RemoteExtensionMetrics {
+            num_ext_downloaded: 0,
+            largest_ext_size: 0,
+            total_ext_download_size: 0,
+            prep_extensions_ms: prep_ext_time_delta,
+        };
+        for result in results {
+            let download_size = match result {
+                Ok(res) => {
+                    remote_ext_metrics.num_ext_downloaded += 1;
+                    res
+                }
+                Err(err) => {
+                    // if we failed to download an extension, we don't want to fail the whole
+                    // process, but we do want to log the error
+                    error!("Failed to download extension: {}", err);
+                    0
+                }
+            };
+
+            remote_ext_metrics.largest_ext_size =
+                std::cmp::max(remote_ext_metrics.largest_ext_size, download_size);
+            remote_ext_metrics.total_ext_download_size += download_size;
+        }
+        Ok(remote_ext_metrics)
+    }
 }
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -33,7 +33,11 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
 }

 /// Create or completely rewrite configuration file specified by `path`
-pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
+pub fn write_postgres_conf(
+    path: &Path,
+    spec: &ComputeSpec,
+    extension_server_port: Option<u16>,
+) -> Result<()> {
    // File::create() destroys the file content if it exists.
    let mut file = File::create(path)?;

@@ -87,5 +91,9 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
        writeln!(file, "# Managed by compute_ctl: end")?;
    }

+    if let Some(port) = extension_server_port {
+        writeln!(file, "neon.extension_server_port={}", port)?;
+    }
+
    Ok(())
 }
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -0,0 +1,281 @@
+// Download extension files from the extension store
+// and put them in the right place in the postgres directory (share / lib)
+/*
+The layout of the S3 bucket is as follows:
+5615610098 // this is an extension build number
+├── v14
+│   ├── extensions
+│   │   ├── anon.tar.zst
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   ├── anon.tar.zst
+    │   └── embedding.tar.zst
+    └── ext_index.json
+5615261079
+├── v14
+│   ├── extensions
+│   │   └── anon.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── anon.tar.zst
+    └── ext_index.json
+5623261088
+├── v14
+│   ├── extensions
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── embedding.tar.zst
+    └── ext_index.json
+
+Note that build number cannot be part of prefix because we might need extensions
+from other build numbers.
+
+ext_index.json stores the control files and location of extension archives
+It also stores a list of public extensions and a library_index
+
+We don't need to duplicate extension.tar.zst files.
+We only need to upload a new one if it is updated.
+(Although currently we just upload every time anyways, hopefully will change
+this sometime)
+
+*access* is controlled by spec
+
+More specifically, here is an example ext_index.json
+{
+    "public_extensions": [
+        "anon",
+        "pg_buffercache"
+    ],
+    "library_index": {
+        "anon": "anon",
+        "pg_buffercache": "pg_buffercache"
+    },
+    "extension_data": {
+        "pg_buffercache": {
+            "control_data": {
+                "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
+            },
+            "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
+        },
+        "anon": {
+            "control_data": {
+                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
+            },
+            "archive_path": "5670669815/v14/extensions/anon.tar.zst"
+        }
+    }
+}
+*/
+use anyhow::Context;
+use anyhow::{self, Result};
+use futures::future::join_all;
+use remote_storage::*;
+use serde_json;
+use std::collections::HashMap;
+use std::io::Read;
+use std::num::{NonZeroU32, NonZeroUsize};
+use std::path::Path;
+use std::str;
+use tar::Archive;
+use tokio::io::AsyncReadExt;
+use tracing::info;
+use tracing::log::warn;
+use zstd::stream::read::Decoder;
+
+fn get_pg_config(argument: &str, pgbin: &str) -> String {
+    // gives the result of `pg_config [argument]`
+    // where argument is a flag like `--version` or `--sharedir`
+    let pgconfig = pgbin
+        .strip_suffix("postgres")
+        .expect("bad pgbin")
+        .to_owned()
+        + "/pg_config";
+    let config_output = std::process::Command::new(pgconfig)
+        .arg(argument)
+        .output()
+        .expect("pg_config error");
+    std::str::from_utf8(&config_output.stdout)
+        .expect("pg_config error")
+        .trim()
+        .to_string()
+}
+
+pub fn get_pg_version(pgbin: &str) -> String {
+    // pg_config --version returns a (platform specific) human readable string
+    // such as "PostgreSQL 15.4". We parse this to v14/v15
+    let human_version = get_pg_config("--version", pgbin);
+    if human_version.contains("15") {
+        return "v15".to_string();
+    } else if human_version.contains("14") {
+        return "v14".to_string();
+    }
+    panic!("Unsuported postgres version {human_version}");
+}
+
+// download control files for enabled_extensions
+// return Hashmaps converting library names to extension names (library_index)
+// and specifying the remote path to the archive for each extension name
+pub async fn get_available_extensions(
+    remote_storage: &GenericRemoteStorage,
+    pgbin: &str,
+    pg_version: &str,
+    custom_extensions: &[String],
+    build_tag: &str,
+) -> Result<(HashMap<String, RemotePath>, HashMap<String, String>)> {
+    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
+    let index_path = format!("{build_tag}/{pg_version}/ext_index.json");
+    let index_path = RemotePath::new(Path::new(&index_path)).context("error forming path")?;
+    info!("download ext_index.json from: {:?}", &index_path);
+
+    let mut download = remote_storage.download(&index_path).await?;
+    let mut ext_idx_buffer = Vec::new();
+    download
+        .download_stream
+        .read_to_end(&mut ext_idx_buffer)
+        .await?;
+    info!("ext_index downloaded");
+
+    #[derive(Debug, serde::Deserialize)]
+    struct Index {
+        public_extensions: Vec<String>,
+        library_index: HashMap<String, String>,
+        extension_data: HashMap<String, ExtensionData>,
+    }
+
+    #[derive(Debug, serde::Deserialize)]
+    struct ExtensionData {
+        control_data: HashMap<String, String>,
+        archive_path: String,
+    }
+
+    let ext_index_full = serde_json::from_slice::<Index>(&ext_idx_buffer)?;
+    let mut enabled_extensions = ext_index_full.public_extensions;
+    enabled_extensions.extend_from_slice(custom_extensions);
+    let mut library_index = ext_index_full.library_index;
+    let all_extension_data = ext_index_full.extension_data;
+    info!("library_index: {:?}", library_index);
+
+    info!("enabled_extensions: {:?}", enabled_extensions);
+    let mut ext_remote_paths = HashMap::new();
+    let mut file_create_tasks = Vec::new();
+    for extension in enabled_extensions {
+        let ext_data = &all_extension_data[&extension];
+        for (control_file, control_contents) in &ext_data.control_data {
+            let extension_name = control_file
+                .strip_suffix(".control")
+                .expect("control files must end in .control");
+            let control_path = local_sharedir.join(control_file);
+            if !control_path.exists() {
+                ext_remote_paths.insert(
+                    extension_name.to_string(),
+                    RemotePath::from_string(&ext_data.archive_path)?,
+                );
+                info!("writing file {:?}{:?}", control_path, control_contents);
+                file_create_tasks.push(tokio::fs::write(control_path, control_contents));
+            } else {
+                warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_file);
+                // also delete this from library index
+                library_index.retain(|_, value| value != extension_name);
+            }
+        }
+    }
+    let results = join_all(file_create_tasks).await;
+    for result in results {
+        result?;
+    }
+    info!("ext_remote_paths {:?}", ext_remote_paths);
+    Ok((ext_remote_paths, library_index))
+}
+
+// download the archive for a given extension,
+// unzip it, and place files in the appropriate locations (share/lib)
+pub async fn download_extension(
+    ext_name: &str,
+    ext_path: &RemotePath,
+    remote_storage: &GenericRemoteStorage,
+    pgbin: &str,
+) -> Result<u64> {
+    info!("Download extension {:?} from {:?}", ext_name, ext_path);
+    let mut download = remote_storage.download(ext_path).await?;
+    let mut download_buffer = Vec::new();
+    download
+        .download_stream
+        .read_to_end(&mut download_buffer)
+        .await?;
+    let download_size = download_buffer.len() as u64;
+    // it's unclear whether it is more performant to decompress into memory or not
+    // TODO: decompressing into memory can be avoided
+    let mut decoder = Decoder::new(download_buffer.as_slice())?;
+    let mut decompress_buffer = Vec::new();
+    decoder.read_to_end(&mut decompress_buffer)?;
+    let mut archive = Archive::new(decompress_buffer.as_slice());
+    let unzip_dest = pgbin
+        .strip_suffix("/bin/postgres")
+        .expect("bad pgbin")
+        .to_string()
+        + "/download_extensions";
+    archive.unpack(&unzip_dest)?;
+    info!("Download + unzip {:?} completed successfully", &ext_path);
+
+    let sharedir_paths = (
+        unzip_dest.to_string() + "/share/extension",
+        Path::new(&get_pg_config("--sharedir", pgbin)).join("extension"),
+    );
+    let libdir_paths = (
+        unzip_dest.to_string() + "/lib",
+        Path::new(&get_pg_config("--pkglibdir", pgbin)).to_path_buf(),
+    );
+    // move contents of the libdir / sharedir in unzipped archive to the correct local paths
+    for paths in [sharedir_paths, libdir_paths] {
+        let (zip_dir, real_dir) = paths;
+        info!("mv {zip_dir:?}/*  {real_dir:?}");
+        for file in std::fs::read_dir(zip_dir)? {
+            let old_file = file?.path();
+            let new_file =
+                Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?);
+            info!("moving {old_file:?} to {new_file:?}");
+
+            // extension download failed: Directory not empty (os error 39)
+            match std::fs::rename(old_file, new_file) {
+                Ok(()) => info!("move succeeded"),
+                Err(e) => {
+                    warn!("move failed, probably because the extension already exists: {e}")
+                }
+            }
+        }
+    }
+    info!("done moving extension {ext_name}");
+    Ok(download_size)
+}
+
+// This function initializes the necessary structs to use remote storage
+pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
+    #[derive(Debug, serde::Deserialize)]
+    struct RemoteExtJson {
+        bucket: String,
+        region: String,
+        endpoint: Option<String>,
+        prefix: Option<String>,
+    }
+    let remote_ext_json = serde_json::from_str::<RemoteExtJson>(remote_ext_config)?;
+
+    let config = S3Config {
+        bucket_name: remote_ext_json.bucket,
+        bucket_region: remote_ext_json.region,
+        prefix_in_bucket: remote_ext_json.prefix,
+        endpoint: remote_ext_json.endpoint,
+        concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
+        max_keys_per_list_response: None,
+    };
+    let config = RemoteStorageConfig {
+        max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
+        max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
+        storage: RemoteStorageKind::AwsS3(config),
+    };
+    GenericRemoteStorage::from_config(&config)
+}
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -30,7 +30,6 @@ fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
        status: state.status,
        last_active: state.last_active,
        error: state.error.clone(),
-        metrics: state.metrics.clone(),
    }
 }

@@ -122,6 +121,46 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

+        // download extension files from S3 on demand
+        (&Method::POST, route) if route.starts_with("/extension_server/") => {
+            info!("serving {:?} POST request", route);
+            info!("req.uri {:?}", req.uri());
+
+            let mut is_library = false;
+            if let Some(params) = req.uri().query() {
+                info!("serving {:?} POST request with params: {}", route, params);
+                if params == "is_library=true" {
+                    is_library = true;
+                } else {
+                    let mut resp = Response::new(Body::from("Wrong request parameters"));
+                    *resp.status_mut() = StatusCode::BAD_REQUEST;
+                    return resp;
+                }
+            }
+
+            let filename = route.split('/').last().unwrap().to_string();
+            info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");
+
+            // don't even try to download extensions
+            // if no remote storage is configured
+            if compute.ext_remote_storage.is_none() {
+                info!("no extensions remote storage configured");
+                let mut resp = Response::new(Body::from("no remote storage configured"));
+                *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                return resp;
+            }
+
+            match compute.download_extension(&filename, is_library).await {
+                Ok(_) => Response::new(Body::from("OK")),
+                Err(e) => {
+                    error!("extension download failed: {}", e);
+                    let mut resp = Response::new(Body::from(e.to_string()));
+                    *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                    resp
+                }
+            }
+        }
+
        // Return the `404 Not Found` for any other routes.
        _ => {
            let mut not_found = Response::new(Body::from("404 Not Found"));
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -139,6 +139,34 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/GenericError"
+  /extension_server:
+    post:
+      tags:
+      - Extension
+      summary: Download extension from S3 to local folder.
+      description: ""
+      operationId: downloadExtension
+      responses:
+        200:
+          description: Extension downloaded
+          content:
+            text/plain:
+              schema:
+                type: string
+                description: Error text or 'OK' if download succeeded.
+                example: "OK"
+        400:
+        description: Request is invalid.
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/GenericError"
+        500:
+        description: Extension download request failed.
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/GenericError"

 components:
  securitySchemes:
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -9,6 +9,7 @@ pub mod http;
 #[macro_use]
 pub mod logger;
 pub mod compute;
+pub mod extension_server;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -124,7 +124,7 @@ pub fn get_spec_from_control_plane(
 pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
    // File `postgresql.conf` is no longer included into `basebackup`, so just
    // always write all config into it creating new file.
-    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
+    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;

    update_pg_hba(pgdata_path)?;

@@ -270,7 +270,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
            }
            RoleAction::Create => {
                let mut query: String = format!(
-                    "CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser",
+                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("role create query: '{}'", &query);
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -32,3 +32,4 @@ utils.workspace = true

 compute_api.workspace = true
 workspace_hack.workspace = true
+tracing.workspace = true
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -658,6 +658,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;

+            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
+
            // If --safekeepers argument is given, use only the listed safekeeper nodes.
            let safekeepers =
                if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
@@ -699,7 +701,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    _ => {}
                }
                println!("Starting existing endpoint {endpoint_id}...");
-                endpoint.start(&auth_token, safekeepers)?;
+                endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
            } else {
                let branch_name = sub_args
                    .get_one::<String>("branch-name")
@@ -743,7 +745,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    pg_version,
                    mode,
                )?;
-                ep.start(&auth_token, safekeepers)?;
+                ep.start(&auth_token, safekeepers, remote_ext_config)?;
            }
        }
        "stop" => {
@@ -823,6 +825,16 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNod
    }
 }

+// Get list of options to append to safekeeper command invocation.
+fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec<String> {
+    init_match
+        .get_many::<String>("safekeeper-extra-opt")
+        .into_iter()
+        .flatten()
+        .map(|s| s.to_owned())
+        .collect()
+}
+
 fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    let (sub_name, sub_args) = match sub_match.subcommand() {
        Some(safekeeper_command_data) => safekeeper_command_data,
@@ -839,7 +851,9 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul

    match sub_name {
        "start" => {
-            if let Err(e) = safekeeper.start() {
+            let extra_opts = safekeeper_extra_opts(sub_args);
+
+            if let Err(e) = safekeeper.start(extra_opts) {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -864,7 +878,8 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
                exit(1);
            }

-            if let Err(e) = safekeeper.start() {
+            let extra_opts = safekeeper_extra_opts(sub_args);
+            if let Err(e) = safekeeper.start(extra_opts) {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -891,7 +906,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow

    for node in env.safekeepers.iter() {
        let safekeeper = SafekeeperNode::from_env(env, node);
-        if let Err(e) = safekeeper.start() {
+        if let Err(e) = safekeeper.start(vec![]) {
            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
            try_stop_all(env, false);
            exit(1);
@@ -954,6 +969,14 @@ fn cli() -> Command {

    let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);

+    let safekeeper_extra_opt_arg = Arg::new("safekeeper-extra-opt")
+        .short('e')
+        .long("safekeeper-extra-opt")
+        .num_args(1)
+        .action(ArgAction::Append)
+        .help("Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo")
+        .required(false);
+
    let tenant_id_arg = Arg::new("tenant-id")
        .long("tenant-id")
        .help("Tenant id. Represented as a hexadecimal string 32 symbols length")
@@ -1003,6 +1026,12 @@ fn cli() -> Command {
        .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
        .required(false);

+    let remote_ext_config_args = Arg::new("remote-ext-config")
+        .long("remote-ext-config")
+        .num_args(1)
+        .help("Configure the S3 bucket that we search for extensions in.")
+        .required(false);
+
    let lsn_arg = Arg::new("lsn")
        .long("lsn")
        .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
@@ -1116,6 +1145,7 @@ fn cli() -> Command {
                .subcommand(Command::new("start")
                            .about("Start local safekeeper")
                            .arg(safekeeper_id_arg.clone())
+                            .arg(safekeeper_extra_opt_arg.clone())
                )
                .subcommand(Command::new("stop")
                            .about("Stop local safekeeper")
@@ -1126,6 +1156,7 @@ fn cli() -> Command {
                            .about("Restart local safekeeper")
                            .arg(safekeeper_id_arg)
                            .arg(stop_mode_arg.clone())
+                            .arg(safekeeper_extra_opt_arg)
                )
        )
        .subcommand(
@@ -1161,6 +1192,7 @@ fn cli() -> Command {
                    .arg(pg_version_arg)
                    .arg(hot_standby_arg)
                    .arg(safekeepers_arg)
+                    .arg(remote_ext_config_args)
                )
                .subcommand(
                    Command::new("stop")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -313,7 +313,7 @@ impl Endpoint {

                // TODO: use future host field from safekeeper spec
                // Pass the list of safekeepers to the replica so that it can connect to any of them,
-                // whichever is availiable.
+                // whichever is available.
                let sk_ports = self
                    .env
                    .safekeepers
@@ -420,7 +420,12 @@ impl Endpoint {
        Ok(())
    }

-    pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
+    pub fn start(
+        &self,
+        auth_token: &Option<String>,
+        safekeepers: Vec<NodeId>,
+        remote_ext_config: Option<&String>,
+    ) -> Result<()> {
        if self.status() == "running" {
            anyhow::bail!("The endpoint is already running");
        }
@@ -488,6 +493,7 @@ impl Endpoint {
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
+            custom_extensions: Some(vec![]),
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -519,6 +525,11 @@ impl Endpoint {
            .stdin(std::process::Stdio::null())
            .stderr(logfile.try_clone()?)
            .stdout(logfile);
+
+        if let Some(remote_ext_config) = remote_ext_config {
+            cmd.args(["--remote-ext-config", remote_ext_config]);
+        }
+
        let child = cmd.spawn()?;

        // Write down the pid so we can wait for it when we want to stop
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -101,7 +101,7 @@ impl SafekeeperNode {
        self.datadir_path().join("safekeeper.pid")
    }

-    pub fn start(&self) -> anyhow::Result<Child> {
+    pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
        print!(
            "Starting safekeeper at '{}' in '{}'",
            self.pg_connection_config.raw_address(),
@@ -161,17 +161,28 @@ impl SafekeeperNode {

        let key_path = self.env.base_data_dir.join("auth_public_key.pem");
        if self.conf.auth_enabled {
+            let key_path_string = key_path
+                .to_str()
+                .with_context(|| {
+                    format!("Key path {key_path:?} cannot be represented as a unicode string")
+                })?
+                .to_owned();
            args.extend([
-                "--auth-validation-public-key-path".to_owned(),
-                key_path
-                    .to_str()
-                    .with_context(|| {
-                        format!("Key path {key_path:?} cannot be represented as a unicode string")
-                    })?
-                    .to_owned(),
+                "--pg-auth-public-key-path".to_owned(),
+                key_path_string.clone(),
+            ]);
+            args.extend([
+                "--pg-tenant-only-auth-public-key-path".to_owned(),
+                key_path_string.clone(),
+            ]);
+            args.extend([
+                "--http-auth-public-key-path".to_owned(),
+                key_path_string.clone(),
            ]);
        }

+        args.extend(extra_opts);
+
        background_process::start_process(
            &format!("safekeeper-{id}"),
            &datadir,
--- a/docs/rfcs/024-extension-loading.md
+++ b/docs/rfcs/024-extension-loading.md
@@ -0,0 +1,236 @@
+# Supporting custom user Extensions (Dynamic Extension Loading)
+Created 2023-05-03
+
+## Motivation
+
+There are many extensions in the PostgreSQL ecosystem, and not all extensions
+are of a quality that we can confidently support them. Additionally, our
+current extension inclusion mechanism has several problems because we build all
+extensions into the primary Compute image: We build the extensions every time
+we build the compute image regardless of whether we actually need to rebuild
+the image, and the inclusion of these extensions in the image adds a hard
+dependency on all supported extensions - thus increasing the image size, and
+with it the time it takes to download that image - increasing first start
+latency.
+
+This RFC proposes a dynamic loading mechanism that solves most of these
+problems.
+
+## Summary
+
+`compute_ctl` is made responsible for loading extensions on-demand into
+the container's file system for dynamically loaded extensions, and will also
+make sure that the extensions in `shared_preload_libraries` are downloaded
+before the compute node starts.
+
+## Components
+
+compute_ctl, PostgreSQL, neon (extension), Compute Host Node, Extension Store
+
+## Requirements
+
+Compute nodes with no extra extensions should not be negatively impacted by
+the existence of support for many extensions.
+
+Installing an extension into PostgreSQL should be easy.
+
+Non-preloaded extensions shouldn't impact startup latency.
+
+Uninstalled extensions shouldn't impact query latency.
+
+A small latency penalty for dynamically loaded extensions is acceptable in
+the first seconds of compute startup, but not in steady-state operations.
+
+## Proposed implementation
+
+### On-demand, JIT-loading of extensions
+
+Before postgres starts we download 
+- control files for all extensions available to that compute node;
+- all `shared_preload_libraries`;
+
+After postgres is running, `compute_ctl` listens for requests to load files.
+When PostgreSQL requests a file, `compute_ctl` downloads it.
+
+PostgreSQL requests files in the following cases:
+- When loading a preload library set in `local_preload_libraries`
+- When explicitly loading a library with `LOAD`
+- Wnen creating extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files)))
+
+
+#### Summary
+
+Pros:
+ - Startup is only as slow as it takes to load all (shared_)preload_libraries
+ - Supports BYO Extension
+
+Cons:
+ - O(sizeof(extensions)) IO requirement for loading all extensions.
+
+### Alternative solutions
+
+1. Allow users to add their extensions to the base image
+   
+   Pros:
+    - Easy to deploy
+
+   Cons:
+    - Doesn't scale - first start size is dependent on image size;
+    - All extensions are shared across all users: It doesn't allow users to
+      bring their own restrictive-licensed extensions
+
+2. Bring Your Own compute image
+   
+   Pros:
+    - Still easy to deploy
+    - User can bring own patched version of PostgreSQL
+
+   Cons:
+    - First start latency is O(sizeof(extensions image))
+    - Warm instance pool for skipping pod schedule latency is not feasible with
+      O(n) custom images
+    - Support channels are difficult to manage
+
+3. Download all user extensions in bulk on compute start
+   
+   Pros:
+    - Easy to deploy
+    - No startup latency issues for "clean" users.
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - Downloading all extensions in advance takes a lot of time, thus startup
+      latency issues
+
+4. Store user's extensions in persistent storage
+   
+   Pros:
+    - Easy to deploy
+    - No startup latency issues
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - EC2 instances have only limited number of attachments shared between EBS
+      volumes, direct-attached NVMe drives, and ENIs.
+    - Compute instance migration isn't trivially solved for EBS mounts (e.g.
+      the device is unavailable whilst moving the mount between instances).
+    - EBS can only mount on one instance at a time (except the expensive IO2
+      device type).
+
+5. Store user's extensions in network drive
+   
+   Pros:
+    - Easy to deploy
+    - Few startup latency issues
+    - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+    - We'd need networked drives, and a lot of them, which would store many
+      duplicate extensions.
+    - **UNCHECKED:** Compute instance migration may not work nicely with
+      networked IOs
+
+
+### Idea extensions
+
+The extension store does not have to be S3 directly, but could be a Node-local
+caching service on top of S3. This would reduce the load on the network for
+popular extensions.
+
+## Extension Storage implementation
+
+The layout of the S3 bucket is as follows:
+```
+5615610098 // this is an extension build number
+├── v14
+│   ├── extensions
+│   │   ├── anon.tar.zst
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   ├── anon.tar.zst
+    │   └── embedding.tar.zst
+    └── ext_index.json
+5615261079
+├── v14
+│   ├── extensions
+│   │   └── anon.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── anon.tar.zst
+    └── ext_index.json
+5623261088
+├── v14
+│   ├── extensions
+│   │   └── embedding.tar.zst
+│   └── ext_index.json
+└── v15
+    ├── extensions
+    │   └── embedding.tar.zst
+    └── ext_index.json
+```
+
+Note that build number cannot be part of prefix because we might need extensions
+from other build numbers.
+
+`ext_index.json` stores the control files and location of extension archives. 
+It also stores a list of public extensions and a library_index
+
+We don't need to duplicate `extension.tar.zst`` files.
+We only need to upload a new one if it is updated.
+(Although currently we just upload every time anyways, hopefully will change
+this sometime)
+
+*access* is controlled by spec
+
+More specifically, here is an example ext_index.json
+```
+{
+    "public_extensions": [
+        "anon",
+        "pg_buffercache"
+    ],
+    "library_index": {
+        "anon": "anon",
+        "pg_buffercache": "pg_buffercache"
+        // for more complex extensions like postgis
+        // we might have something like:
+        // address_standardizer: postgis
+        // postgis_tiger: postgis
+    },
+    "extension_data": {
+        "pg_buffercache": {
+            "control_data": {
+                "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
+            },
+            "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
+        },
+        "anon": {
+            "control_data": {
+                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
+            },
+            "archive_path": "5670669815/v14/extensions/anon.tar.zst"
+        }
+    }
+}
+```
+
+### How to add new extension to the Extension Storage?
+
+Simply upload build artifacts to the S3 bucket.
+Implement a CI step for that. Splitting it from compute-node-image build.
+
+### How do we deal with extension versions and updates?
+
+Currently, we rebuild extensions on every compute-node-image build and store them in the <build-version> prefix.
+This is needed to ensure that `/share` and `/lib` files are in sync.
+
+For extension updates, we rely on the PostgreSQL extension versioning mechanism (sql update scripts) and extension authors to not break backwards compatibility within one major version of PostgreSQL.
+
+### Alternatives
+
+For extensions written on trusted languages we can also adopt
+`dbdev` PostgreSQL Package Manager based on `pg_tle` by Supabase.
+This will increase the amount supported extensions and decrease the amount of work required to support them.
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -21,7 +21,6 @@ pub struct ComputeStatusResponse {
    #[serde(serialize_with = "rfc3339_serialize")]
    pub last_active: Option<DateTime<Utc>>,
    pub error: Option<String>,
-    pub metrics: ComputeMetrics,
 }

 #[derive(Deserialize, Serialize)]
@@ -67,16 +66,48 @@ where
 }

 /// Response of the /metrics.json API
-#[derive(Clone, Debug, Default, Serialize, Deserialize)]
+#[derive(Clone, Debug, Default, Serialize)]
 pub struct ComputeMetrics {
+    /// Time spent waiting in pool
    pub wait_for_spec_ms: u64,
-    pub sync_safekeepers_ms: u64,
+
+    /// Time spent checking if safekeepers are synced
    pub sync_sk_check_ms: u64,
+
+    /// Time spent syncing safekeepers (walproposer.c).
+    /// In most cases this should be zero.
+    pub sync_safekeepers_ms: u64,
+
+    /// Time it took to establish a pg connection to the pageserver.
+    /// This is two roundtrips, so it's a good proxy for compute-pageserver
+    /// latency. The latency is usually 0.2ms, but it's not safe to assume
+    /// that.
+    pub pageserver_connect_micros: u64,
+
+    /// Time to get basebackup from pageserver and write it to disk.
    pub basebackup_ms: u64,
+
+    /// Compressed size of basebackup received.
    pub basebackup_bytes: u64,
+
+    /// Time spent starting potgres. This includes initialization of shared
+    /// buffers, preloading extensions, and other pg operations.
    pub start_postgres_ms: u64,
+
+    /// Time spent applying pg catalog updates that were made in the console
+    /// UI. This should be 0 when startup time matters, since cplane tries
+    /// to do these updates eagerly, and passes the skip_pg_catalog_updates
+    /// when it's safe to skip this step.
    pub config_ms: u64,
+
+    /// Total time, from when we receive the spec to when we're ready to take
+    /// pg connections.
    pub total_startup_ms: u64,
+    pub load_ext_ms: u64,
+    pub num_ext_downloaded: u64,
+    pub largest_ext_size: u64, // these are measured in bytes
+    pub total_ext_download_size: u64,
+    pub prep_extensions_ms: u64,
 }

 /// Response of the `/computes/{compute_id}/spec` control-plane API.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -60,6 +60,9 @@ pub struct ComputeSpec {
    /// If set, 'storage_auth_token' is used as the password to authenticate to
    /// the pageserver and safekeepers.
    pub storage_auth_token: Option<String>,
+
+    // list of prefixes to search for custom extensions in remote extension storage
+    pub custom_extensions: Option<Vec<String>>,
 }

 #[serde_as]
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -145,6 +145,13 @@ pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
 pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
 pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
 pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
+pub const XLH_INSERT_STORE_CID: u8 = (1 << 7) as u8;
+pub const XLH_UPDATE_STORE_CID: u8 = (1 << 7) as u8;
+pub const XLH_DELETE_STORE_CID: u8 = (1 << 7) as u8;
+pub const XLH_LOCK_STORE_CID: u8 = (1 << 7) as u8;
+
+pub const SIZE_OF_HEAP_LOCK: usize = 14;
+pub const SIZE_OF_HEAP_DELETE: usize = 14;

 // From replication/message.h
 pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00;
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -20,6 +20,7 @@ tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
 tokio-util.workspace = true
 toml_edit.workspace = true
 tracing.workspace = true
+scopeguard.workspace = true
 metrics.workspace = true
 utils.workspace = true
 pin-project-lite.workspace = true
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -65,6 +65,10 @@ impl RemotePath {
        Ok(Self(relative_path.to_path_buf()))
    }

+    pub fn from_string(relative_path: &str) -> anyhow::Result<Self> {
+        Self::new(Path::new(relative_path))
+    }
+
    pub fn with_base(&self, base_path: &Path) -> PathBuf {
        base_path.join(&self.0)
    }
@@ -190,6 +194,20 @@ pub enum GenericRemoteStorage {
 }

 impl GenericRemoteStorage {
+    // A function for listing all the files in a "directory"
+    // Example:
+    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
+    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        match self {
+            Self::LocalFs(s) => s.list_files(folder).await,
+            Self::AwsS3(s) => s.list_files(folder).await,
+            Self::Unreliable(s) => s.list_files(folder).await,
+        }
+    }
+
+    // lists common *prefixes*, if any of files
+    // Example:
+    // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
    pub async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
@@ -201,14 +219,6 @@ impl GenericRemoteStorage {
        }
    }

-    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        match self {
-            Self::LocalFs(s) => s.list_files(folder).await,
-            Self::AwsS3(s) => s.list_files(folder).await,
-            Self::Unreliable(s) => s.list_files(folder).await,
-        }
-    }
-
    pub async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -10,6 +10,7 @@ use anyhow::Context;
 use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
    imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
+    provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
 };
 use aws_credential_types::cache::CredentialsCache;
 use aws_sdk_s3::{
@@ -22,6 +23,7 @@ use aws_sdk_s3::{
 };
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
+use scopeguard::ScopeGuard;
 use tokio::{
    io::{self, AsyncRead},
    sync::Semaphore,
@@ -36,82 +38,9 @@ use crate::{

 const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;

-pub(super) mod metrics {
-    use metrics::{register_int_counter_vec, IntCounterVec};
-    use once_cell::sync::Lazy;
+pub(super) mod metrics;

-    static S3_REQUESTS_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
-        register_int_counter_vec!(
-            "remote_storage_s3_requests_count",
-            "Number of s3 requests of particular type",
-            &["request_type"],
-        )
-        .expect("failed to define a metric")
-    });
-
-    static S3_REQUESTS_FAIL_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
-        register_int_counter_vec!(
-            "remote_storage_s3_failures_count",
-            "Number of failed s3 requests of particular type",
-            &["request_type"],
-        )
-        .expect("failed to define a metric")
-    });
-
-    pub fn inc_get_object() {
-        S3_REQUESTS_COUNT.with_label_values(&["get_object"]).inc();
-    }
-
-    pub fn inc_get_object_fail() {
-        S3_REQUESTS_FAIL_COUNT
-            .with_label_values(&["get_object"])
-            .inc();
-    }
-
-    pub fn inc_put_object() {
-        S3_REQUESTS_COUNT.with_label_values(&["put_object"]).inc();
-    }
-
-    pub fn inc_put_object_fail() {
-        S3_REQUESTS_FAIL_COUNT
-            .with_label_values(&["put_object"])
-            .inc();
-    }
-
-    pub fn inc_delete_object() {
-        S3_REQUESTS_COUNT
-            .with_label_values(&["delete_object"])
-            .inc();
-    }
-
-    pub fn inc_delete_objects(count: u64) {
-        S3_REQUESTS_COUNT
-            .with_label_values(&["delete_object"])
-            .inc_by(count);
-    }
-
-    pub fn inc_delete_object_fail() {
-        S3_REQUESTS_FAIL_COUNT
-            .with_label_values(&["delete_object"])
-            .inc();
-    }
-
-    pub fn inc_delete_objects_fail(count: u64) {
-        S3_REQUESTS_FAIL_COUNT
-            .with_label_values(&["delete_object"])
-            .inc_by(count);
-    }
-
-    pub fn inc_list_objects() {
-        S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
-    }
-
-    pub fn inc_list_objects_fail() {
-        S3_REQUESTS_FAIL_COUNT
-            .with_label_values(&["list_objects"])
-            .inc();
-    }
-}
+use self::metrics::{AttemptOutcome, RequestKind};

 /// AWS S3 storage.
 pub struct S3Bucket {
@@ -139,18 +68,29 @@ impl S3Bucket {
            aws_config.bucket_name
        );

+        let region = Some(Region::new(aws_config.bucket_region.clone()));
+
        let credentials_provider = {
            // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
            CredentialsProviderChain::first_try(
                "env",
                EnvironmentVariableCredentialsProvider::new(),
            )
+            // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
+            // needed to access remote extensions bucket
+            .or_else("token", {
+                let provider_conf = ProviderConfig::without_region().with_region(region.clone());
+
+                WebIdentityTokenCredentialsProvider::builder()
+                    .configure(&provider_conf)
+                    .build()
+            })
            // uses imds v2
            .or_else("imds", ImdsCredentialsProvider::builder().build())
        };

        let mut config_builder = Config::builder()
-            .region(Region::new(aws_config.bucket_region.clone()))
+            .region(region)
            .credentials_cache(CredentialsCache::lazy())
            .credentials_provider(credentials_provider);

@@ -213,16 +153,43 @@ impl S3Bucket {
        }
    }

-    async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
+    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
+        let started_at = start_counting_cancelled_wait(kind);
+        let permit = self
+            .concurrency_limiter
+            .acquire()
+            .await
+            .expect("semaphore is never closed");
+
+        let started_at = ScopeGuard::into_inner(started_at);
+        metrics::BUCKET_METRICS
+            .wait_seconds
+            .observe_elapsed(kind, started_at);
+
+        permit
+    }
+
+    async fn owned_permit(&self, kind: RequestKind) -> tokio::sync::OwnedSemaphorePermit {
+        let started_at = start_counting_cancelled_wait(kind);
        let permit = self
            .concurrency_limiter
            .clone()
            .acquire_owned()
            .await
-            .context("Concurrency limiter semaphore got closed during S3 download")
-            .map_err(DownloadError::Other)?;
+            .expect("semaphore is never closed");

-        metrics::inc_get_object();
+        let started_at = ScopeGuard::into_inner(started_at);
+        metrics::BUCKET_METRICS
+            .wait_seconds
+            .observe_elapsed(kind, started_at);
+        permit
+    }
+
+    async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
+        let kind = RequestKind::Get;
+        let permit = self.owned_permit(kind).await;
+
+        let started_at = start_measuring_requests(kind);

        let get_object = self
            .client
@@ -233,26 +200,33 @@ impl S3Bucket {
            .send()
            .await;

+        let started_at = ScopeGuard::into_inner(started_at);
+
+        if get_object.is_err() {
+            metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                kind,
+                AttemptOutcome::Err,
+                started_at,
+            );
+        }
+
        match get_object {
            Ok(object_output) => {
                let metadata = object_output.metadata().cloned().map(StorageMetadata);
                Ok(Download {
                    metadata,
-                    download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new(
-                        permit,
-                        object_output.body.into_async_read(),
+                    download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
+                        started_at,
+                        RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
                    ))),
                })
            }
            Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
                Err(DownloadError::NotFound)
            }
-            Err(e) => {
-                metrics::inc_get_object_fail();
-                Err(DownloadError::Other(anyhow::anyhow!(
-                    "Failed to download S3 object: {e}"
-                )))
-            }
+            Err(e) => Err(DownloadError::Other(
+                anyhow::Error::new(e).context("download s3 object"),
+            )),
        }
    }
 }
@@ -283,6 +257,54 @@ impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
    }
 }

+pin_project_lite::pin_project! {
+    /// Times and tracks the outcome of the request.
+    struct TimedDownload<S> {
+        started_at: std::time::Instant,
+        outcome: metrics::AttemptOutcome,
+        #[pin]
+        inner: S
+    }
+
+    impl<S> PinnedDrop for TimedDownload<S> {
+        fn drop(mut this: Pin<&mut Self>) {
+            metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
+        }
+    }
+}
+
+impl<S: AsyncRead> TimedDownload<S> {
+    fn new(started_at: std::time::Instant, inner: S) -> Self {
+        TimedDownload {
+            started_at,
+            outcome: metrics::AttemptOutcome::Cancelled,
+            inner,
+        }
+    }
+}
+
+impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
+    fn poll_read(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut io::ReadBuf<'_>,
+    ) -> std::task::Poll<std::io::Result<()>> {
+        let this = self.project();
+        let before = buf.filled().len();
+        let read = std::task::ready!(this.inner.poll_read(cx, buf));
+
+        let read_eof = buf.filled().len() == before;
+
+        match read {
+            Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
+            Ok(()) => { /* still in progress */ }
+            Err(_) => *this.outcome = AttemptOutcome::Err,
+        }
+
+        std::task::Poll::Ready(read)
+    }
+}
+
 #[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
    /// See the doc for `RemoteStorage::list_prefixes`
@@ -291,6 +313,8 @@ impl RemoteStorage for S3Bucket {
        &self,
        prefix: Option<&RemotePath>,
    ) -> Result<Vec<RemotePath>, DownloadError> {
+        let kind = RequestKind::List;
+
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
            .map(|p| self.relative_path_to_s3_object(p))
@@ -307,15 +331,10 @@ impl RemoteStorage for S3Bucket {
        let mut document_keys = Vec::new();

        let mut continuation_token = None;
-        loop {
-            let _guard = self
-                .concurrency_limiter
-                .acquire()
-                .await
-                .context("Concurrency limiter semaphore got closed during S3 list")
-                .map_err(DownloadError::Other)?;

-            metrics::inc_list_objects();
+        loop {
+            let _guard = self.permit(kind).await;
+            let started_at = start_measuring_requests(kind);

            let fetch_response = self
                .client
@@ -327,12 +346,16 @@ impl RemoteStorage for S3Bucket {
                .set_max_keys(self.max_keys_per_list_response)
                .send()
                .await
-                .map_err(|e| {
-                    metrics::inc_list_objects_fail();
-                    e
-                })
                .context("Failed to list S3 prefixes")
-                .map_err(DownloadError::Other)?;
+                .map_err(DownloadError::Other);
+
+            let started_at = ScopeGuard::into_inner(started_at);
+
+            metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &fetch_response, started_at);
+
+            let fetch_response = fetch_response?;

            document_keys.extend(
                fetch_response
@@ -342,10 +365,10 @@ impl RemoteStorage for S3Bucket {
                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
            );

-            match fetch_response.next_continuation_token {
-                Some(new_token) => continuation_token = Some(new_token),
+            continuation_token = match fetch_response.next_continuation_token {
+                Some(new_token) => Some(new_token),
                None => break,
-            }
+            };
        }

        Ok(document_keys)
@@ -353,6 +376,8 @@ impl RemoteStorage for S3Bucket {

    /// See the doc for `RemoteStorage::list_files`
    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        let kind = RequestKind::List;
+
        let folder_name = folder
            .map(|p| self.relative_path_to_s3_object(p))
            .or_else(|| self.prefix_in_bucket.clone());
@@ -361,12 +386,8 @@ impl RemoteStorage for S3Bucket {
        let mut continuation_token = None;
        let mut all_files = vec![];
        loop {
-            let _guard = self
-                .concurrency_limiter
-                .acquire()
-                .await
-                .context("Concurrency limiter semaphore got closed during S3 list_files")?;
-            metrics::inc_list_objects();
+            let _guard = self.permit(kind).await;
+            let started_at = start_measuring_requests(kind);

            let response = self
                .client
@@ -377,11 +398,14 @@ impl RemoteStorage for S3Bucket {
                .set_max_keys(self.max_keys_per_list_response)
                .send()
                .await
-                .map_err(|e| {
-                    metrics::inc_list_objects_fail();
-                    e
-                })
-                .context("Failed to list files in S3 bucket")?;
+                .context("Failed to list files in S3 bucket");
+
+            let started_at = ScopeGuard::into_inner(started_at);
+            metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &response, started_at);
+
+            let response = response?;

            for object in response.contents().unwrap_or_default() {
                let object_path = object.key().expect("response does not contain a key");
@@ -403,18 +427,16 @@ impl RemoteStorage for S3Bucket {
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
    ) -> anyhow::Result<()> {
-        let _guard = self
-            .concurrency_limiter
-            .acquire()
-            .await
-            .context("Concurrency limiter semaphore got closed during S3 upload")?;
+        let kind = RequestKind::Put;
+        let _guard = self.permit(kind).await;

-        metrics::inc_put_object();
+        let started_at = start_measuring_requests(kind);

        let body = Body::wrap_stream(ReaderStream::new(from));
        let bytes_stream = ByteStream::new(SdkBody::from(body));

-        self.client
+        let res = self
+            .client
            .put_object()
            .bucket(self.bucket_name.clone())
            .key(self.relative_path_to_s3_object(to))
@@ -422,11 +444,15 @@ impl RemoteStorage for S3Bucket {
            .content_length(from_size_bytes.try_into()?)
            .body(bytes_stream)
            .send()
-            .await
-            .map_err(|e| {
-                metrics::inc_put_object_fail();
-                e
-            })?;
+            .await;
+
+        let started_at = ScopeGuard::into_inner(started_at);
+        metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);
+
+        res?;
+
        Ok(())
    }

@@ -463,11 +489,8 @@ impl RemoteStorage for S3Bucket {
        .await
    }
    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
-        let _guard = self
-            .concurrency_limiter
-            .acquire()
-            .await
-            .context("Concurrency limiter semaphore got closed during S3 delete")?;
+        let kind = RequestKind::Delete;
+        let _guard = self.permit(kind).await;

        let mut delete_objects = Vec::with_capacity(paths.len());
        for path in paths {
@@ -478,7 +501,7 @@ impl RemoteStorage for S3Bucket {
        }

        for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
-            metrics::inc_delete_objects(chunk.len() as u64);
+            let started_at = start_measuring_requests(kind);

            let resp = self
                .client
@@ -488,10 +511,17 @@ impl RemoteStorage for S3Bucket {
                .send()
                .await;

+            let started_at = ScopeGuard::into_inner(started_at);
+            metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &resp, started_at);
+
            match resp {
                Ok(resp) => {
+                    metrics::BUCKET_METRICS
+                        .deleted_objects_total
+                        .inc_by(chunk.len() as u64);
                    if let Some(errors) = resp.errors {
-                        metrics::inc_delete_objects_fail(errors.len() as u64);
                        return Err(anyhow::format_err!(
                            "Failed to delete {} objects",
                            errors.len()
@@ -499,7 +529,6 @@ impl RemoteStorage for S3Bucket {
                    }
                }
                Err(e) => {
-                    metrics::inc_delete_objects_fail(chunk.len() as u64);
                    return Err(e.into());
                }
            }
@@ -508,28 +537,33 @@ impl RemoteStorage for S3Bucket {
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        let _guard = self
-            .concurrency_limiter
-            .acquire()
-            .await
-            .context("Concurrency limiter semaphore got closed during S3 delete")?;
-
-        metrics::inc_delete_object();
-
-        self.client
-            .delete_object()
-            .bucket(self.bucket_name.clone())
-            .key(self.relative_path_to_s3_object(path))
-            .send()
-            .await
-            .map_err(|e| {
-                metrics::inc_delete_object_fail();
-                e
-            })?;
-        Ok(())
+        let paths = std::array::from_ref(path);
+        self.delete_objects(paths).await
    }
 }

+/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
+fn start_counting_cancelled_wait(
+    kind: RequestKind,
+) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
+    scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
+        metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc()
+    })
+}
+
+/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`].
+fn start_measuring_requests(
+    kind: RequestKind,
+) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
+    scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
+        metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+            kind,
+            AttemptOutcome::Cancelled,
+            started_at,
+        )
+    })
+}
+
 #[cfg(test)]
 mod tests {
    use std::num::NonZeroUsize;
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -0,0 +1,191 @@
+use metrics::{
+    register_histogram_vec, register_int_counter, register_int_counter_vec, Histogram, IntCounter,
+};
+use once_cell::sync::Lazy;
+
+pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);
+
+#[derive(Clone, Copy, Debug)]
+pub(super) enum RequestKind {
+    Get = 0,
+    Put = 1,
+    Delete = 2,
+    List = 3,
+}
+
+use RequestKind::*;
+
+impl RequestKind {
+    const fn as_str(&self) -> &'static str {
+        match self {
+            Get => "get_object",
+            Put => "put_object",
+            Delete => "delete_object",
+            List => "list_objects",
+        }
+    }
+    const fn as_index(&self) -> usize {
+        *self as usize
+    }
+}
+
+pub(super) struct RequestTyped<C>([C; 4]);
+
+impl<C> RequestTyped<C> {
+    pub(super) fn get(&self, kind: RequestKind) -> &C {
+        &self.0[kind.as_index()]
+    }
+
+    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
+        use RequestKind::*;
+        let mut it = [Get, Put, Delete, List].into_iter();
+        let arr = std::array::from_fn::<C, 4, _>(|index| {
+            let next = it.next().unwrap();
+            assert_eq!(index, next.as_index());
+            f(next)
+        });
+
+        if let Some(next) = it.next() {
+            panic!("unexpected {next:?}");
+        }
+
+        RequestTyped(arr)
+    }
+}
+
+impl RequestTyped<Histogram> {
+    pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
+        self.get(kind).observe(started_at.elapsed().as_secs_f64())
+    }
+}
+
+pub(super) struct PassFailCancelledRequestTyped<C> {
+    success: RequestTyped<C>,
+    fail: RequestTyped<C>,
+    cancelled: RequestTyped<C>,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub(super) enum AttemptOutcome {
+    Ok,
+    Err,
+    Cancelled,
+}
+
+impl<T, E> From<&Result<T, E>> for AttemptOutcome {
+    fn from(value: &Result<T, E>) -> Self {
+        match value {
+            Ok(_) => AttemptOutcome::Ok,
+            Err(_) => AttemptOutcome::Err,
+        }
+    }
+}
+
+impl AttemptOutcome {
+    pub(super) fn as_str(&self) -> &'static str {
+        match self {
+            AttemptOutcome::Ok => "ok",
+            AttemptOutcome::Err => "err",
+            AttemptOutcome::Cancelled => "cancelled",
+        }
+    }
+}
+
+impl<C> PassFailCancelledRequestTyped<C> {
+    pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
+        let target = match outcome {
+            AttemptOutcome::Ok => &self.success,
+            AttemptOutcome::Err => &self.fail,
+            AttemptOutcome::Cancelled => &self.cancelled,
+        };
+        target.get(kind)
+    }
+
+    fn build_with(mut f: impl FnMut(RequestKind, AttemptOutcome) -> C) -> Self {
+        let success = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Ok));
+        let fail = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Err));
+        let cancelled = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Cancelled));
+
+        PassFailCancelledRequestTyped {
+            success,
+            fail,
+            cancelled,
+        }
+    }
+}
+
+impl PassFailCancelledRequestTyped<Histogram> {
+    pub(super) fn observe_elapsed(
+        &self,
+        kind: RequestKind,
+        outcome: impl Into<AttemptOutcome>,
+        started_at: std::time::Instant,
+    ) {
+        self.get(kind, outcome.into())
+            .observe(started_at.elapsed().as_secs_f64())
+    }
+}
+
+pub(super) struct BucketMetrics {
+    /// Full request duration until successful completion, error or cancellation.
+    pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
+    /// Total amount of seconds waited on queue.
+    pub(super) wait_seconds: RequestTyped<Histogram>,
+
+    /// Track how many semaphore awaits were cancelled per request type.
+    ///
+    /// This is in case cancellations are happening more than expected.
+    pub(super) cancelled_waits: RequestTyped<IntCounter>,
+
+    /// Total amount of deleted objects in batches or single requests.
+    pub(super) deleted_objects_total: IntCounter,
+}
+
+impl Default for BucketMetrics {
+    fn default() -> Self {
+        let buckets = [0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
+
+        let req_seconds = register_histogram_vec!(
+            "remote_storage_s3_request_seconds",
+            "Seconds to complete a request",
+            &["request_type", "result"],
+            buckets.to_vec(),
+        )
+        .unwrap();
+        let req_seconds = PassFailCancelledRequestTyped::build_with(|kind, outcome| {
+            req_seconds.with_label_values(&[kind.as_str(), outcome.as_str()])
+        });
+
+        let wait_seconds = register_histogram_vec!(
+            "remote_storage_s3_wait_seconds",
+            "Seconds rate limited",
+            &["request_type"],
+            buckets.to_vec(),
+        )
+        .unwrap();
+        let wait_seconds =
+            RequestTyped::build_with(|kind| wait_seconds.with_label_values(&[kind.as_str()]));
+
+        let cancelled_waits = register_int_counter_vec!(
+            "remote_storage_s3_cancelled_waits_total",
+            "Times a semaphore wait has been cancelled per request type",
+            &["request_type"],
+        )
+        .unwrap();
+        let cancelled_waits =
+            RequestTyped::build_with(|kind| cancelled_waits.with_label_values(&[kind.as_str()]));
+
+        let deleted_objects_total = register_int_counter!(
+            "remote_storage_s3_deleted_objects_total",
+            "Amount of deleted objects in total",
+        )
+        .unwrap();
+
+        Self {
+            req_seconds,
+            wait_seconds,
+            cancelled_waits,
+            deleted_objects_total,
+        }
+    }
+}
--- a/libs/utils/src/backoff.rs
+++ b/libs/utils/src/backoff.rs
@@ -0,0 +1,188 @@
+use std::fmt::{Debug, Display};
+
+use futures::Future;
+
+pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
+pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
+
+pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
+    let backoff_duration_seconds =
+        exponential_backoff_duration_seconds(n, base_increment, max_seconds);
+    if backoff_duration_seconds > 0.0 {
+        tracing::info!(
+            "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
+        );
+        tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
+    }
+}
+
+pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
+    if n == 0 {
+        0.0
+    } else {
+        (1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
+    }
+}
+
+/// retries passed operation until one of the following conditions are met:
+/// Encountered error is considered as permanent (non-retryable)
+/// Retries have been exhausted.
+/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors
+/// When attempts cross `warn_threshold` function starts to emit log warnings.
+/// `description` argument is added to log messages. Its value should identify the `op` is doing
+pub async fn retry<T, O, F, E>(
+    mut op: O,
+    is_permanent: impl Fn(&E) -> bool,
+    warn_threshold: u32,
+    max_retries: u32,
+    description: &str,
+) -> Result<T, E>
+where
+    // Not std::error::Error because anyhow::Error doesnt implement it.
+    // For context see https://github.com/dtolnay/anyhow/issues/63
+    E: Display + Debug,
+    O: FnMut() -> F,
+    F: Future<Output = Result<T, E>>,
+{
+    let mut attempts = 0;
+    loop {
+        let result = op().await;
+        match result {
+            Ok(_) => {
+                if attempts > 0 {
+                    tracing::info!("{description} succeeded after {attempts} retries");
+                }
+                return result;
+            }
+
+            // These are "permanent" errors that should not be retried.
+            Err(ref e) if is_permanent(e) => {
+                return result;
+            }
+            // Assume that any other failure might be transient, and the operation might
+            // succeed if we just keep trying.
+            Err(err) if attempts < warn_threshold => {
+                tracing::info!("{description} failed, will retry (attempt {attempts}): {err:#}");
+            }
+            Err(err) if attempts < max_retries => {
+                tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
+            }
+            Err(ref err) => {
+                // Operation failed `max_attempts` times. Time to give up.
+                tracing::warn!(
+                    "{description} still failed after {attempts} retries, giving up: {err:?}"
+                );
+                return result;
+            }
+        }
+        // sleep and retry
+        exponential_backoff(
+            attempts,
+            DEFAULT_BASE_BACKOFF_SECONDS,
+            DEFAULT_MAX_BACKOFF_SECONDS,
+        )
+        .await;
+        attempts += 1;
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::io;
+
+    use tokio::sync::Mutex;
+
+    use super::*;
+
+    #[test]
+    fn backoff_defaults_produce_growing_backoff_sequence() {
+        let mut current_backoff_value = None;
+
+        for i in 0..10_000 {
+            let new_backoff_value = exponential_backoff_duration_seconds(
+                i,
+                DEFAULT_BASE_BACKOFF_SECONDS,
+                DEFAULT_MAX_BACKOFF_SECONDS,
+            );
+
+            if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
+                assert!(
+                    old_backoff_value <= new_backoff_value,
+                    "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
+                )
+            }
+        }
+
+        assert_eq!(
+            current_backoff_value.expect("Should have produced backoff values to compare"),
+            DEFAULT_MAX_BACKOFF_SECONDS,
+            "Given big enough of retries, backoff should reach its allowed max value"
+        );
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn retry_always_error() {
+        let count = Mutex::new(0);
+        let err_result = retry(
+            || async {
+                *count.lock().await += 1;
+                Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other))
+            },
+            |_e| false,
+            1,
+            1,
+            "work",
+        )
+        .await;
+
+        assert!(err_result.is_err());
+
+        assert_eq!(*count.lock().await, 2);
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn retry_ok_after_err() {
+        let count = Mutex::new(0);
+        retry(
+            || async {
+                let mut locked = count.lock().await;
+                if *locked > 1 {
+                    Ok(())
+                } else {
+                    *locked += 1;
+                    Err(io::Error::from(io::ErrorKind::Other))
+                }
+            },
+            |_e| false,
+            2,
+            2,
+            "work",
+        )
+        .await
+        .unwrap();
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn dont_retry_permanent_errors() {
+        let count = Mutex::new(0);
+        let _ = retry(
+            || async {
+                let mut locked = count.lock().await;
+                if *locked > 1 {
+                    Ok(())
+                } else {
+                    *locked += 1;
+                    Err(io::Error::from(io::ErrorKind::Other))
+                }
+            },
+            |_e| true,
+            2,
+            2,
+            "work",
+        )
+        .await
+        .unwrap_err();
+
+        assert_eq!(*count.lock().await, 1);
+    }
+}
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -111,6 +111,10 @@ pub fn fsync(path: &Path) -> io::Result<()> {
        .map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}")))
 }

+pub async fn fsync_async(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
+    tokio::fs::File::open(path).await?.sync_all().await
+}
+
 #[cfg(test)]
 mod tests {
    use tempfile::tempdir;
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -24,6 +24,20 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool>
    Ok(dir.next_entry().await?.is_none())
 }

+pub async fn list_dir(path: impl AsRef<Path>) -> anyhow::Result<Vec<String>> {
+    let mut dir = tokio::fs::read_dir(&path)
+        .await
+        .context(format!("read_dir({})", path.as_ref().display()))?;
+
+    let mut content = vec![];
+    while let Some(next) = dir.next_entry().await? {
+        let file_name = next.file_name();
+        content.push(file_name.to_string_lossy().to_string());
+    }
+
+    Ok(content)
+}
+
 pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
    if e.kind() == io::ErrorKind::NotFound {
        Ok(())
@@ -43,7 +57,7 @@ where
 mod test {
    use std::path::PathBuf;

-    use crate::fs_ext::is_directory_empty;
+    use crate::fs_ext::{is_directory_empty, list_dir};

    use super::ignore_absent_files;

@@ -109,4 +123,25 @@ mod test {

        assert!(!file_path.exists());
    }
+
+    #[tokio::test]
+    async fn list_dir_works() {
+        let dir = tempfile::tempdir().unwrap();
+        let dir_path = dir.path();
+
+        assert!(list_dir(dir_path).await.unwrap().is_empty());
+
+        let file_path: PathBuf = dir_path.join("testfile");
+        let _ = std::fs::File::create(&file_path).unwrap();
+
+        assert_eq!(&list_dir(dir_path).await.unwrap(), &["testfile"]);
+
+        let another_dir_path: PathBuf = dir_path.join("testdir");
+        std::fs::create_dir(another_dir_path).unwrap();
+
+        let expected = &["testdir", "testfile"];
+        let mut actual = list_dir(dir_path).await.unwrap();
+        actual.sort();
+        assert_eq!(actual, expected);
+    }
 }
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -1,6 +1,8 @@
 //! `utils` is intended to be a place to put code that is shared
 //! between other crates in this repository.

+pub mod backoff;
+
 /// `Lsn` type implements common tasks on Log Sequence Numbers
 pub mod lsn;
 /// SeqWait allows waiting for a future sequence number to arrive
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -23,6 +23,7 @@
 //!      <https://grafana.com/tutorials/build-a-panel-plugin/>
 use anyhow::Result;
 use pageserver::repository::Key;
+use pageserver::METADATA_FILE_NAME;
 use std::cmp::Ordering;
 use std::io::{self, BufRead};
 use std::path::PathBuf;
@@ -71,6 +72,10 @@ pub fn main() -> Result<()> {
        let line = PathBuf::from_str(&line).unwrap();
        let filename = line.file_name().unwrap();
        let filename = filename.to_str().unwrap();
+        if filename == METADATA_FILE_NAME {
+            // Don't try and parse "metadata" like a key-lsn range
+            continue;
+        }
        let range = parse_filename(filename);
        ranges.push(range);
    }
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -107,23 +107,25 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
    // min-heap (reserve space for one more element added before eviction)
    let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
    let mut prev_key: Option<Key> = None;
-    tree_reader.visit(
-        &[0u8; DELTA_KEY_SIZE],
-        VisitDirection::Forwards,
-        |key, _value| {
-            let curr = Key::from_slice(&key[..KEY_SIZE]);
-            if let Some(prev) = prev_key {
-                if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH {
-                    heap.push(Hole(prev..curr));
-                    if heap.len() > max_holes {
-                        heap.pop(); // remove smallest hole
+    tree_reader
+        .visit(
+            &[0u8; DELTA_KEY_SIZE],
+            VisitDirection::Forwards,
+            |key, _value| {
+                let curr = Key::from_slice(&key[..KEY_SIZE]);
+                if let Some(prev) = prev_key {
+                    if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH {
+                        heap.push(Hole(prev..curr));
+                        if heap.len() > max_holes {
+                            heap.pop(); // remove smallest hole
+                        }
                    }
                }
-            }
-            prev_key = Some(curr.next());
-            true
-        },
-    )?;
+                prev_key = Some(curr.next());
+                true
+            },
+        )
+        .await?;
    let mut holes = heap.into_vec();
    holes.sort_by_key(|hole| hole.0.start);
    Ok(holes)
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -59,18 +59,20 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
    );
    // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
    let mut all = vec![];
-    tree_reader.visit(
-        &[0u8; DELTA_KEY_SIZE],
-        VisitDirection::Forwards,
-        |key, value_offset| {
-            let curr = Key::from_slice(&key[..KEY_SIZE]);
-            all.push((curr, BlobRef(value_offset)));
-            true
-        },
-    )?;
+    tree_reader
+        .visit(
+            &[0u8; DELTA_KEY_SIZE],
+            VisitDirection::Forwards,
+            |key, value_offset| {
+                let curr = Key::from_slice(&key[..KEY_SIZE]);
+                all.push((curr, BlobRef(value_offset)));
+                true
+            },
+        )
+        .await?;
    let cursor = BlockCursor::new(&file);
    for (k, v) in all {
-        let value = cursor.read_blob(v.pos())?;
+        let value = cursor.read_blob(v.pos()).await?;
        println!("key:{} value_len:{}", k, value.len());
    }
    // TODO(chi): special handling for last key?
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -9,8 +9,10 @@ use clap::{Arg, ArgAction, Command};
 use fail::FailScenario;
 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
+use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use remote_storage::GenericRemoteStorage;
+use tokio::time::Instant;
 use tracing::*;

 use metrics::set_build_info_metric;
@@ -38,8 +40,6 @@ const PID_FILE_NAME: &str = "pageserver.pid";
 const FEATURES: &[&str] = &[
    #[cfg(feature = "testing")]
    "testing",
-    #[cfg(feature = "fail/failpoints")]
-    "fail/failpoints",
 ];

 fn version() -> String {
@@ -226,6 +226,19 @@ fn start_pageserver(
    launch_ts: &'static LaunchTimestamp,
    conf: &'static PageServerConf,
 ) -> anyhow::Result<()> {
+    // Monotonic time for later calculating startup duration
+    let started_startup_at = Instant::now();
+
+    let startup_checkpoint = move |phase: &str, human_phase: &str| {
+        let elapsed = started_startup_at.elapsed();
+        let secs = elapsed.as_secs_f64();
+        STARTUP_DURATION.with_label_values(&[phase]).set(secs);
+        info!(
+            elapsed_ms = elapsed.as_millis(),
+            "{human_phase} ({secs:.3}s since start)"
+        )
+    };
+
    // Print version and launch timestamp to the log,
    // and expose them as prometheus metrics.
    // A changed version string indicates changed software.
@@ -335,6 +348,11 @@ fn start_pageserver(
    // Set up remote storage client
    let remote_storage = create_remote_storage_client(conf)?;

+    // Up to this point no significant I/O has been done: this should have been fast.  Record
+    // duration prior to starting I/O intensive phase of startup.
+    startup_checkpoint("initial", "Starting loading tenants");
+    STARTUP_IS_LOADING.set(1);
+
    // Startup staging or optimizing:
    //
    // We want to minimize downtime for `page_service` connections, and trying not to overload
@@ -355,12 +373,11 @@ fn start_pageserver(
    let order = pageserver::InitializationOrder {
        initial_tenant_load: Some(init_done_tx),
        initial_logical_size_can_start: init_done_rx.clone(),
-        initial_logical_size_attempt: init_logical_size_done_tx,
+        initial_logical_size_attempt: Some(init_logical_size_done_tx),
        background_jobs_can_start: background_jobs_barrier.clone(),
    };

    // Scan the local 'tenants/' directory and start loading the tenants
-    let init_started_at = std::time::Instant::now();
    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();

    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
@@ -378,18 +395,13 @@ fn start_pageserver(
            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));

            init_done_rx.wait().await;
+            startup_checkpoint("initial_tenant_load", "Initial load completed");
+            STARTUP_IS_LOADING.set(0);
+
            // initial logical sizes can now start, as they were waiting on init_done_rx.

            scopeguard::ScopeGuard::into_inner(guard);

-            let init_done = std::time::Instant::now();
-            let elapsed = init_done - init_started_at;
-
-            tracing::info!(
-                elapsed_millis = elapsed.as_millis(),
-                "Initial load completed"
-            );
-
            let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());

            let timeout = conf.background_task_maximum_delay;
@@ -398,12 +410,7 @@ fn start_pageserver(

            let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
                Ok(_) => {
-                    let now = std::time::Instant::now();
-                    tracing::info!(
-                        from_init_done_millis = (now - init_done).as_millis(),
-                        from_init_millis = (now - init_started_at).as_millis(),
-                        "Initial logical sizes completed"
-                    );
+                    startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
                    None
                }
                Err(_) => {
@@ -419,6 +426,7 @@ fn start_pageserver(

            // allow background jobs to start
            drop(background_jobs_can_start);
+            startup_checkpoint("background_jobs_can_start", "Starting background jobs");

            if let Some(init_sizes_done) = init_sizes_done {
                // ending up here is not a bug; at the latest logical sizes will be queried by
@@ -428,14 +436,11 @@ fn start_pageserver(

                scopeguard::ScopeGuard::into_inner(guard);

-                let now = std::time::Instant::now();
-                tracing::info!(
-                    from_init_done_millis = (now - init_done).as_millis(),
-                    from_init_millis = (now - init_started_at).as_millis(),
-                    "Initial logical sizes completed after timeout (background jobs already started)"
-                );
+                startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");

            }
+
+            startup_checkpoint("complete", "Startup complete");
        };

        async move {
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -31,7 +31,9 @@ use utils::{
 use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
+use crate::tenant::{
+    TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
+};
 use crate::{
    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
    TIMELINE_UNINIT_MARK_SUFFIX,
@@ -613,6 +615,11 @@ impl PageServerConf {
        )
    }

+    pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
+        self.tenant_path(tenant_id)
+            .join(TENANT_DELETED_MARKER_FILE_NAME)
+    }
+
    pub fn traces_path(&self) -> PathBuf {
        self.workdir.join("traces")
    }
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -85,6 +85,7 @@
 //! The solution is that all code paths are infected with precisely one
 //! [`RequestContext`] argument. Functions in the middle of the call chain
 //! only need to pass it on.
+
 use crate::task_mgr::TaskKind;

 // The main structure of this module, see module-level comment.
@@ -92,6 +93,7 @@ use crate::task_mgr::TaskKind;
 pub struct RequestContext {
    task_kind: TaskKind,
    download_behavior: DownloadBehavior,
+    access_stats_behavior: AccessStatsBehavior,
 }

 /// Desired behavior if the operation requires an on-demand download
@@ -109,6 +111,67 @@ pub enum DownloadBehavior {
    Error,
 }

+/// Whether this request should update access times used in LRU eviction
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub(crate) enum AccessStatsBehavior {
+    /// Update access times: this request's access to data should be taken
+    /// as a hint that the accessed layer is likely to be accessed again
+    Update,
+
+    /// Do not update access times: this request is accessing the layer
+    /// but does not want to indicate that the layer should be retained in cache,
+    /// perhaps because the requestor is a compaction routine that will soon cover
+    /// this layer with another.
+    Skip,
+}
+
+pub struct RequestContextBuilder {
+    inner: RequestContext,
+}
+
+impl RequestContextBuilder {
+    /// A new builder with default settings
+    pub fn new(task_kind: TaskKind) -> Self {
+        Self {
+            inner: RequestContext {
+                task_kind,
+                download_behavior: DownloadBehavior::Download,
+                access_stats_behavior: AccessStatsBehavior::Update,
+            },
+        }
+    }
+
+    pub fn extend(original: &RequestContext) -> Self {
+        Self {
+            // This is like a Copy, but avoid implementing Copy because ordinary users of
+            // RequestContext should always move or ref it.
+            inner: RequestContext {
+                task_kind: original.task_kind,
+                download_behavior: original.download_behavior,
+                access_stats_behavior: original.access_stats_behavior,
+            },
+        }
+    }
+
+    /// Configure the DownloadBehavior of the context: whether to
+    /// download missing layers, and/or warn on the download.
+    pub fn download_behavior(mut self, b: DownloadBehavior) -> Self {
+        self.inner.download_behavior = b;
+        self
+    }
+
+    /// Configure the AccessStatsBehavior of the context: whether layer
+    /// accesses should update the access time of the layer.
+    pub(crate) fn access_stats_behavior(mut self, b: AccessStatsBehavior) -> Self {
+        self.inner.access_stats_behavior = b;
+        self
+    }
+
+    pub fn build(self) -> RequestContext {
+        self.inner
+    }
+}
+
 impl RequestContext {
    /// Create a new RequestContext that has no parent.
    ///
@@ -123,10 +186,9 @@ impl RequestContext {
    /// because someone explicitly canceled it.
    /// It has no parent, so it cannot inherit cancellation from there.
    pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
-        RequestContext {
-            task_kind,
-            download_behavior,
-        }
+        RequestContextBuilder::new(task_kind)
+            .download_behavior(download_behavior)
+            .build()
    }

    /// Create a detached child context for a task that may outlive `self`.
@@ -187,10 +249,7 @@ impl RequestContext {
    }

    fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
-        RequestContext {
-            task_kind,
-            download_behavior,
-        }
+        Self::new(task_kind, download_behavior)
    }

    pub fn task_kind(&self) -> TaskKind {
@@ -200,4 +259,8 @@ impl RequestContext {
    pub fn download_behavior(&self) -> DownloadBehavior {
        self.download_behavior
    }
+
+    pub(crate) fn access_stats_behavior(&self) -> AccessStatsBehavior {
+        self.access_stats_behavior
+    }
 }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -304,17 +304,18 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // Debug-log the list of candidates
    let now = SystemTime::now();
    for (i, (partition, candidate)) in candidates.iter().enumerate() {
+        let desc = candidate.layer.layer_desc();
        debug!(
            "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
            i + 1,
            candidates.len(),
-            candidate.layer.file_size(),
+            desc.file_size,
            now.duration_since(candidate.last_activity_ts)
                .unwrap()
                .as_micros(),
            partition,
-            candidate.layer.get_tenant_id(),
-            candidate.layer.get_timeline_id(),
+            desc.tenant_id,
+            desc.timeline_id,
            candidate.layer,
        );
    }
@@ -346,7 +347,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
            warned = Some(usage_planned);
        }

-        usage_planned.add_available_bytes(candidate.layer.file_size());
+        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);

        batched
            .entry(TimelineKey(candidate.timeline))
@@ -389,15 +390,16 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                Ok(results) => {
                    assert_eq!(results.len(), batch.len());
                    for (result, layer) in results.into_iter().zip(batch.iter()) {
+                        let file_size = layer.layer_desc().file_size;
                        match result {
                            Some(Ok(())) => {
-                                usage_assumed.add_available_bytes(layer.file_size());
+                                usage_assumed.add_available_bytes(file_size);
                            }
                            Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
                                unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
                            }
                            Some(Err(EvictionError::FileNotFound)) => {
-                                evictions_failed.file_sizes += layer.file_size();
+                                evictions_failed.file_sizes += file_size;
                                evictions_failed.count += 1;
                            }
                            Some(Err(
@@ -406,7 +408,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                            )) => {
                                let e = utils::error::report_compact_sources(&e);
                                warn!(%layer, "failed to evict layer: {e}");
-                                evictions_failed.file_sizes += layer.file_size();
+                                evictions_failed.file_sizes += file_size;
                                evictions_failed.count += 1;
                            }
                            None => {
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -93,6 +93,47 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+    delete:
+      description: |
+        Attempts to delete specified tenant. 500 and 409 errors should be retried until 404 is retrieved.
+        404 means that deletion successfully finished"
+      responses:
+        "400":
+          description: Error when no tenant id found in path
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "404":
+          description: Tenant not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/NotFoundError"
+        "409":
+          description: Deletion is already in progress, continue polling
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ConflictError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"

  /v1/tenant/{tenant_id}/timeline:
    parameters:
@@ -820,6 +861,7 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+
  /v1/tenant/config:
    put:
      description: |
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -187,7 +187,7 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
                format!("Cannot delete timeline which has child timelines: {children:?}")
                    .into_boxed_str(),
            ),
-            a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
+            a @ AlreadyInProgress(_) => ApiError::Conflict(a.to_string()),
            Other(e) => ApiError::InternalServerError(e),
        }
    }
@@ -208,6 +208,19 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
    }
 }

+impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
+    fn from(value: crate::tenant::delete::DeleteTenantError) -> Self {
+        use crate::tenant::delete::DeleteTenantError::*;
+        match value {
+            Get(g) => ApiError::from(g),
+            e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
+            Timeline(t) => ApiError::from(t),
+            Other(o) => ApiError::InternalServerError(o),
+            e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
+        }
+    }
+}
+
 // Helper function to construct a TimelineInfo struct for a timeline
 async fn build_timeline_info(
    timeline: &Arc<Timeline>,
@@ -617,6 +630,23 @@ async fn tenant_status(
    json_response(StatusCode::OK, tenant_info)
 }

+async fn tenant_delete_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    // TODO openapi spec
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let state = get_state(&request);
+
+    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id)
+        .instrument(info_span!("tenant_delete_handler", %tenant_id))
+        .await?;
+
+    json_response(StatusCode::ACCEPTED, ())
+}
+
 /// HTTP endpoint to query the current tenant_size of a tenant.
 ///
 /// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used
@@ -1345,6 +1375,9 @@ pub fn make_router(
        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
+        .delete("/v1/tenant/:tenant_id", |r| {
+            api_handler(r, tenant_delete_handler)
+        })
        .get("/v1/tenant/:tenant_id/synthetic_size", |r| {
            api_handler(r, tenant_size_handler)
        })
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -7,7 +7,7 @@ pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub mod keyspace;
-pub(crate) mod metrics;
+pub mod metrics;
 pub mod page_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
@@ -47,50 +47,54 @@ pub use crate::metrics::preinitialize_metrics;

 #[tracing::instrument]
 pub async fn shutdown_pageserver(exit_code: i32) {
+    use std::time::Duration;
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
-    task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await;
+    timed(
+        task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None),
+        "shutdown LibpqEndpointListener",
+        Duration::from_secs(1),
+    )
+    .await;

    // Shut down any page service tasks.
-    task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await;
+    timed(
+        task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
+        "shutdown PageRequestHandlers",
+        Duration::from_secs(1),
+    )
+    .await;

    // Shut down all the tenants. This flushes everything to disk and kills
    // the checkpoint and GC tasks.
-    tenant::mgr::shutdown_all_tenants().await;
+    timed(
+        tenant::mgr::shutdown_all_tenants(),
+        "shutdown all tenants",
+        Duration::from_secs(5),
+    )
+    .await;

    // Shut down the HTTP endpoint last, so that you can still check the server's
    // status while it's shutting down.
    // FIXME: We should probably stop accepting commands like attach/detach earlier.
-    task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await;
+    timed(
+        task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None),
+        "shutdown http",
+        Duration::from_secs(1),
+    )
+    .await;

    // There should be nothing left, but let's be sure
-    task_mgr::shutdown_tasks(None, None, None).await;
+    timed(
+        task_mgr::shutdown_tasks(None, None, None),
+        "shutdown leftovers",
+        Duration::from_secs(1),
+    )
+    .await;
    info!("Shut down successfully completed");
    std::process::exit(exit_code);
 }

-const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
-const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
-
-async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
-    let backoff_duration_seconds =
-        exponential_backoff_duration_seconds(n, base_increment, max_seconds);
-    if backoff_duration_seconds > 0.0 {
-        info!(
-            "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
-        );
-        tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
-    }
-}
-
-pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
-    if n == 0 {
-        0.0
-    } else {
-        (1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
-    }
-}
-
 /// The name of the metadata file pageserver creates per timeline.
 /// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
 pub const METADATA_FILE_NAME: &str = "metadata";
@@ -164,7 +168,7 @@ pub struct InitializationOrder {

    /// Each timeline owns a clone of this to be consumed on the initial logical size calculation
    /// attempt. It is important to drop this once the attempt has completed.
-    pub initial_logical_size_attempt: utils::completion::Completion,
+    pub initial_logical_size_attempt: Option<utils::completion::Completion>,

    /// Barrier for when we can start any background jobs.
    ///
@@ -172,33 +176,75 @@ pub struct InitializationOrder {
    pub background_jobs_can_start: utils::completion::Barrier,
 }

-#[cfg(test)]
-mod backoff_defaults_tests {
-    use super::*;
+/// Time the future with a warning when it exceeds a threshold.
+async fn timed<Fut: std::future::Future>(
+    fut: Fut,
+    name: &str,
+    warn_at: std::time::Duration,
+) -> <Fut as std::future::Future>::Output {
+    let started = std::time::Instant::now();

-    #[test]
-    fn backoff_defaults_produce_growing_backoff_sequence() {
-        let mut current_backoff_value = None;
+    let mut fut = std::pin::pin!(fut);

-        for i in 0..10_000 {
-            let new_backoff_value = exponential_backoff_duration_seconds(
-                i,
-                DEFAULT_BASE_BACKOFF_SECONDS,
-                DEFAULT_MAX_BACKOFF_SECONDS,
+    match tokio::time::timeout(warn_at, &mut fut).await {
+        Ok(ret) => {
+            tracing::info!(
+                task = name,
+                elapsed_ms = started.elapsed().as_millis(),
+                "completed"
+            );
+            ret
+        }
+        Err(_) => {
+            tracing::info!(
+                task = name,
+                elapsed_ms = started.elapsed().as_millis(),
+                "still waiting, taking longer than expected..."
            );

-            if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
-                assert!(
-                    old_backoff_value <= new_backoff_value,
-                    "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
-                )
-            }
-        }
+            let ret = fut.await;

-        assert_eq!(
-            current_backoff_value.expect("Should have produced backoff values to compare"),
-            DEFAULT_MAX_BACKOFF_SECONDS,
-            "Given big enough of retries, backoff should reach its allowed max value"
-        );
+            // this has a global allowed_errors
+            tracing::warn!(
+                task = name,
+                elapsed_ms = started.elapsed().as_millis(),
+                "completed, took longer than expected"
+            );
+
+            ret
+        }
+    }
+}
+
+#[cfg(test)]
+mod timed_tests {
+    use super::timed;
+    use std::time::Duration;
+
+    #[tokio::test]
+    async fn timed_completes_when_inner_future_completes() {
+        // A future that completes on time should have its result returned
+        let r1 = timed(
+            async move {
+                tokio::time::sleep(Duration::from_millis(10)).await;
+                123
+            },
+            "test 1",
+            Duration::from_millis(50),
+        )
+        .await;
+        assert_eq!(r1, 123);
+
+        // A future that completes too slowly should also have its result returned
+        let r1 = timed(
+            async move {
+                tokio::time::sleep(Duration::from_millis(50)).await;
+                456
+            },
+            "test 1",
+            Duration::from_millis(10),
+        )
+        .await;
+        assert_eq!(r1, 456);
    }
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,9 +1,9 @@
 use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
-    register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
-    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge,
-    register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter,
-    IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
+    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
+    register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
+    register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
+    HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
 use strum::VariantNames;
@@ -394,6 +394,35 @@ pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(||
    .expect("failed to define a metric")
 });

+/// How long did we take to start up?  Broken down by labels to describe
+/// different phases of startup.
+pub static STARTUP_DURATION: Lazy<GaugeVec> = Lazy::new(|| {
+    register_gauge_vec!(
+        "pageserver_startup_duration_seconds",
+        "Time taken by phases of pageserver startup, in seconds",
+        &["phase"]
+    )
+    .expect("Failed to register pageserver_startup_duration_seconds metric")
+});
+
+pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_startup_is_loading",
+        "1 while in initial startup load of tenants, 0 at other times"
+    )
+    .expect("Failed to register pageserver_startup_is_loading")
+});
+
+/// How long did tenants take to go from construction to active state?
+pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_tenant_activation_seconds",
+        "Time taken by tenants to activate, in seconds",
+        CRITICAL_OP_BUCKETS.into()
+    )
+    .expect("Failed to register pageserver_tenant_activation_seconds metric")
+});
+
 /// Each `Timeline`'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
 #[derive(Debug)]
 pub struct EvictionsWithLowResidenceDuration {
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -28,6 +28,7 @@ use std::cmp::min;
 use std::collections::hash_map::Entry;
 use std::collections::BTreeSet;
 use std::collections::HashMap;
+use std::fmt::Debug;
 use std::fs;
 use std::fs::File;
 use std::fs::OpenOptions;
@@ -46,8 +47,10 @@ use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};

 use self::config::TenantConf;
+use self::delete::DeleteTenantFlow;
 use self::metadata::LoadMetadataError;
 use self::metadata::TimelineMetadata;
+use self::mgr::TenantsMap;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineUninitMark;
 use self::timeline::uninit::UninitializedTimeline;
@@ -56,6 +59,7 @@ use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir;
 use crate::is_uninit_mark;
+use crate::metrics::TENANT_ACTIVATION;
 use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
 use crate::repository::GcResult;
 use crate::task_mgr;
@@ -105,6 +109,7 @@ macro_rules! pausable_failpoint {

 pub mod blob_io;
 pub mod block_io;
+
 pub mod disk_btree;
 pub(crate) mod ephemeral_file;
 pub mod layer_map;
@@ -117,6 +122,7 @@ mod remote_timeline_client;
 pub mod storage_layer;

 pub mod config;
+pub mod delete;
 pub mod mgr;
 pub mod tasks;
 pub mod upload_queue;
@@ -144,6 +150,8 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines";

 pub const TENANT_ATTACHING_MARKER_FILENAME: &str = "attaching";

+pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
+
 ///
 /// Tenant consists of multiple timelines. Keep them in a hash table.
 ///
@@ -182,6 +190,8 @@ pub struct Tenant {
    cached_synthetic_tenant_size: Arc<AtomicU64>,

    eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
+
+    pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
 }

 // We should not blindly overwrite local metadata with remote one.
@@ -273,7 +283,7 @@ pub enum LoadLocalTimelineError {
    ResumeDeletion(#[source] anyhow::Error),
 }

-#[derive(Debug, thiserror::Error)]
+#[derive(thiserror::Error)]
 pub enum DeleteTimelineError {
    #[error("NotFound")]
    NotFound,
@@ -282,17 +292,37 @@ pub enum DeleteTimelineError {
    HasChildren(Vec<TimelineId>),

    #[error("Timeline deletion is already in progress")]
-    AlreadyInProgress,
+    AlreadyInProgress(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>),

    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }

+impl Debug for DeleteTimelineError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::NotFound => write!(f, "NotFound"),
+            Self::HasChildren(c) => f.debug_tuple("HasChildren").field(c).finish(),
+            Self::AlreadyInProgress(_) => f.debug_tuple("AlreadyInProgress").finish(),
+            Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
+        }
+    }
+}
+
 pub enum SetStoppingError {
    AlreadyStopping(completion::Barrier),
    Broken,
 }

+impl Debug for SetStoppingError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::AlreadyStopping(_) => f.debug_tuple("AlreadyStopping").finish(),
+            Self::Broken => write!(f, "Broken"),
+        }
+    }
+}
+
 struct RemoteStartupData {
    index_part: IndexPart,
    remote_metadata: TimelineMetadata,
@@ -615,7 +645,7 @@ impl Tenant {
        // For every timeline, download the metadata file, scan the local directory,
        // and build a layer map that contains an entry for each remote and local
        // layer file.
-        let sorted_timelines = tree_sort_timelines(timeline_ancestors)?;
+        let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?;
        for (timeline_id, remote_metadata) in sorted_timelines {
            let (index_part, remote_client) = remote_index_and_client
                .remove(&timeline_id)
@@ -739,12 +769,13 @@ impl Tenant {
    /// If the loading fails for some reason, the Tenant will go into Broken
    /// state.
    #[instrument(skip_all, fields(tenant_id=%tenant_id))]
-    pub fn spawn_load(
+    pub(crate) fn spawn_load(
        conf: &'static PageServerConf,
        tenant_id: TenantId,
        broker_client: storage_broker::BrokerClientChannel,
        remote_storage: Option<GenericRemoteStorage>,
        init_order: Option<InitializationOrder>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        ctx: &RequestContext,
    ) -> Arc<Tenant> {
        span::debug_assert_current_span_has_tenant_id();
@@ -764,7 +795,7 @@ impl Tenant {
            tenant_conf,
            wal_redo_manager,
            tenant_id,
-            remote_storage,
+            remote_storage.clone(),
        );
        let tenant = Arc::new(tenant);

@@ -780,27 +811,83 @@ impl Tenant {
            "initial tenant load",
            false,
            async move {
+                let make_broken = |t: &Tenant, err: anyhow::Error| {
+                    error!("load failed, setting tenant state to Broken: {err:?}");
+                    t.state.send_modify(|state| {
+                        assert!(
+                            matches!(*state, TenantState::Loading | TenantState::Stopping { .. }),
+                            "the loading task owns the tenant state until activation is complete"
+                        );
+                        *state = TenantState::broken_from_reason(err.to_string());
+                    });
+                };
+
                let mut init_order = init_order;

                // take the completion because initial tenant loading will complete when all of
                // these tasks complete.
-                let _completion = init_order.as_mut().and_then(|x| x.initial_tenant_load.take());
+                let _completion = init_order
+                    .as_mut()
+                    .and_then(|x| x.initial_tenant_load.take());
+
+                // Dont block pageserver startup on figuring out deletion status
+                let pending_deletion = {
+                    match DeleteTenantFlow::should_resume_deletion(
+                        conf,
+                        remote_storage.as_ref(),
+                        &tenant_clone,
+                    )
+                    .await
+                    {
+                        Ok(should_resume_deletion) => should_resume_deletion,
+                        Err(err) => {
+                            make_broken(&tenant_clone, anyhow::anyhow!(err));
+                            return Ok(());
+                        }
+                    }
+                };
+
+                info!("pending deletion {}", pending_deletion.is_some());
+
+                if let Some(deletion) = pending_deletion {
+                    // as we are no longer loading, signal completion by dropping
+                    // the completion while we resume deletion
+                    drop(_completion);
+                    // do not hold to initial_logical_size_attempt as it will prevent loading from proceeding without timeout
+                    let _ = init_order
+                        .as_mut()
+                        .and_then(|x| x.initial_logical_size_attempt.take());
+
+                    match DeleteTenantFlow::resume(
+                        deletion,
+                        &tenant_clone,
+                        init_order.as_ref(),
+                        tenants,
+                        &ctx,
+                    )
+                    .await
+                    {
+                        Err(err) => {
+                            make_broken(&tenant_clone, anyhow::anyhow!(err));
+                            return Ok(());
+                        }
+                        Ok(()) => return Ok(()),
+                    }
+                }
+
+                let background_jobs_can_start =
+                    init_order.as_ref().map(|x| &x.background_jobs_can_start);

                match tenant_clone.load(init_order.as_ref(), &ctx).await {
                    Ok(()) => {
-                        debug!("load finished, activating");
-                        let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
+                        debug!("load finished",);
+
                        tenant_clone.activate(broker_client, background_jobs_can_start, &ctx);
                    }
-                    Err(err) => {
-                        error!("load failed, setting tenant state to Broken: {err:?}");
-                        tenant_clone.state.send_modify(|state| {
-                            assert_eq!(*state, TenantState::Loading, "the loading task owns the tenant state until activation is complete");
-                            *state = TenantState::broken_from_reason(err.to_string());
-                        });
-                    }
+                    Err(err) => make_broken(&tenant_clone, err),
                }
-               Ok(())
+
+                Ok(())
            }
            .instrument({
                let span = tracing::info_span!(parent: None, "load", tenant_id=%tenant_id);
@@ -876,6 +963,8 @@ impl Tenant {
                        )
                    })?;

+                info!("Found deletion mark for timeline {}", timeline_id);
+
                match load_metadata(self.conf, &self.tenant_id, &timeline_id) {
                    Ok(metadata) => {
                        timelines_to_resume_deletion.push((timeline_id, Some(metadata)))
@@ -965,9 +1054,11 @@ impl Tenant {

        // Sort the array of timeline IDs into tree-order, so that parent comes before
        // all its children.
-        tree_sort_timelines(timelines_to_load).map(|sorted_timelines| TenantDirectoryScan {
-            sorted_timelines_to_load: sorted_timelines,
-            timelines_to_resume_deletion,
+        tree_sort_timelines(timelines_to_load, |m| m.ancestor_timeline()).map(|sorted_timelines| {
+            TenantDirectoryScan {
+                sorted_timelines_to_load: sorted_timelines,
+                timelines_to_resume_deletion,
+            }
        })
    }

@@ -1639,6 +1730,8 @@ impl Tenant {
                    post_state = <&'static str>::from(&*current_state),
                    "activation attempt finished"
                );
+
+                TENANT_ACTIVATION.observe(elapsed.as_secs_f64());
            });
        }
    }
@@ -1679,7 +1772,7 @@ impl Tenant {
        // It's mesed up.
        // we just ignore the failure to stop

-        match self.set_stopping(shutdown_progress).await {
+        match self.set_stopping(shutdown_progress, false).await {
            Ok(()) => {}
            Err(SetStoppingError::Broken) => {
                // assume that this is acceptable
@@ -1719,18 +1812,25 @@ impl Tenant {
    /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state.
    ///
    /// This function is not cancel-safe!
-    async fn set_stopping(&self, progress: completion::Barrier) -> Result<(), SetStoppingError> {
+    ///
+    /// `allow_transition_from_loading` is needed for the special case of loading task deleting the tenant.
+    async fn set_stopping(
+        &self,
+        progress: completion::Barrier,
+        allow_transition_from_loading: bool,
+    ) -> Result<(), SetStoppingError> {
        let mut rx = self.state.subscribe();

        // cannot stop before we're done activating, so wait out until we're done activating
        rx.wait_for(|state| match state {
-            TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
+            TenantState::Activating(_) | TenantState::Attaching => {
                info!(
                    "waiting for {} to turn Active|Broken|Stopping",
                    <&'static str>::from(state)
                );
                false
            }
+            TenantState::Loading => allow_transition_from_loading,
            TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
        })
        .await
@@ -1739,9 +1839,16 @@ impl Tenant {
        // we now know we're done activating, let's see whether this task is the winner to transition into Stopping
        let mut err = None;
        let stopping = self.state.send_if_modified(|current_state| match current_state {
-            TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
+            TenantState::Activating(_) | TenantState::Attaching => {
                unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
            }
+            TenantState::Loading => {
+                if !allow_transition_from_loading {
+                    unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
+                };
+                *current_state = TenantState::Stopping { progress };
+                true
+            }
            TenantState::Active => {
                // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
                // are created after the transition to Stopping. That's harmless, as the Timelines
@@ -1810,6 +1917,10 @@ impl Tenant {
        .expect("cannot drop self.state while on a &self method");

        // we now know we're done activating, let's see whether this task is the winner to transition into Broken
+        self.set_broken_no_wait(reason)
+    }
+
+    pub(crate) fn set_broken_no_wait(&self, reason: String) {
        self.state.send_modify(|current_state| {
            match *current_state {
                TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
@@ -1875,22 +1986,28 @@ impl Tenant {
 /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
 /// perform a topological sort, so that the parent of each timeline comes
 /// before the children.
-fn tree_sort_timelines(
-    timelines: HashMap<TimelineId, TimelineMetadata>,
-) -> anyhow::Result<Vec<(TimelineId, TimelineMetadata)>> {
+/// E extracts the ancestor from T
+/// This allows for T to be different. It can be TimelineMetadata, can be Timeline itself, etc.
+fn tree_sort_timelines<T, E>(
+    timelines: HashMap<TimelineId, T>,
+    extractor: E,
+) -> anyhow::Result<Vec<(TimelineId, T)>>
+where
+    E: Fn(&T) -> Option<TimelineId>,
+{
    let mut result = Vec::with_capacity(timelines.len());

    let mut now = Vec::with_capacity(timelines.len());
    // (ancestor, children)
-    let mut later: HashMap<TimelineId, Vec<(TimelineId, TimelineMetadata)>> =
+    let mut later: HashMap<TimelineId, Vec<(TimelineId, T)>> =
        HashMap::with_capacity(timelines.len());

-    for (timeline_id, metadata) in timelines {
-        if let Some(ancestor_id) = metadata.ancestor_timeline() {
+    for (timeline_id, value) in timelines {
+        if let Some(ancestor_id) = extractor(&value) {
            let children = later.entry(ancestor_id).or_default();
-            children.push((timeline_id, metadata));
+            children.push((timeline_id, value));
        } else {
-            now.push((timeline_id, metadata));
+            now.push((timeline_id, value));
        }
    }

@@ -2059,7 +2176,7 @@ impl Tenant {
            remote_client,
            pg_version,
            initial_logical_size_can_start.cloned(),
-            initial_logical_size_attempt.cloned(),
+            initial_logical_size_attempt.cloned().flatten(),
            state,
        );

@@ -2143,6 +2260,7 @@ impl Tenant {
            cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
            cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
            eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
+            delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
        }
    }

@@ -2159,6 +2277,7 @@ impl Tenant {
        // FIXME If the config file is not found, assume that we're attaching
        // a detached tenant and config is passed via attach command.
        // https://github.com/neondatabase/neon/issues/1555
+        // OR: we're loading after incomplete deletion that managed to remove config.
        if !target_config_path.exists() {
            info!("tenant config not found in {target_config_display}");
            return Ok(TenantConfOpt::default());
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -21,14 +21,14 @@ where
    R: BlockReader,
 {
    /// Read a blob into a new buffer.
-    pub fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
+    pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
        let mut buf = Vec::new();
-        self.read_blob_into_buf(offset, &mut buf)?;
+        self.read_blob_into_buf(offset, &mut buf).await?;
        Ok(buf)
    }
    /// Read blob into the given buffer. Any previous contents in the buffer
    /// are overwritten.
-    pub fn read_blob_into_buf(
+    pub async fn read_blob_into_buf(
        &self,
        offset: u64,
        dstbuf: &mut Vec<u8>,
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -2,8 +2,7 @@
 //! Low-level Block-oriented I/O functions
 //!

-use crate::page_cache;
-use crate::page_cache::{ReadBufResult, PAGE_SZ};
+use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
 use bytes::Bytes;
 use std::ops::{Deref, DerefMut};
 use std::os::unix::fs::FileExt;
@@ -15,14 +14,12 @@ use std::sync::atomic::AtomicU64;
 /// There are currently two implementations: EphemeralFile, and FileBlockReader
 /// below.
 pub trait BlockReader {
-    type BlockLease: Deref<Target = [u8; PAGE_SZ]> + 'static;
-
    ///
    /// Read a block. Returns a "lease" object that can be used to
    /// access to the contents of the page. (For the page cache, the
    /// lease object represents a lock on the buffer.)
    ///
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error>;
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error>;

    ///
    /// Create a new "cursor" for reading from this reader.
@@ -41,13 +38,48 @@ impl<B> BlockReader for &B
 where
    B: BlockReader,
 {
-    type BlockLease = B::BlockLease;
-
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        (*self).read_blk(blknum)
    }
 }

+/// A block accessible for reading
+///
+/// During builds with `#[cfg(test)]`, this is a proper enum
+/// with two variants to support testing code. During normal
+/// builds, it just has one variant and is thus a cheap newtype
+/// wrapper of [`PageReadGuard`]
+pub enum BlockLease {
+    PageReadGuard(PageReadGuard<'static>),
+    #[cfg(test)]
+    Rc(std::rc::Rc<[u8; PAGE_SZ]>),
+}
+
+impl From<PageReadGuard<'static>> for BlockLease {
+    fn from(value: PageReadGuard<'static>) -> Self {
+        BlockLease::PageReadGuard(value)
+    }
+}
+
+#[cfg(test)]
+impl From<std::rc::Rc<[u8; PAGE_SZ]>> for BlockLease {
+    fn from(value: std::rc::Rc<[u8; PAGE_SZ]>) -> Self {
+        BlockLease::Rc(value)
+    }
+}
+
+impl Deref for BlockLease {
+    type Target = [u8; PAGE_SZ];
+
+    fn deref(&self) -> &Self::Target {
+        match self {
+            BlockLease::PageReadGuard(v) => v.deref(),
+            #[cfg(test)]
+            BlockLease::Rc(v) => v.deref(),
+        }
+    }
+}
+
 ///
 /// A "cursor" for efficiently reading multiple pages from a BlockReader
 ///
@@ -80,7 +112,7 @@ where
        BlockCursor { reader }
    }

-    pub fn read_blk(&self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
+    pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        self.reader.read_blk(blknum)
    }
 }
@@ -118,9 +150,7 @@ impl<F> BlockReader for FileBlockReader<F>
 where
    F: FileExt,
 {
-    type BlockLease = page_cache::PageReadGuard<'static>;
-
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        // Look up the right page
        let cache = page_cache::get();
        loop {
@@ -132,7 +162,7 @@ where
                        format!("Failed to read immutable buf: {e:#}"),
                    )
                })? {
-                ReadBufResult::Found(guard) => break Ok(guard),
+                ReadBufResult::Found(guard) => break Ok(guard.into()),
                ReadBufResult::NotFound(mut write_guard) => {
                    // Read the page from disk into the buffer
                    self.fill_buffer(write_guard.deref_mut(), blknum)?;
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -0,0 +1,572 @@
+use std::{
+    path::{Path, PathBuf},
+    sync::Arc,
+};
+
+use anyhow::Context;
+use pageserver_api::models::TenantState;
+use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
+use tokio::sync::OwnedMutexGuard;
+use tracing::{error, info, instrument, warn, Instrument, Span};
+
+use utils::{
+    backoff, completion, crashsafe, fs_ext,
+    id::{TenantId, TimelineId},
+};
+
+use crate::{
+    config::PageServerConf,
+    context::RequestContext,
+    task_mgr::{self, TaskKind},
+    InitializationOrder,
+};
+
+use super::{
+    mgr::{GetTenantError, TenantsMap},
+    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
+    span,
+    timeline::delete::DeleteTimelineFlow,
+    tree_sort_timelines, DeleteTimelineError, Tenant,
+};
+
+const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;
+
+#[derive(Debug, thiserror::Error)]
+pub enum DeleteTenantError {
+    #[error("GetTenant {0}")]
+    Get(#[from] GetTenantError),
+
+    #[error("Invalid state {0}. Expected Active or Broken")]
+    InvalidState(TenantState),
+
+    #[error("Tenant deletion is already in progress")]
+    AlreadyInProgress,
+
+    #[error("Timeline {0}")]
+    Timeline(#[from] DeleteTimelineError),
+
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
+
+fn remote_tenant_delete_mark_path(
+    conf: &PageServerConf,
+    tenant_id: &TenantId,
+) -> anyhow::Result<RemotePath> {
+    let tenant_remote_path = conf
+        .tenant_path(tenant_id)
+        .strip_prefix(&conf.workdir)
+        .context("Failed to strip workdir prefix")
+        .and_then(RemotePath::new)
+        .context("tenant path")?;
+    Ok(tenant_remote_path.join(Path::new("deleted")))
+}
+
+async fn create_remote_delete_mark(
+    conf: &PageServerConf,
+    remote_storage: &GenericRemoteStorage,
+    tenant_id: &TenantId,
+) -> Result<(), DeleteTenantError> {
+    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;
+
+    let data: &[u8] = &[];
+    backoff::retry(
+        || async {
+            remote_storage
+                .upload(data, 0, &remote_mark_path, None)
+                .await
+        },
+        |_e| false,
+        FAILED_UPLOAD_WARN_THRESHOLD,
+        FAILED_REMOTE_OP_RETRIES,
+        "mark_upload",
+    )
+    .await
+    .context("mark_upload")?;
+
+    Ok(())
+}
+
+async fn create_local_delete_mark(
+    conf: &PageServerConf,
+    tenant_id: &TenantId,
+) -> Result<(), DeleteTenantError> {
+    let marker_path = conf.tenant_deleted_mark_file_path(tenant_id);
+
+    // Note: we're ok to replace existing file.
+    let _ = std::fs::OpenOptions::new()
+        .write(true)
+        .create(true)
+        .open(&marker_path)
+        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
+
+    crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
+
+    Ok(())
+}
+
+async fn schedule_ordered_timeline_deletions(
+    tenant: &Arc<Tenant>,
+) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
+    // Tenant is stopping at this point. We know it will be deleted.
+    // No new timelines should be created.
+    // Tree sort timelines to delete from leafs to the root.
+    // NOTE: by calling clone we release the mutex which creates a possibility for a race: pending deletion
+    // can complete and remove timeline from the map in between our call to clone
+    // and `DeleteTimelineFlow::run`, so `run` wont find timeline in `timelines` map.
+    // timelines.lock is currently synchronous so we cant hold it across await point.
+    // So just ignore NotFound error if we get it from `run`.
+    // Beware: in case it becomes async and we try to hold it here, `run` also locks it, which can create a deadlock.
+    let timelines = tenant.timelines.lock().unwrap().clone();
+    let sorted =
+        tree_sort_timelines(timelines, |t| t.get_ancestor_timeline_id()).context("tree sort")?;
+
+    let mut already_running_deletions = vec![];
+
+    for (timeline_id, _) in sorted.into_iter().rev() {
+        if let Err(e) = DeleteTimelineFlow::run(tenant, timeline_id, true).await {
+            match e {
+                DeleteTimelineError::NotFound => {
+                    // Timeline deletion finished after call to clone above but before call
+                    // to `DeleteTimelineFlow::run` and removed timeline from the map.
+                    continue;
+                }
+                DeleteTimelineError::AlreadyInProgress(guard) => {
+                    already_running_deletions.push((guard, timeline_id));
+                    continue;
+                }
+                e => return Err(DeleteTenantError::Timeline(e)),
+            }
+        }
+    }
+
+    Ok(already_running_deletions)
+}
+
+async fn ensure_timelines_dir_empty(timelines_path: &Path) -> Result<(), DeleteTenantError> {
+    // Assert timelines dir is empty.
+    if !fs_ext::is_directory_empty(timelines_path).await? {
+        // Display first 10 items in directory
+        let list = &fs_ext::list_dir(timelines_path).await.context("list_dir")?[..10];
+        return Err(DeleteTenantError::Other(anyhow::anyhow!(
+            "Timelines directory is not empty after all timelines deletion: {list:?}"
+        )));
+    }
+
+    Ok(())
+}
+
+async fn remove_tenant_remote_delete_mark(
+    conf: &PageServerConf,
+    remote_storage: Option<&GenericRemoteStorage>,
+    tenant_id: &TenantId,
+) -> Result<(), DeleteTenantError> {
+    if let Some(remote_storage) = remote_storage {
+        let path = remote_tenant_delete_mark_path(conf, tenant_id)?;
+        backoff::retry(
+            || async { remote_storage.delete(&path).await },
+            |_e| false,
+            FAILED_UPLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "remove_tenant_remote_delete_mark",
+        )
+        .await
+        .context("remove_tenant_remote_delete_mark")?;
+    }
+    Ok(())
+}
+
+// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
+async fn cleanup_remaining_fs_traces(
+    conf: &PageServerConf,
+    tenant_id: &TenantId,
+) -> Result<(), DeleteTenantError> {
+    let rm = |p: PathBuf, is_dir: bool| async move {
+        if is_dir {
+            tokio::fs::remove_dir(&p).await
+        } else {
+            tokio::fs::remove_file(&p).await
+        }
+        .or_else(fs_ext::ignore_not_found)
+        .with_context(|| {
+            let to_display = p.display();
+            format!("failed to delete {to_display}")
+        })
+    };
+
+    rm(conf.tenant_config_path(tenant_id), false).await?;
+
+    fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: tenant-delete-before-remove-timelines-dir"
+        ))?
+    });
+
+    rm(conf.timelines_path(tenant_id), true).await?;
+
+    fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: tenant-delete-before-remove-deleted-mark"
+        ))?
+    });
+
+    // Make sure previous deletions are ordered before mark removal.
+    // Otherwise there is no guarantee that they reach the disk before mark deletion.
+    // So its possible for mark to reach disk first and for other deletions
+    // to be reordered later and thus missed if a crash occurs.
+    // Note that we dont need to sync after mark file is removed
+    // because we can tolerate the case when mark file reappears on startup.
+    let tenant_path = &conf.tenant_path(tenant_id);
+    if tenant_path.exists() {
+        crashsafe::fsync_async(&conf.tenant_path(tenant_id))
+            .await
+            .context("fsync_pre_mark_remove")?;
+    }
+
+    rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
+
+    fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: tenant-delete-before-remove-tenant-dir"
+        ))?
+    });
+
+    rm(conf.tenant_path(tenant_id), true).await?;
+
+    Ok(())
+}
+
+/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
+/// and deletes its data from both disk and s3.
+/// The sequence of steps:
+/// 1. Upload remote deletion mark.
+/// 2. Create local mark file.
+/// 3. Shutdown tasks
+/// 4. Run ordered timeline deletions
+/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested
+/// 6. Remove remote mark
+/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
+/// It is resumable from any step in case a crash/restart occurs.
+/// There are three entrypoints to the process:
+/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
+/// 2. [`DeleteTenantFlow::resume`] is called during restarts when local or remote deletion marks are still there.
+/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
+#[derive(Default)]
+pub enum DeleteTenantFlow {
+    #[default]
+    NotStarted,
+    InProgress,
+    Finished,
+}
+
+impl DeleteTenantFlow {
+    // These steps are run in the context of management api request handler.
+    // Long running steps are continued to run in the background.
+    // NB: If this fails half-way through, and is retried, the retry will go through
+    // all the same steps again. Make sure the code here is idempotent, and don't
+    // error out if some of the shutdown tasks have already been completed!
+    // NOTE: static needed for background part.
+    // We assume that calling code sets up the span with tenant_id.
+    #[instrument(skip_all)]
+    pub(crate) async fn run(
+        conf: &'static PageServerConf,
+        remote_storage: Option<GenericRemoteStorage>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenant_id: TenantId,
+    ) -> Result<(), DeleteTenantError> {
+        span::debug_assert_current_span_has_tenant_id();
+
+        let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;
+
+        if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
+            tenant.set_broken(format!("{e:#}")).await;
+            return Err(e);
+        }
+
+        Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
+
+        Ok(())
+    }
+
+    // Helper function needed to be able to match once on returned error and transition tenant into broken state.
+    // This is needed because tenant.shutwodn is not idempotent. If tenant state is set to stopping another call to tenant.shutdown
+    // will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
+    // So the solution is to set tenant state to broken.
+    async fn run_inner(
+        guard: &mut OwnedMutexGuard<Self>,
+        conf: &'static PageServerConf,
+        remote_storage: Option<&GenericRemoteStorage>,
+        tenant: &Tenant,
+    ) -> Result<(), DeleteTenantError> {
+        guard.mark_in_progress()?;
+
+        fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-create-remote-mark"
+            ))?
+        });
+
+        // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend.
+        // Though sounds scary, different mark name?
+        // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
+        if let Some(remote_storage) = &remote_storage {
+            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id)
+                .await
+                .context("remote_mark")?
+        }
+
+        fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-create-local-mark"
+            ))?
+        });
+
+        create_local_delete_mark(conf, &tenant.tenant_id)
+            .await
+            .context("local delete mark")?;
+
+        fail::fail_point!("tenant-delete-before-background", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-background"
+            ))?
+        });
+
+        Ok(())
+    }
+
+    fn mark_in_progress(&mut self) -> anyhow::Result<()> {
+        match self {
+            Self::Finished => anyhow::bail!("Bug. Is in finished state"),
+            Self::InProgress { .. } => { /* We're in a retry */ }
+            Self::NotStarted => { /* Fresh start */ }
+        }
+
+        *self = Self::InProgress;
+
+        Ok(())
+    }
+
+    pub async fn should_resume_deletion(
+        conf: &'static PageServerConf,
+        remote_storage: Option<&GenericRemoteStorage>,
+        tenant: &Tenant,
+    ) -> Result<Option<DeletionGuard>, DeleteTenantError> {
+        let acquire = |t: &Tenant| {
+            Some(
+                Arc::clone(&t.delete_progress)
+                    .try_lock_owned()
+                    .expect("we're the only owner during init"),
+            )
+        };
+
+        let tenant_id = tenant.tenant_id;
+        // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
+        if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
+            return Ok(acquire(tenant));
+        }
+
+        let remote_storage = match remote_storage {
+            Some(remote_storage) => remote_storage,
+            None => return Ok(None),
+        };
+
+        // If remote storage is there we rely on it
+        let remote_mark_path = remote_tenant_delete_mark_path(conf, &tenant_id)?;
+
+        let result = backoff::retry(
+            || async { remote_storage.download(&remote_mark_path).await },
+            |e| matches!(e, DownloadError::NotFound),
+            SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
+            SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
+            "fetch_tenant_deletion_mark",
+        )
+        .await;
+
+        match result {
+            Ok(_) => Ok(acquire(tenant)),
+            Err(DownloadError::NotFound) => Ok(None),
+            Err(e) => Err(anyhow::anyhow!(e)).context("should_resume_deletion")?,
+        }
+    }
+
+    pub(crate) async fn resume(
+        guard: DeletionGuard,
+        tenant: &Arc<Tenant>,
+        init_order: Option<&InitializationOrder>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        ctx: &RequestContext,
+    ) -> Result<(), DeleteTenantError> {
+        let (_, progress) = completion::channel();
+
+        tenant
+            .set_stopping(progress, true)
+            .await
+            .expect("cant be stopping or broken");
+
+        // Do not consume valuable resources during the load phase, continue deletion once init phase is complete.
+        let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
+        if let Some(background) = background_jobs_can_start {
+            info!("waiting for backgound jobs barrier");
+            background.clone().wait().await;
+            info!("ready for backgound jobs barrier");
+        }
+
+        // Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e g remove timelines dir)
+        let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
+        if timelines_path.exists() {
+            tenant.load(init_order, ctx).await.context("load")?;
+        }
+
+        Self::background(
+            guard,
+            tenant.conf,
+            tenant.remote_storage.clone(),
+            tenants,
+            tenant,
+        )
+        .await
+    }
+
+    async fn prepare(
+        tenants: &tokio::sync::RwLock<TenantsMap>,
+        tenant_id: TenantId,
+    ) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
+        let m = tenants.read().await;
+
+        let tenant = m
+            .get(&tenant_id)
+            .ok_or(GetTenantError::NotFound(tenant_id))?;
+
+        // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
+        // so at least for now allow deletions only for active tenants. TODO recheck
+        // Broken and Stopping is needed for retries.
+        if !matches!(
+            tenant.current_state(),
+            TenantState::Active | TenantState::Broken { .. }
+        ) {
+            return Err(DeleteTenantError::InvalidState(tenant.current_state()));
+        }
+
+        let guard = Arc::clone(&tenant.delete_progress)
+            .try_lock_owned()
+            .map_err(|_| DeleteTenantError::AlreadyInProgress)?;
+
+        fail::fail_point!("tenant-delete-before-shutdown", |_| {
+            Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
+        });
+
+        // make pageserver shutdown not to wait for our completion
+        let (_, progress) = completion::channel();
+
+        // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent.
+        // i e it is an error to do:
+        // tenant.set_stopping
+        // tenant.shutdown
+        // Its also bad that we're holding tenants.read here.
+        // TODO relax set_stopping to be idempotent?
+        if tenant.shutdown(progress, false).await.is_err() {
+            return Err(DeleteTenantError::Other(anyhow::anyhow!(
+                "tenant shutdown is already in progress"
+            )));
+        }
+
+        Ok((Arc::clone(tenant), guard))
+    }
+
+    fn schedule_background(
+        guard: OwnedMutexGuard<Self>,
+        conf: &'static PageServerConf,
+        remote_storage: Option<GenericRemoteStorage>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenant: Arc<Tenant>,
+    ) {
+        let tenant_id = tenant.tenant_id;
+
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            TaskKind::TimelineDeletionWorker,
+            Some(tenant_id),
+            None,
+            "tenant_delete",
+            false,
+            async move {
+                if let Err(err) =
+                    Self::background(guard, conf, remote_storage, tenants, &tenant).await
+                {
+                    error!("Error: {err:#}");
+                    tenant.set_broken(format!("{err:#}")).await;
+                };
+                Ok(())
+            }
+            .instrument({
+                let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id);
+                span.follows_from(Span::current());
+                span
+            }),
+        );
+    }
+
+    async fn background(
+        mut guard: OwnedMutexGuard<Self>,
+        conf: &PageServerConf,
+        remote_storage: Option<GenericRemoteStorage>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenant: &Arc<Tenant>,
+    ) -> Result<(), DeleteTenantError> {
+        // Tree sort timelines, schedule delete for them. Mention retries from the console side.
+        // Note that if deletion fails we dont mark timelines as broken,
+        // the whole tenant will become broken as by `Self::schedule_background` logic
+        let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
+            .await
+            .context("schedule_ordered_timeline_deletions")?;
+
+        fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-polling-ongoing-deletions"
+            ))?
+        });
+
+        // Wait for deletions that were already running at the moment when tenant deletion was requested.
+        // When we can lock deletion guard it means that corresponding timeline deletion finished.
+        for (guard, timeline_id) in already_running_timeline_deletions {
+            let flow = guard.lock().await;
+            if !flow.is_finished() {
+                return Err(DeleteTenantError::Other(anyhow::anyhow!(
+                    "already running timeline deletion failed: {timeline_id}"
+                )));
+            }
+        }
+
+        let timelines_path = conf.timelines_path(&tenant.tenant_id);
+        // May not exist if we fail in cleanup_remaining_fs_traces after removing it
+        if timelines_path.exists() {
+            // sanity check to guard against layout changes
+            ensure_timelines_dir_empty(&timelines_path)
+                .await
+                .context("timelines dir not empty")?;
+        }
+
+        remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?;
+
+        fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
+            ))?
+        });
+
+        cleanup_remaining_fs_traces(conf, &tenant.tenant_id)
+            .await
+            .context("cleanup_remaining_fs_traces")?;
+
+        let mut locked = tenants.write().await;
+        if locked.remove(&tenant.tenant_id).is_none() {
+            warn!("Tenant got removed from tenants map during deletion");
+        };
+
+        *guard = Self::Finished;
+
+        Ok(())
+    }
+}
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -20,6 +20,7 @@
 //!
 use byteorder::{ReadBytesExt, BE};
 use bytes::{BufMut, Bytes, BytesMut};
+use either::Either;
 use hex;
 use std::{cmp::Ordering, io, result};
 use thiserror::Error;
@@ -230,14 +231,15 @@ where
    ///
    /// Read the value for given key. Returns the value, or None if it doesn't exist.
    ///
-    pub fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
+    pub async fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
        let mut result: Option<u64> = None;
        self.visit(search_key, VisitDirection::Forwards, |key, value| {
            if key == search_key {
                result = Some(value);
            }
            false
-        })?;
+        })
+        .await?;
        Ok(result)
    }

@@ -246,7 +248,7 @@ where
    /// will be called for every key >= 'search_key' (or <= 'search_key', if scanning
    /// backwards)
    ///
-    pub fn visit<V>(
+    pub async fn visit<V>(
        &self,
        search_key: &[u8; L],
        dir: VisitDirection,
@@ -255,117 +257,77 @@ where
    where
        V: FnMut(&[u8], u64) -> bool,
    {
-        self.search_recurse(self.root_blk, search_key, dir, &mut visitor)
-    }
+        let mut stack = Vec::new();
+        stack.push((self.root_blk, None));
+        while let Some((node_blknum, opt_iter)) = stack.pop() {
+            // Locate the node.
+            let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?;

-    fn search_recurse<V>(
-        &self,
-        node_blknum: u32,
-        search_key: &[u8; L],
-        dir: VisitDirection,
-        visitor: &mut V,
-    ) -> Result<bool>
-    where
-        V: FnMut(&[u8], u64) -> bool,
-    {
-        // Locate the node.
-        let blk = self.reader.read_blk(self.start_blk + node_blknum)?;
+            let node = OnDiskNode::deparse(node_buf.as_ref())?;
+            let prefix_len = node.prefix_len as usize;
+            let suffix_len = node.suffix_len as usize;

-        // Search all entries on this node
-        self.search_node(blk.as_ref(), search_key, dir, visitor)
-    }
+            assert!(node.num_children > 0);

-    fn search_node<V>(
-        &self,
-        node_buf: &[u8],
-        search_key: &[u8; L],
-        dir: VisitDirection,
-        visitor: &mut V,
-    ) -> Result<bool>
-    where
-        V: FnMut(&[u8], u64) -> bool,
-    {
-        let node = OnDiskNode::deparse(node_buf)?;
-        let prefix_len = node.prefix_len as usize;
-        let suffix_len = node.suffix_len as usize;
+            let mut keybuf = Vec::new();
+            keybuf.extend(node.prefix);
+            keybuf.resize(prefix_len + suffix_len, 0);

-        assert!(node.num_children > 0);
-
-        let mut keybuf = Vec::new();
-        keybuf.extend(node.prefix);
-        keybuf.resize(prefix_len + suffix_len, 0);
-
-        if dir == VisitDirection::Forwards {
-            // Locate the first match
-            let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
-                Ok(idx) => idx,
-                Err(idx) => {
-                    if node.level == 0 {
-                        // Imagine that the node contains the following keys:
-                        //
-                        // 1
-                        // 3  <-- idx
-                        // 5
-                        //
-                        // If the search key is '2' and there is exact match,
-                        // the binary search would return the index of key
-                        // '3'. That's cool, '3' is the first key to return.
+            let mut iter = if let Some(iter) = opt_iter {
+                iter
+            } else if dir == VisitDirection::Forwards {
+                // Locate the first match
+                let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
+                    Ok(idx) => idx,
+                    Err(idx) => {
+                        if node.level == 0 {
+                            // Imagine that the node contains the following keys:
+                            //
+                            // 1
+                            // 3  <-- idx
+                            // 5
+                            //
+                            // If the search key is '2' and there is exact match,
+                            // the binary search would return the index of key
+                            // '3'. That's cool, '3' is the first key to return.
+                            idx
+                        } else {
+                            // This is an internal page, so each key represents a lower
+                            // bound for what's in the child page. If there is no exact
+                            // match, we have to return the *previous* entry.
+                            //
+                            // 1  <-- return this
+                            // 3  <-- idx
+                            // 5
+                            idx.saturating_sub(1)
+                        }
+                    }
+                };
+                Either::Left(idx..node.num_children.into())
+            } else {
+                let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
+                    Ok(idx) => {
+                        // Exact match. That's the first entry to return, and walk
+                        // backwards from there.
                        idx
-                    } else {
-                        // This is an internal page, so each key represents a lower
-                        // bound for what's in the child page. If there is no exact
-                        // match, we have to return the *previous* entry.
-                        //
-                        // 1  <-- return this
-                        // 3  <-- idx
-                        // 5
-                        idx.saturating_sub(1)
                    }
-                }
-            };
-            // idx points to the first match now. Keep going from there
-            let mut key_off = idx * suffix_len;
-            while idx < node.num_children as usize {
-                let suffix = &node.keys[key_off..key_off + suffix_len];
-                keybuf[prefix_len..].copy_from_slice(suffix);
-                let value = node.value(idx);
-                #[allow(clippy::collapsible_if)]
-                if node.level == 0 {
-                    // leaf
-                    if !visitor(&keybuf, value.to_u64()) {
-                        return Ok(false);
+                    Err(idx) => {
+                        // No exact match. The binary search returned the index of the
+                        // first key that's > search_key. Back off by one, and walk
+                        // backwards from there.
+                        if let Some(idx) = idx.checked_sub(1) {
+                            idx
+                        } else {
+                            return Ok(false);
+                        }
                    }
-                } else {
-                    #[allow(clippy::collapsible_if)]
-                    if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? {
-                        return Ok(false);
-                    }
-                }
-                idx += 1;
-                key_off += suffix_len;
-            }
-        } else {
-            let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
-                Ok(idx) => {
-                    // Exact match. That's the first entry to return, and walk
-                    // backwards from there. (The loop below starts from 'idx -
-                    // 1', so add one here to compensate.)
-                    idx + 1
-                }
-                Err(idx) => {
-                    // No exact match. The binary search returned the index of the
-                    // first key that's > search_key. Back off by one, and walk
-                    // backwards from there. (The loop below starts from idx - 1,
-                    // so we don't need to subtract one here)
-                    idx
-                }
+                };
+                Either::Right((0..=idx).rev())
            };

-            // idx points to the first match + 1 now. Keep going from there.
-            let mut key_off = idx * suffix_len;
-            while idx > 0 {
-                idx -= 1;
-                key_off -= suffix_len;
+            // idx points to the first match now. Keep going from there
+            while let Some(idx) = iter.next() {
+                let key_off = idx * suffix_len;
                let suffix = &node.keys[key_off..key_off + suffix_len];
                keybuf[prefix_len..].copy_from_slice(suffix);
                let value = node.value(idx);
@@ -376,12 +338,8 @@ where
                        return Ok(false);
                    }
                } else {
-                    #[allow(clippy::collapsible_if)]
-                    if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? {
-                        return Ok(false);
-                    }
-                }
-                if idx == 0 {
+                    stack.push((node_blknum, Some(iter)));
+                    stack.push((value.to_blknum(), None));
                    break;
                }
            }
@@ -727,6 +685,7 @@ impl<const L: usize> BuildNode<L> {
 #[cfg(test)]
 mod tests {
    use super::*;
+    use crate::tenant::block_io::BlockLease;
    use rand::Rng;
    use std::collections::BTreeMap;
    use std::sync::atomic::{AtomicUsize, Ordering};
@@ -741,12 +700,10 @@ mod tests {
        }
    }
    impl BlockReader for TestDisk {
-        type BlockLease = std::rc::Rc<[u8; PAGE_SZ]>;
-
-        fn read_blk(&self, blknum: u32) -> io::Result<Self::BlockLease> {
+        fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
            let mut buf = [0u8; PAGE_SZ];
            buf.copy_from_slice(&self.blocks[blknum as usize]);
-            Ok(std::rc::Rc::new(buf))
+            Ok(std::rc::Rc::new(buf).into())
        }
    }
    impl BlockWriter for &mut TestDisk {
@@ -782,12 +739,12 @@ mod tests {

        // Test the `get` function on all the keys.
        for (key, val) in all_data.iter() {
-            assert_eq!(reader.get(key)?, Some(*val));
+            assert_eq!(reader.get(key).await?, Some(*val));
        }
        // And on some keys that don't exist
-        assert_eq!(reader.get(b"aaaaaa")?, None);
-        assert_eq!(reader.get(b"zzzzzz")?, None);
-        assert_eq!(reader.get(b"xaaabx")?, None);
+        assert_eq!(reader.get(b"aaaaaa").await?, None);
+        assert_eq!(reader.get(b"zzzzzz").await?, None);
+        assert_eq!(reader.get(b"xaaabx").await?, None);

        // Test search with `visit` function
        let search_key = b"xabaaa";
@@ -798,10 +755,12 @@ mod tests {
            .collect();

        let mut data = Vec::new();
-        reader.visit(search_key, VisitDirection::Forwards, |key, value| {
-            data.push((key.to_vec(), value));
-            true
-        })?;
+        reader
+            .visit(search_key, VisitDirection::Forwards, |key, value| {
+                data.push((key.to_vec(), value));
+                true
+            })
+            .await?;
        assert_eq!(data, expected);

        // Test a backwards scan
@@ -812,16 +771,20 @@ mod tests {
            .collect();
        expected.reverse();
        let mut data = Vec::new();
-        reader.visit(search_key, VisitDirection::Backwards, |key, value| {
-            data.push((key.to_vec(), value));
-            true
-        })?;
+        reader
+            .visit(search_key, VisitDirection::Backwards, |key, value| {
+                data.push((key.to_vec(), value));
+                true
+            })
+            .await?;
        assert_eq!(data, expected);

        // Backward scan where nothing matches
-        reader.visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
-            panic!("found unexpected key {}: {}", hex::encode(key), value);
-        })?;
+        reader
+            .visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
+                panic!("found unexpected key {}: {}", hex::encode(key), value);
+            })
+            .await?;

        // Full scan
        let expected: Vec<(Vec<u8>, u64)> = all_data
@@ -829,10 +792,12 @@ mod tests {
            .map(|(key, value)| (key.to_vec(), *value))
            .collect();
        let mut data = Vec::new();
-        reader.visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
-            data.push((key.to_vec(), value));
-            true
-        })?;
+        reader
+            .visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
+                data.push((key.to_vec(), value));
+                true
+            })
+            .await?;
        assert_eq!(data, expected);

        Ok(())
@@ -880,13 +845,15 @@ mod tests {
        for search_key_int in 0..(NUM_KEYS * 2 + 10) {
            let search_key = u64::to_be_bytes(search_key_int);
            assert_eq!(
-                reader.get(&search_key)?,
+                reader.get(&search_key).await?,
                all_data.get(&search_key_int).cloned()
            );

            // Test a forward scan starting with this key
            result.lock().unwrap().clear();
-            reader.visit(&search_key, VisitDirection::Forwards, take_ten)?;
+            reader
+                .visit(&search_key, VisitDirection::Forwards, take_ten)
+                .await?;
            let expected = all_data
                .range(search_key_int..)
                .take(10)
@@ -896,7 +863,9 @@ mod tests {

            // And a backwards scan
            result.lock().unwrap().clear();
-            reader.visit(&search_key, VisitDirection::Backwards, take_ten)?;
+            reader
+                .visit(&search_key, VisitDirection::Backwards, take_ten)
+                .await?;
            let expected = all_data
                .range(..=search_key_int)
                .rev()
@@ -910,7 +879,9 @@ mod tests {
        let search_key = u64::to_be_bytes(0);
        limit.store(usize::MAX, Ordering::Relaxed);
        result.lock().unwrap().clear();
-        reader.visit(&search_key, VisitDirection::Forwards, take_ten)?;
+        reader
+            .visit(&search_key, VisitDirection::Forwards, take_ten)
+            .await?;
        let expected = all_data
            .iter()
            .map(|(&key, &val)| (key, val))
@@ -921,7 +892,9 @@ mod tests {
        let search_key = u64::to_be_bytes(u64::MAX);
        limit.store(usize::MAX, Ordering::Relaxed);
        result.lock().unwrap().clear();
-        reader.visit(&search_key, VisitDirection::Backwards, take_ten)?;
+        reader
+            .visit(&search_key, VisitDirection::Backwards, take_ten)
+            .await?;
        let expected = all_data
            .iter()
            .rev()
@@ -932,8 +905,8 @@ mod tests {
        Ok(())
    }

-    #[test]
-    fn random_data() -> Result<()> {
+    #[tokio::test]
+    async fn random_data() -> Result<()> {
        // Generate random keys with exponential distribution, to
        // exercise the prefix compression
        const NUM_KEYS: usize = 100000;
@@ -960,19 +933,23 @@ mod tests {
        // Test get() operation on all the keys
        for (&key, &val) in all_data.iter() {
            let search_key = u128::to_be_bytes(key);
-            assert_eq!(reader.get(&search_key)?, Some(val));
+            assert_eq!(reader.get(&search_key).await?, Some(val));
        }

        // Test get() operations on random keys, most of which will not exist
        for _ in 0..100000 {
            let key_int = rand::thread_rng().gen::<u128>();
            let search_key = u128::to_be_bytes(key_int);
-            assert!(reader.get(&search_key)? == all_data.get(&key_int).cloned());
+            assert!(reader.get(&search_key).await? == all_data.get(&key_int).cloned());
        }

        // Test boundary cases
-        assert!(reader.get(&u128::to_be_bytes(u128::MIN))? == all_data.get(&u128::MIN).cloned());
-        assert!(reader.get(&u128::to_be_bytes(u128::MAX))? == all_data.get(&u128::MAX).cloned());
+        assert!(
+            reader.get(&u128::to_be_bytes(u128::MIN)).await? == all_data.get(&u128::MIN).cloned()
+        );
+        assert!(
+            reader.get(&u128::to_be_bytes(u128::MAX)).await? == all_data.get(&u128::MAX).cloned()
+        );

        Ok(())
    }
@@ -1014,15 +991,17 @@ mod tests {

        // Test get() operation on all the keys
        for (key, val) in disk_btree_test_data::TEST_DATA {
-            assert_eq!(reader.get(&key)?, Some(val));
+            assert_eq!(reader.get(&key).await?, Some(val));
        }

        // Test full scan
        let mut count = 0;
-        reader.visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
-            count += 1;
-            true
-        })?;
+        reader
+            .visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
+                count += 1;
+                true
+            })
+            .await?;
        assert_eq!(count, disk_btree_test_data::TEST_DATA.len());

        reader.dump().await?;
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -4,7 +4,7 @@
 use crate::config::PageServerConf;
 use crate::page_cache::{self, ReadBufResult, WriteBufResult, PAGE_SZ};
 use crate::tenant::blob_io::BlobWriter;
-use crate::tenant::block_io::BlockReader;
+use crate::tenant::block_io::{BlockLease, BlockReader};
 use crate::virtual_file::VirtualFile;
 use once_cell::sync::Lazy;
 use std::cmp::min;
@@ -266,11 +266,17 @@ impl Drop for EphemeralFile {
        // unlink the file
        let res = std::fs::remove_file(&self.file.path);
        if let Err(e) = res {
-            warn!(
-                "could not remove ephemeral file '{}': {}",
-                self.file.path.display(),
-                e
-            );
+            if e.kind() != std::io::ErrorKind::NotFound {
+                // just never log the not found errors, we cannot do anything for them; on detach
+                // the tenant directory is already gone.
+                //
+                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
+                error!(
+                    "could not remove ephemeral file '{}': {}",
+                    self.file.path.display(),
+                    e
+                );
+            }
        }
    }
 }
@@ -297,9 +303,7 @@ pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error>
 }

 impl BlockReader for EphemeralFile {
-    type BlockLease = page_cache::PageReadGuard<'static>;
-
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, io::Error> {
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
        // Look up the right page
        let cache = page_cache::get();
        loop {
@@ -307,7 +311,7 @@ impl BlockReader for EphemeralFile {
                .read_ephemeral_buf(self.file_id, blknum)
                .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
            {
-                ReadBufResult::Found(guard) => return Ok(guard),
+                ReadBufResult::Found(guard) => return Ok(guard.into()),
                ReadBufResult::NotFound(mut write_guard) => {
                    // Read the page from disk into the buffer
                    self.fill_buffer(write_guard.deref_mut(), blknum)?;
@@ -395,17 +399,26 @@ mod tests {
        Ok(())
    }

-    #[test]
-    fn test_ephemeral_blobs() -> Result<(), io::Error> {
+    #[tokio::test]
+    async fn test_ephemeral_blobs() -> Result<(), io::Error> {
        let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;

        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;

        let pos_foo = file.write_blob(b"foo")?;
-        assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
+        assert_eq!(
+            b"foo",
+            file.block_cursor().read_blob(pos_foo).await?.as_slice()
+        );
        let pos_bar = file.write_blob(b"bar")?;
-        assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
-        assert_eq!(b"bar", file.block_cursor().read_blob(pos_bar)?.as_slice());
+        assert_eq!(
+            b"foo",
+            file.block_cursor().read_blob(pos_foo).await?.as_slice()
+        );
+        assert_eq!(
+            b"bar",
+            file.block_cursor().read_blob(pos_bar).await?.as_slice()
+        );

        let mut blobs = Vec::new();
        for i in 0..10000 {
@@ -422,7 +435,7 @@ mod tests {

        let cursor = BlockCursor::new(&file);
        for (pos, expected) in blobs {
-            let actual = cursor.read_blob(pos)?;
+            let actual = cursor.read_blob(pos).await?;
            assert_eq!(actual, expected);
        }

@@ -431,7 +444,7 @@ mod tests {
        large_data.resize(20000, 0);
        thread_rng().fill_bytes(&mut large_data);
        let pos_large = file.write_blob(&large_data)?;
-        let result = file.block_cursor().read_blob(pos_large)?;
+        let result = file.block_cursor().read_blob(pos_large).await?;
        assert_eq!(result, large_data);

        Ok(())
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -121,7 +121,7 @@ impl BatchedUpdates<'_> {
    ///
    /// This should be called when the corresponding file on disk has been deleted.
    ///
-    pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc) {
+    pub fn remove_historic(&mut self, layer_desc: &PersistentLayerDesc) {
        self.layer_map.remove_historic_noflush(layer_desc)
    }

@@ -253,11 +253,11 @@ impl LayerMap {
    ///
    /// Helper function for BatchedUpdates::remove_historic
    ///
-    pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
+    pub fn remove_historic_noflush(&mut self, layer_desc: &PersistentLayerDesc) {
        self.historic
-            .remove(historic_layer_coverage::LayerKey::from(&layer_desc));
+            .remove(historic_layer_coverage::LayerKey::from(layer_desc));
        let layer_key = layer_desc.key();
-        if Self::is_l0(&layer_desc) {
+        if Self::is_l0(layer_desc) {
            let len_before = self.l0_delta_layers.len();
            let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
            l0_delta_layers.retain(|other| other.key() != layer_key);
@@ -766,8 +766,7 @@ mod tests {
                expected_in_counts
            );

-            map.batch_update()
-                .remove_historic(downloaded.layer_desc().clone());
+            map.batch_update().remove_historic(downloaded.layer_desc());
            assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
        }

--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -20,17 +20,19 @@ use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
+use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};

 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};

+use super::delete::DeleteTenantError;
 use super::timeline::delete::DeleteTimelineFlow;

 /// The tenants known to the pageserver.
 /// The enum variants are used to distinguish the different states that the pageserver can be in.
-enum TenantsMap {
+pub(crate) enum TenantsMap {
    /// [`init_tenant_mgr`] is not done yet.
    Initializing,
    /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
@@ -42,13 +44,13 @@ enum TenantsMap {
 }

 impl TenantsMap {
-    fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
+    pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id),
        }
    }
-    fn remove(&mut self, tenant_id: &TenantId) -> Option<Arc<Tenant>> {
+    pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option<Arc<Tenant>> {
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id),
@@ -97,7 +99,9 @@ pub async fn init_tenant_mgr(
                        );
                    }
                } else {
-                    // This case happens if we crash during attach before creating the attach marker file
+                    // This case happens if we:
+                    // * crash during attach before creating the attach marker file
+                    // * crash during tenant delete before removing tenant directory
                    let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
                        format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
                    })?;
@@ -124,6 +128,7 @@ pub async fn init_tenant_mgr(
                        broker_client.clone(),
                        remote_storage.clone(),
                        Some(init_order.clone()),
+                        &TENANTS,
                        &ctx,
                    ) {
                        Ok(tenant) => {
@@ -154,12 +159,13 @@ pub async fn init_tenant_mgr(
    Ok(())
 }

-pub fn schedule_local_tenant_processing(
+pub(crate) fn schedule_local_tenant_processing(
    conf: &'static PageServerConf,
    tenant_path: &Path,
    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
    init_order: Option<InitializationOrder>,
+    tenants: &'static tokio::sync::RwLock<TenantsMap>,
    ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
    anyhow::ensure!(
@@ -219,6 +225,7 @@ pub fn schedule_local_tenant_processing(
            broker_client,
            remote_storage,
            init_order,
+            tenants,
            ctx,
        )
    };
@@ -266,71 +273,77 @@ async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
        }
    };

+    let started_at = std::time::Instant::now();
    let mut join_set = JoinSet::new();
    for (tenant_id, tenant) in tenants_to_shut_down {
        join_set.spawn(
            async move {
-                // ordering shouldn't matter for this, either we store true right away or never
-                let ordering = std::sync::atomic::Ordering::Relaxed;
-                let joined_other = std::sync::atomic::AtomicBool::new(false);
+                let freeze_and_flush = true;

-                let mut shutdown = std::pin::pin!(async {
-                    let freeze_and_flush = true;
-
-                    let res = {
-                        let (_guard, shutdown_progress) = completion::channel();
-                        tenant.shutdown(shutdown_progress, freeze_and_flush).await
-                    };
-
-                    if let Err(other_progress) = res {
-                        // join the another shutdown in progress
-                        joined_other.store(true, ordering);
-                        other_progress.wait().await;
-                    }
-                });
-
-                // in practice we might not have a lot time to go, since systemd is going to
-                // SIGKILL us at 10s, but we can try. delete tenant might take a while, so put out
-                // a warning.
-                let warning = std::time::Duration::from_secs(5);
-                let mut warning = std::pin::pin!(tokio::time::sleep(warning));
-
-                tokio::select! {
-                    _ = &mut shutdown => {},
-                    _ = &mut warning => {
-                        let joined_other = joined_other.load(ordering);
-                        warn!(%joined_other, "waiting for the shutdown to complete");
-                        shutdown.await;
-                    }
+                let res = {
+                    let (_guard, shutdown_progress) = completion::channel();
+                    tenant.shutdown(shutdown_progress, freeze_and_flush).await
                };

+                if let Err(other_progress) = res {
+                    // join the another shutdown in progress
+                    other_progress.wait().await;
+                }
+
+                // we cannot afford per tenant logging here, because if s3 is degraded, we are
+                // going to log too many lines
+
                debug!("tenant successfully stopped");
            }
            .instrument(info_span!("shutdown", %tenant_id)),
        );
    }

+    let total = join_set.len();
    let mut panicked = 0;
+    let mut buffering = true;
+    const BUFFER_FOR: std::time::Duration = std::time::Duration::from_millis(500);
+    let mut buffered = std::pin::pin!(tokio::time::sleep(BUFFER_FOR));

-    while let Some(res) = join_set.join_next().await {
-        match res {
-            Ok(()) => {}
-            Err(join_error) if join_error.is_cancelled() => {
-                unreachable!("we are not cancelling any of the futures");
-            }
-            Err(join_error) if join_error.is_panic() => {
-                // cannot really do anything, as this panic is likely a bug
-                panicked += 1;
-            }
-            Err(join_error) => {
-                warn!("unknown kind of JoinError: {join_error}");
+    while !join_set.is_empty() {
+        tokio::select! {
+            Some(joined) = join_set.join_next() => {
+                match joined {
+                    Ok(()) => {}
+                    Err(join_error) if join_error.is_cancelled() => {
+                        unreachable!("we are not cancelling any of the futures");
+                    }
+                    Err(join_error) if join_error.is_panic() => {
+                        // cannot really do anything, as this panic is likely a bug
+                        panicked += 1;
+                    }
+                    Err(join_error) => {
+                        warn!("unknown kind of JoinError: {join_error}");
+                    }
+                }
+                if !buffering {
+                    // buffer so that every 500ms since the first update (or starting) we'll log
+                    // how far away we are; this is because we will get SIGKILL'd at 10s, and we
+                    // are not able to log *then*.
+                    buffering = true;
+                    buffered.as_mut().reset(tokio::time::Instant::now() + BUFFER_FOR);
+                }
+            },
+            _ = &mut buffered, if buffering => {
+                buffering = false;
+                info!(remaining = join_set.len(), total, elapsed_ms = started_at.elapsed().as_millis(), "waiting for tenants to shutdown");
            }
        }
    }

    if panicked > 0 {
-        warn!(panicked, "observed panicks while shutting down tenants");
+        warn!(
+            panicked,
+            total, "observed panicks while shutting down tenants"
+        );
    }
+
+    // caller will log how long we took
 }

 pub async fn create_tenant(
@@ -350,7 +363,7 @@ pub async fn create_tenant(
        //       See https://github.com/neondatabase/neon/issues/4233

        let created_tenant =
-            schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?;
+            schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

@@ -411,6 +424,14 @@ pub async fn get_tenant(
    }
 }

+pub async fn delete_tenant(
+    conf: &'static PageServerConf,
+    remote_storage: Option<GenericRemoteStorage>,
+    tenant_id: TenantId,
+) -> Result<(), DeleteTenantError> {
+    DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant_id).await
+}
+
 #[derive(Debug, thiserror::Error)]
 pub enum DeleteTimelineError {
    #[error("Tenant {0}")]
@@ -426,7 +447,7 @@ pub async fn delete_timeline(
    _ctx: &RequestContext,
 ) -> Result<(), DeleteTimelineError> {
    let tenant = get_tenant(tenant_id, true).await?;
-    DeleteTimelineFlow::run(&tenant, timeline_id).await?;
+    DeleteTimelineFlow::run(&tenant, timeline_id, false).await?;
    Ok(())
 }

@@ -501,7 +522,7 @@ pub async fn load_tenant(
                .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
        }

-        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx)
+        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, &TENANTS, ctx)
            .with_context(|| {
                format!("Failed to schedule tenant processing in path {tenant_path:?}")
            })?;
@@ -582,7 +603,7 @@ pub async fn attach_tenant(
            .context("check for attach marker file existence")?;
        anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");

-        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?;
+        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -211,6 +211,9 @@ use chrono::{NaiveDateTime, Utc};
 // re-export these
 pub use download::{is_temp_download_file, list_remote_timelines};
 use scopeguard::ScopeGuard;
+use utils::backoff::{
+    self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
+};

 use std::collections::{HashMap, VecDeque};
 use std::path::Path;
@@ -241,7 +244,6 @@ use crate::{
    tenant::upload_queue::{
        UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
    },
-    {exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS},
 };

 use utils::id::{TenantId, TimelineId};
@@ -256,12 +258,12 @@ use super::upload_queue::SetDeletedFlagProgress;
 // But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
 // level instead, as repeated failures can mean a more serious problem. If it
 // fails more than FAILED_DOWNLOAD_RETRIES times, we give up
-const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
-const FAILED_DOWNLOAD_RETRIES: u32 = 10;
+pub(crate) const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
+pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10;

 // Similarly log failed uploads and deletions at WARN level, after this many
 // retries. Uploads and deletions are retried forever, though.
-const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
+pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;

 pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
@@ -752,12 +754,24 @@ impl RemoteTimelineClient {

        pausable_failpoint!("persist_deleted_index_part");

-        upload::upload_index_part(
-            self.conf,
-            &self.storage_impl,
-            &self.tenant_id,
-            &self.timeline_id,
-            &index_part_with_deleted_at,
+        backoff::retry(
+            || async {
+                upload::upload_index_part(
+                    self.conf,
+                    &self.storage_impl,
+                    &self.tenant_id,
+                    &self.timeline_id,
+                    &index_part_with_deleted_at,
+                )
+                .await
+            },
+            |_e| false,
+            1,
+            // have just a couple of attempts
+            // when executed as part of timeline deletion this happens in context of api call
+            // when executed as part of tenant deletion this happens in the background
+            2,
+            "persist_index_part_with_deleted_flag",
        )
        .await?;

@@ -834,10 +848,19 @@ impl RemoteTimelineClient {
        let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
        let timeline_storage_path = self.conf.remote_path(&timeline_path)?;

-        let remaining = self
-            .storage_impl
-            .list_prefixes(Some(&timeline_storage_path))
-            .await?;
+        let remaining = backoff::retry(
+            || async {
+                self.storage_impl
+                    .list_prefixes(Some(&timeline_storage_path))
+                    .await
+            },
+            |_e| false,
+            FAILED_DOWNLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "list_prefixes",
+        )
+        .await
+        .context("list prefixes")?;

        let remaining: Vec<RemotePath> = remaining
            .into_iter()
@@ -852,7 +875,15 @@ impl RemoteTimelineClient {
            .collect();

        if !remaining.is_empty() {
-            self.storage_impl.delete_objects(&remaining).await?;
+            backoff::retry(
+                || async { self.storage_impl.delete_objects(&remaining).await },
+                |_e| false,
+                FAILED_UPLOAD_WARN_THRESHOLD,
+                FAILED_REMOTE_OP_RETRIES,
+                "delete_objects",
+            )
+            .await
+            .context("delete_objects")?;
        }

        fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -864,7 +895,16 @@ impl RemoteTimelineClient {
        let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));

        debug!("deleting index part");
-        self.storage_impl.delete(&index_file_path).await?;
+
+        backoff::retry(
+            || async { self.storage_impl.delete(&index_file_path).await },
+            |_e| false,
+            FAILED_UPLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "delete_index",
+        )
+        .await
+        .context("delete_index")?;

        fail::fail_point!("timeline-delete-after-index-delete", |_| {
            Err(anyhow::anyhow!(
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -11,23 +11,17 @@ use std::time::Duration;
 use anyhow::{anyhow, Context};
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
-
-use tracing::{info, warn};
+use utils::{backoff, crashsafe};

 use crate::config::PageServerConf;
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};

 use super::index::{IndexPart, LayerFileMetadata};
-use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
-
-async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
-    fs::File::open(path).await?.sync_all().await
-}
+use super::{FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};

 static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);

@@ -152,7 +146,7 @@ pub async fn download_layer_file<'a>(
        })
        .map_err(DownloadError::Other)?;

-    fsync_path(&local_path)
+    crashsafe::fsync_async(&local_path)
        .await
        .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
        .map_err(DownloadError::Other)?;
@@ -268,7 +262,6 @@ pub(super) async fn download_index_part(
    Ok(index_part)
 }

-///
 /// Helper function to handle retries for a download operation.
 ///
 /// Remote operations can fail due to rate limits (IAM, S3), spurious network
@@ -276,47 +269,17 @@ pub(super) async fn download_index_part(
 /// with backoff.
 ///
 /// (See similar logic for uploads in `perform_upload_task`)
-async fn download_retry<T, O, F>(mut op: O, description: &str) -> Result<T, DownloadError>
+async fn download_retry<T, O, F>(op: O, description: &str) -> Result<T, DownloadError>
 where
    O: FnMut() -> F,
    F: Future<Output = Result<T, DownloadError>>,
 {
-    let mut attempts = 0;
-    loop {
-        let result = op().await;
-        match result {
-            Ok(_) => {
-                if attempts > 0 {
-                    info!("{description} succeeded after {attempts} retries");
-                }
-                return result;
-            }
-
-            // These are "permanent" errors that should not be retried.
-            Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => {
-                return result;
-            }
-            // Assume that any other failure might be transient, and the operation might
-            // succeed if we just keep trying.
-            Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => {
-                info!("{description} failed, will retry (attempt {attempts}): {err:#}");
-            }
-            Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => {
-                warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
-            }
-            Err(DownloadError::Other(ref err)) => {
-                // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
-                warn!("{description} still failed after {attempts} retries, giving up: {err:?}");
-                return result;
-            }
-        }
-        // sleep and retry
-        exponential_backoff(
-            attempts,
-            DEFAULT_BASE_BACKOFF_SECONDS,
-            DEFAULT_MAX_BACKOFF_SECONDS,
-        )
-        .await;
-        attempts += 1;
-    }
+    backoff::retry(
+        op,
+        |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
+        FAILED_DOWNLOAD_WARN_THRESHOLD,
+        FAILED_REMOTE_OP_RETRIES,
+        description,
+    )
+    .await
 }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -8,7 +8,7 @@ mod layer_desc;
 mod remote_layer;

 use crate::config::PageServerConf;
-use crate::context::RequestContext;
+use crate::context::{AccessStatsBehavior, RequestContext};
 use crate::repository::Key;
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
@@ -241,10 +241,14 @@ impl LayerAccessStats {
        });
    }

-    fn record_access(&self, access_kind: LayerAccessKind, task_kind: TaskKind) {
+    fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) {
+        if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
+            return;
+        }
+
        let this_access = LayerAccessStatFullDetails {
            when: SystemTime::now(),
-            task_kind,
+            task_kind: ctx.task_kind(),
            access_kind,
        };

@@ -252,7 +256,7 @@ impl LayerAccessStats {
        locked.iter_mut().for_each(|inner| {
            inner.first_access.get_or_insert(this_access);
            inner.count_by_access_kind[access_kind] += 1;
-            inner.task_kind_flag |= task_kind;
+            inner.task_kind_flag |= ctx.task_kind();
            inner.last_accesses.write(this_access);
        })
    }
@@ -401,16 +405,6 @@ pub trait AsLayerDesc {
 /// An image layer is a snapshot of all the data in a key-range, at a single
 /// LSN.
 pub trait PersistentLayer: Layer + AsLayerDesc {
-    /// Identify the tenant this layer belongs to
-    fn get_tenant_id(&self) -> TenantId {
-        self.layer_desc().tenant_id
-    }
-
-    /// Identify the timeline this layer belongs to
-    fn get_timeline_id(&self) -> TimelineId {
-        self.layer_desc().timeline_id
-    }
-
    /// File name used for this layer, both in the pageserver's local filesystem
    /// state as well as in the remote storage.
    fn filename(&self) -> LayerFileName {
@@ -436,14 +430,6 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
        false
    }

-    /// Returns None if the layer file size is not known.
-    ///
-    /// Should not change over the lifetime of the layer object because
-    /// current_physical_size is computed as the som of this value.
-    fn file_size(&self) -> u64 {
-        self.layer_desc().file_size
-    }
-
    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;

    fn access_stats(&self) -> &LayerAccessStats;
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -29,10 +29,10 @@
 //!
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
-use crate::page_cache::{PageReadGuard, PAGE_SZ};
+use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
-use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
+use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
    PersistentLayer, ValueReconstructResult, ValueReconstructState,
@@ -41,7 +41,6 @@ use crate::virtual_file::VirtualFile;
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
-use once_cell::sync::OnceCell;
 use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
@@ -52,6 +51,8 @@ use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
+use tokio::runtime::Handle;
+use tokio::sync::OnceCell;
 use tracing::*;

 use utils::{
@@ -90,14 +91,30 @@ pub struct Summary {

 impl From<&DeltaLayer> for Summary {
    fn from(layer: &DeltaLayer) -> Self {
+        Self::expected(
+            layer.desc.tenant_id,
+            layer.desc.timeline_id,
+            layer.desc.key_range.clone(),
+            layer.desc.lsn_range.clone(),
+        )
+    }
+}
+
+impl Summary {
+    pub(super) fn expected(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        keys: Range<Key>,
+        lsns: Range<Lsn>,
+    ) -> Self {
        Self {
            magic: DELTA_FILE_MAGIC,
            format_version: STORAGE_FORMAT_VERSION,

-            tenant_id: layer.desc.tenant_id,
-            timeline_id: layer.desc.timeline_id,
-            key_range: layer.desc.key_range.clone(),
-            lsn_range: layer.desc.lsn_range.clone(),
+            tenant_id,
+            timeline_id,
+            key_range: keys,
+            lsn_range: lsns,

            index_start_blk: 0,
            index_root_blk: 0,
@@ -108,12 +125,10 @@ impl From<&DeltaLayer> for Summary {
 // Flag indicating that this version initialize the page
 const WILL_INIT: u64 = 1;

-///
 /// Struct representing reference to BLOB in layers. Reference contains BLOB
 /// offset, and for WAL records it also contains `will_init` flag. The flag
 /// helps to determine the range of records that needs to be applied, without
 /// reading/deserializing records themselves.
-///
 #[derive(Debug, Serialize, Deserialize, Copy, Clone)]
 pub struct BlobRef(pub u64);

@@ -138,10 +153,8 @@ impl BlobRef {
 pub const DELTA_KEY_SIZE: usize = KEY_SIZE + 8;
 struct DeltaKey([u8; DELTA_KEY_SIZE]);

-///
 /// This is the key of the B-tree index stored in the delta layer. It consists
 /// of the serialized representation of a Key and LSN.
-///
 impl DeltaKey {
    fn from_slice(buf: &[u8]) -> Self {
        let mut bytes: [u8; DELTA_KEY_SIZE] = [0u8; DELTA_KEY_SIZE];
@@ -214,6 +227,12 @@ pub struct DeltaLayerInner {
    file: FileBlockReader<VirtualFile>,
 }

+impl AsRef<DeltaLayerInner> for DeltaLayerInner {
+    fn as_ref(&self) -> &DeltaLayerInner {
+        self
+    }
+}
+
 impl std::fmt::Debug for DeltaLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DeltaLayerInner")
@@ -242,7 +261,7 @@ impl Layer for DeltaLayer {
            return Ok(());
        }

-        let inner = self.load(LayerAccessKind::Dump, ctx)?;
+        let inner = self.load(LayerAccessKind::Dump, ctx).await?;

        println!(
            "index_start_blk: {}, root {}",
@@ -262,7 +281,8 @@ impl Layer for DeltaLayer {

        // A subroutine to dump a single blob
        let dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
-            let buf = cursor.read_blob(blob_ref.pos())?;
+            // TODO this is not ideal, but on the other hand we are in dumping code...
+            let buf = Handle::current().block_on(cursor.read_blob(blob_ref.pos()))?;
            let val = Value::des(&buf)?;
            let desc = match val {
                Value::Image(img) => {
@@ -281,22 +301,24 @@ impl Layer for DeltaLayer {
            Ok(desc)
        };

-        tree_reader.visit(
-            &[0u8; DELTA_KEY_SIZE],
-            VisitDirection::Forwards,
-            |delta_key, val| {
-                let blob_ref = BlobRef(val);
-                let key = DeltaKey::extract_key_from_buf(delta_key);
-                let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
+        tree_reader
+            .visit(
+                &[0u8; DELTA_KEY_SIZE],
+                VisitDirection::Forwards,
+                |delta_key, val| {
+                    let blob_ref = BlobRef(val);
+                    let key = DeltaKey::extract_key_from_buf(delta_key);
+                    let lsn = DeltaKey::extract_lsn_from_buf(delta_key);

-                let desc = match dump_blob(blob_ref) {
-                    Ok(desc) => desc,
-                    Err(err) => format!("ERROR: {}", err),
-                };
-                println!("  key {} at {}: {}", key, lsn, desc);
-                true
-            },
-        )?;
+                    let desc = match dump_blob(blob_ref) {
+                        Ok(desc) => desc,
+                        Err(err) => format!("ERROR: {}", err),
+                    };
+                    println!("  key {} at {}: {}", key, lsn, desc);
+                    true
+                },
+            )
+            .await?;

        Ok(())
    }
@@ -309,82 +331,15 @@ impl Layer for DeltaLayer {
        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
        ensure!(lsn_range.start >= self.desc.lsn_range.start);
-        let mut need_image = true;

        ensure!(self.desc.key_range.contains(&key));

-        {
-            // Open the file and lock the metadata in memory
-            let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
-
-            // Scan the page versions backwards, starting from `lsn`.
-            let file = &inner.file;
-            let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-                inner.index_start_blk,
-                inner.index_root_blk,
-                file,
-            );
-            let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
-
-            let mut offsets: Vec<(Lsn, u64)> = Vec::new();
-
-            tree_reader.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
-                let blob_ref = BlobRef(value);
-                if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
-                    return false;
-                }
-                let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
-                if entry_lsn < lsn_range.start {
-                    return false;
-                }
-                offsets.push((entry_lsn, blob_ref.pos()));
-
-                !blob_ref.will_init()
-            })?;
-
-            // Ok, 'offsets' now contains the offsets of all the entries we need to read
-            let cursor = file.block_cursor();
-            let mut buf = Vec::new();
-            for (entry_lsn, pos) in offsets {
-                cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
-                    format!(
-                        "Failed to read blob from virtual file {}",
-                        file.file.path.display()
-                    )
-                })?;
-                let val = Value::des(&buf).with_context(|| {
-                    format!(
-                        "Failed to deserialize file blob from virtual file {}",
-                        file.file.path.display()
-                    )
-                })?;
-                match val {
-                    Value::Image(img) => {
-                        reconstruct_state.img = Some((entry_lsn, img));
-                        need_image = false;
-                        break;
-                    }
-                    Value::WalRecord(rec) => {
-                        let will_init = rec.will_init();
-                        reconstruct_state.records.push((entry_lsn, rec));
-                        if will_init {
-                            // This WAL record initializes the page, so no need to go further back
-                            need_image = false;
-                            break;
-                        }
-                    }
-                }
-            }
-            // release metadata lock and close the file
-        }
-
-        // If an older page image is needed to reconstruct the page, let the
-        // caller know.
-        if need_image {
-            Ok(ValueReconstructResult::Continue)
-        } else {
-            Ok(ValueReconstructResult::Complete)
-        }
+        let inner = self
+            .load(LayerAccessKind::GetValueReconstructData, ctx)
+            .await?;
+        inner
+            .get_value_reconstruct_data(key, lsn_range, reconstruct_state)
+            .await
    }

    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
@@ -493,59 +448,43 @@ impl DeltaLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    fn load(
+    async fn load(
        &self,
        access_kind: LayerAccessKind,
        ctx: &RequestContext,
    ) -> Result<&Arc<DeltaLayerInner>> {
-        self.access_stats
-            .record_access(access_kind, ctx.task_kind());
+        self.access_stats.record_access(access_kind, ctx);
        // Quick exit if already loaded
        self.inner
            .get_or_try_init(|| self.load_inner())
+            .await
            .with_context(|| format!("Failed to load delta layer {}", self.path().display()))
    }

-    fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
+    async fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
        let path = self.path();

-        let file = VirtualFile::open(&path)
-            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
-        let file = FileBlockReader::new(file);
+        let summary = match &self.path_or_conf {
+            PathOrConf::Conf(_) => Some(Summary::from(self)),
+            PathOrConf::Path(_) => None,
+        };

-        let summary_blk = file.read_blk(0)?;
-        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+        let loaded = DeltaLayerInner::load(&path, summary)?;

-        match &self.path_or_conf {
-            PathOrConf::Conf(_) => {
-                let mut expected_summary = Summary::from(self);
-                expected_summary.index_start_blk = actual_summary.index_start_blk;
-                expected_summary.index_root_blk = actual_summary.index_root_blk;
-                if actual_summary != expected_summary {
-                    bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
-                }
-            }
-            PathOrConf::Path(path) => {
-                let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
-                let expected_filename = self.filename().file_name();
+        if let PathOrConf::Path(ref path) = self.path_or_conf {
+            // not production code

-                if actual_filename != expected_filename {
-                    println!(
-                        "warning: filename does not match what is expected from in-file summary"
-                    );
-                    println!("actual: {:?}", actual_filename);
-                    println!("expected: {:?}", expected_filename);
-                }
+            let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
+            let expected_filename = self.filename().file_name();
+
+            if actual_filename != expected_filename {
+                println!("warning: filename does not match what is expected from in-file summary");
+                println!("actual: {:?}", actual_filename);
+                println!("expected: {:?}", expected_filename);
            }
        }

-        debug!("loaded from {}", &path.display());
-
-        Ok(Arc::new(DeltaLayerInner {
-            file,
-            index_start_blk: actual_summary.index_start_blk,
-            index_root_blk: actual_summary.index_root_blk,
-        }))
+        Ok(Arc::new(loaded))
    }

    /// Create a DeltaLayer struct representing an existing file on disk.
@@ -567,7 +506,7 @@ impl DeltaLayer {
                file_size,
            ),
            access_stats,
-            inner: once_cell::sync::OnceCell::new(),
+            inner: OnceCell::new(),
        }
    }

@@ -594,7 +533,7 @@ impl DeltaLayer {
                metadata.len(),
            ),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: once_cell::sync::OnceCell::new(),
+            inner: OnceCell::new(),
        })
    }

@@ -614,19 +553,28 @@ impl DeltaLayer {
    /// Obtains all keys and value references stored in the layer
    ///
    /// The value can be obtained via the [`ValueRef::load`] function.
-    pub fn load_val_refs(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, ValueRef)>> {
+    pub async fn load_val_refs(
+        &self,
+        ctx: &RequestContext,
+    ) -> Result<Vec<(Key, Lsn, ValueRef<Arc<DeltaLayerInner>>)>> {
        let inner = self
-            .load(LayerAccessKind::KeyIter, ctx)
+            .load(LayerAccessKind::Iter, ctx)
+            .await
            .context("load delta layer")?;
-        DeltaLayerInner::load_val_refs(inner).context("Layer index is corrupted")
+        DeltaLayerInner::load_val_refs(inner)
+            .await
+            .context("Layer index is corrupted")
    }

    /// Loads all keys stored in the layer. Returns key, lsn and value size.
-    pub fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, u64)>> {
+    pub async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, u64)>> {
        let inner = self
            .load(LayerAccessKind::KeyIter, ctx)
+            .await
            .context("load delta layer keys")?;
-        inner.load_keys().context("Layer index is corrupted")
+        DeltaLayerInner::load_keys(inner)
+            .await
+            .context("Layer index is corrupted")
    }
 }

@@ -776,7 +724,7 @@ impl DeltaLayerWriterInner {
                metadata.len(),
            ),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: once_cell::sync::OnceCell::new(),
+            inner: OnceCell::new(),
        };

        // fsync the file
@@ -899,32 +847,143 @@ impl Drop for DeltaLayerWriter {
 }

 impl DeltaLayerInner {
-    fn load_val_refs(this: &Arc<DeltaLayerInner>) -> Result<Vec<(Key, Lsn, ValueRef)>> {
-        let file = &this.file;
+    pub(super) fn load(path: &std::path::Path, summary: Option<Summary>) -> anyhow::Result<Self> {
+        let file = VirtualFile::open(path)
+            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
+        let file = FileBlockReader::new(file);
+
+        let summary_blk = file.read_blk(0)?;
+        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+
+        if let Some(mut expected_summary) = summary {
+            // production code path
+            expected_summary.index_start_blk = actual_summary.index_start_blk;
+            expected_summary.index_root_blk = actual_summary.index_root_blk;
+            if actual_summary != expected_summary {
+                bail!(
+                    "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
+                    actual_summary,
+                    expected_summary
+                );
+            }
+        }
+
+        Ok(DeltaLayerInner {
+            file,
+            index_start_blk: actual_summary.index_start_blk,
+            index_root_blk: actual_summary.index_root_blk,
+        })
+    }
+
+    pub(super) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        lsn_range: Range<Lsn>,
+        reconstruct_state: &mut ValueReconstructState,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        let mut need_image = true;
+        // Scan the page versions backwards, starting from `lsn`.
+        let file = &self.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            this.index_start_blk,
-            this.index_root_blk,
+            self.index_start_blk,
+            self.index_root_blk,
            file,
        );
+        let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));

-        let mut all_offsets = Vec::<(Key, Lsn, ValueRef)>::new();
-        tree_reader.visit(
-            &[0u8; DELTA_KEY_SIZE],
-            VisitDirection::Forwards,
-            |key, value| {
-                let delta_key = DeltaKey::from_slice(key);
-                let val_ref = ValueRef {
-                    blob_ref: BlobRef(value),
-                    reader: BlockCursor::new(Adapter(this.clone())),
-                };
-                all_offsets.push((delta_key.key(), delta_key.lsn(), val_ref));
-                true
-            },
-        )?;
+        let mut offsets: Vec<(Lsn, u64)> = Vec::new();
+
+        tree_reader
+            .visit(&search_key.0, VisitDirection::Backwards, |key, value| {
+                let blob_ref = BlobRef(value);
+                if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
+                    return false;
+                }
+                let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
+                if entry_lsn < lsn_range.start {
+                    return false;
+                }
+                offsets.push((entry_lsn, blob_ref.pos()));
+
+                !blob_ref.will_init()
+            })
+            .await?;
+
+        // Ok, 'offsets' now contains the offsets of all the entries we need to read
+        let cursor = file.block_cursor();
+        let mut buf = Vec::new();
+        for (entry_lsn, pos) in offsets {
+            cursor
+                .read_blob_into_buf(pos, &mut buf)
+                .await
+                .with_context(|| {
+                    format!(
+                        "Failed to read blob from virtual file {}",
+                        file.file.path.display()
+                    )
+                })?;
+            let val = Value::des(&buf).with_context(|| {
+                format!(
+                    "Failed to deserialize file blob from virtual file {}",
+                    file.file.path.display()
+                )
+            })?;
+            match val {
+                Value::Image(img) => {
+                    reconstruct_state.img = Some((entry_lsn, img));
+                    need_image = false;
+                    break;
+                }
+                Value::WalRecord(rec) => {
+                    let will_init = rec.will_init();
+                    reconstruct_state.records.push((entry_lsn, rec));
+                    if will_init {
+                        // This WAL record initializes the page, so no need to go further back
+                        need_image = false;
+                        break;
+                    }
+                }
+            }
+        }
+
+        // If an older page image is needed to reconstruct the page, let the
+        // caller know.
+        if need_image {
+            Ok(ValueReconstructResult::Continue)
+        } else {
+            Ok(ValueReconstructResult::Complete)
+        }
+    }
+
+    pub(super) async fn load_val_refs<T: AsRef<DeltaLayerInner> + Clone>(
+        this: &T,
+    ) -> Result<Vec<(Key, Lsn, ValueRef<T>)>> {
+        let dl = this.as_ref();
+        let file = &dl.file;
+        let tree_reader =
+            DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);
+
+        let mut all_offsets = Vec::<(Key, Lsn, ValueRef<T>)>::new();
+        tree_reader
+            .visit(
+                &[0u8; DELTA_KEY_SIZE],
+                VisitDirection::Forwards,
+                |key, value| {
+                    let delta_key = DeltaKey::from_slice(key);
+                    let val_ref = ValueRef {
+                        blob_ref: BlobRef(value),
+                        reader: BlockCursor::new(Adapter(this.clone())),
+                    };
+                    all_offsets.push((delta_key.key(), delta_key.lsn(), val_ref));
+                    true
+                },
+            )
+            .await?;

        Ok(all_offsets)
    }
-    fn load_keys(&self) -> Result<Vec<(Key, Lsn, u64)>> {
+
+    pub(super) async fn load_keys(&self) -> Result<Vec<(Key, Lsn, u64)>> {
        let file = &self.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
            self.index_start_blk,
@@ -933,26 +992,28 @@ impl DeltaLayerInner {
        );

        let mut all_keys: Vec<(Key, Lsn, u64)> = Vec::new();
-        tree_reader.visit(
-            &[0u8; DELTA_KEY_SIZE],
-            VisitDirection::Forwards,
-            |key, value| {
-                let delta_key = DeltaKey::from_slice(key);
-                let pos = BlobRef(value).pos();
-                if let Some(last) = all_keys.last_mut() {
-                    if last.0 == delta_key.key() {
-                        return true;
-                    } else {
-                        // subtract offset of new key BLOB and first blob of this key
-                        // to get total size if values associated with this key
-                        let first_pos = last.2;
-                        last.2 = pos - first_pos;
+        tree_reader
+            .visit(
+                &[0u8; DELTA_KEY_SIZE],
+                VisitDirection::Forwards,
+                |key, value| {
+                    let delta_key = DeltaKey::from_slice(key);
+                    let pos = BlobRef(value).pos();
+                    if let Some(last) = all_keys.last_mut() {
+                        if last.0 == delta_key.key() {
+                            return true;
+                        } else {
+                            // subtract offset of new key BLOB and first blob of this key
+                            // to get total size if values associated with this key
+                            let first_pos = last.2;
+                            last.2 = pos - first_pos;
+                        }
                    }
-                }
-                all_keys.push((delta_key.key(), delta_key.lsn(), pos));
-                true
-            },
-        )?;
+                    all_keys.push((delta_key.key(), delta_key.lsn(), pos));
+                    true
+                },
+            )
+            .await?;
        if let Some(last) = all_keys.last_mut() {
            // Last key occupies all space till end of layer
            last.2 = std::fs::metadata(&file.file.path)?.len() - last.2;
@@ -962,26 +1023,25 @@ impl DeltaLayerInner {
 }

 /// Reference to an on-disk value
-pub struct ValueRef {
+pub struct ValueRef<T: AsRef<DeltaLayerInner>> {
    blob_ref: BlobRef,
-    reader: BlockCursor<Adapter>,
+    reader: BlockCursor<Adapter<T>>,
 }

-impl ValueRef {
+impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
    /// Loads the value from disk
-    pub fn load(&self) -> Result<Value> {
-        let buf = self.reader.read_blob(self.blob_ref.pos())?;
+    pub async fn load(&self) -> Result<Value> {
+        // theoretically we *could* record an access time for each, but it does not really matter
+        let buf = self.reader.read_blob(self.blob_ref.pos()).await?;
        let val = Value::des(&buf)?;
        Ok(val)
    }
 }

-struct Adapter(Arc<DeltaLayerInner>);
+struct Adapter<T: AsRef<DeltaLayerInner>>(T);

-impl BlockReader for Adapter {
-    type BlockLease = PageReadGuard<'static>;
-
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
-        self.0.file.read_blk(blknum)
+impl<T: AsRef<DeltaLayerInner>> BlockReader for Adapter<T> {
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+        self.0.as_ref().file.read_blk(blknum)
    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -38,7 +38,6 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use hex;
-use once_cell::sync::OnceCell;
 use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
@@ -48,6 +47,7 @@ use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::path::{Path, PathBuf};
+use tokio::sync::OnceCell;
 use tracing::*;

 use utils::{
@@ -66,7 +66,7 @@ use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLay
 /// the 'index' starts at the block indicated by 'index_start_blk'
 ///
 #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
-struct Summary {
+pub(super) struct Summary {
    /// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
    magic: u16,
    format_version: u16,
@@ -85,13 +85,29 @@ struct Summary {

 impl From<&ImageLayer> for Summary {
    fn from(layer: &ImageLayer) -> Self {
+        Self::expected(
+            layer.desc.tenant_id,
+            layer.desc.timeline_id,
+            layer.desc.key_range.clone(),
+            layer.lsn,
+        )
+    }
+}
+
+impl Summary {
+    pub(super) fn expected(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        key_range: Range<Key>,
+        lsn: Lsn,
+    ) -> Self {
        Self {
            magic: IMAGE_FILE_MAGIC,
            format_version: STORAGE_FORMAT_VERSION,
-            tenant_id: layer.desc.tenant_id,
-            timeline_id: layer.desc.timeline_id,
-            key_range: layer.desc.key_range.clone(),
-            lsn: layer.lsn,
+            tenant_id,
+            timeline_id,
+            key_range,
+            lsn,

            index_start_blk: 0,
            index_root_blk: 0,
@@ -136,6 +152,8 @@ pub struct ImageLayerInner {
    index_start_blk: u32,
    index_root_blk: u32,

+    lsn: Lsn,
+
    /// Reader object for reading blocks from the file.
    file: FileBlockReader<VirtualFile>,
 }
@@ -168,17 +186,19 @@ impl Layer for ImageLayer {
            return Ok(());
        }

-        let inner = self.load(LayerAccessKind::Dump, ctx)?;
+        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
        let file = &inner.file;
        let tree_reader =
            DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);

        tree_reader.dump().await?;

-        tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
-            println!("key: {} offset {}", hex::encode(key), value);
-            true
-        })?;
+        tree_reader
+            .visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
+                println!("key: {} offset {}", hex::encode(key), value);
+                true
+            })
+            .await?;

        Ok(())
    }
@@ -195,28 +215,14 @@ impl Layer for ImageLayer {
        assert!(lsn_range.start >= self.lsn);
        assert!(lsn_range.end >= self.lsn);

-        let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
-
-        let file = &inner.file;
-        let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
-
-        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
-        key.write_to_byte_slice(&mut keybuf);
-        if let Some(offset) = tree_reader.get(&keybuf)? {
-            let blob = file.block_cursor().read_blob(offset).with_context(|| {
-                format!(
-                    "failed to read value from data file {} at offset {}",
-                    self.path().display(),
-                    offset
-                )
-            })?;
-            let value = Bytes::from(blob);
-
-            reconstruct_state.img = Some((self.lsn, value));
-            Ok(ValueReconstructResult::Complete)
-        } else {
-            Ok(ValueReconstructResult::Missing)
-        }
+        let inner = self
+            .load(LayerAccessKind::GetValueReconstructData, ctx)
+            .await?;
+        inner
+            .get_value_reconstruct_data(key, reconstruct_state)
+            .await
+            // FIXME: makes no sense to dump paths
+            .with_context(|| format!("read {}", self.path().display()))
    }

    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
@@ -312,58 +318,41 @@ impl ImageLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&ImageLayerInner> {
-        self.access_stats
-            .record_access(access_kind, ctx.task_kind());
-        loop {
-            if let Some(inner) = self.inner.get() {
-                return Ok(inner);
-            }
-            self.inner
-                .get_or_try_init(|| self.load_inner())
-                .with_context(|| format!("Failed to load image layer {}", self.path().display()))?;
-        }
+    async fn load(
+        &self,
+        access_kind: LayerAccessKind,
+        ctx: &RequestContext,
+    ) -> Result<&ImageLayerInner> {
+        self.access_stats.record_access(access_kind, ctx);
+        self.inner
+            .get_or_try_init(|| self.load_inner())
+            .await
+            .with_context(|| format!("Failed to load image layer {}", self.path().display()))
    }

-    fn load_inner(&self) -> Result<ImageLayerInner> {
+    async fn load_inner(&self) -> Result<ImageLayerInner> {
        let path = self.path();

-        // Open the file if it's not open already.
-        let file = VirtualFile::open(&path)
-            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
-        let file = FileBlockReader::new(file);
-        let summary_blk = file.read_blk(0)?;
-        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+        let expected_summary = match &self.path_or_conf {
+            PathOrConf::Conf(_) => Some(Summary::from(self)),
+            PathOrConf::Path(_) => None,
+        };

-        match &self.path_or_conf {
-            PathOrConf::Conf(_) => {
-                let mut expected_summary = Summary::from(self);
-                expected_summary.index_start_blk = actual_summary.index_start_blk;
-                expected_summary.index_root_blk = actual_summary.index_root_blk;
+        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary)?;

-                if actual_summary != expected_summary {
-                    bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
-                }
-            }
-            PathOrConf::Path(path) => {
-                let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
-                let expected_filename = self.filename().file_name();
+        if let PathOrConf::Path(ref path) = self.path_or_conf {
+            // not production code
+            let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
+            let expected_filename = self.filename().file_name();

-                if actual_filename != expected_filename {
-                    println!(
-                        "warning: filename does not match what is expected from in-file summary"
-                    );
-                    println!("actual: {:?}", actual_filename);
-                    println!("expected: {:?}", expected_filename);
-                }
+            if actual_filename != expected_filename {
+                println!("warning: filename does not match what is expected from in-file summary");
+                println!("actual: {:?}", actual_filename);
+                println!("expected: {:?}", expected_filename);
            }
        }

-        Ok(ImageLayerInner {
-            index_start_blk: actual_summary.index_start_blk,
-            index_root_blk: actual_summary.index_root_blk,
-            file,
-        })
+        Ok(loaded)
    }

    /// Create an ImageLayer struct representing an existing file on disk
@@ -433,6 +422,66 @@ impl ImageLayer {
    }
 }

+impl ImageLayerInner {
+    pub(super) fn load(
+        path: &std::path::Path,
+        lsn: Lsn,
+        summary: Option<Summary>,
+    ) -> anyhow::Result<Self> {
+        let file = VirtualFile::open(path)
+            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
+        let file = FileBlockReader::new(file);
+        let summary_blk = file.read_blk(0)?;
+        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+
+        if let Some(mut expected_summary) = summary {
+            // production code path
+            expected_summary.index_start_blk = actual_summary.index_start_blk;
+            expected_summary.index_root_blk = actual_summary.index_root_blk;
+
+            if actual_summary != expected_summary {
+                bail!(
+                    "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
+                    actual_summary,
+                    expected_summary
+                );
+            }
+        }
+
+        Ok(ImageLayerInner {
+            index_start_blk: actual_summary.index_start_blk,
+            index_root_blk: actual_summary.index_root_blk,
+            lsn,
+            file,
+        })
+    }
+
+    pub(super) async fn get_value_reconstruct_data(
+        &self,
+        key: Key,
+        reconstruct_state: &mut ValueReconstructState,
+    ) -> anyhow::Result<ValueReconstructResult> {
+        let file = &self.file;
+        let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
+
+        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
+        key.write_to_byte_slice(&mut keybuf);
+        if let Some(offset) = tree_reader.get(&keybuf).await? {
+            let blob = file
+                .block_cursor()
+                .read_blob(offset)
+                .await
+                .with_context(|| format!("failed to read value from offset {}", offset))?;
+            let value = Bytes::from(blob);
+
+            reconstruct_state.img = Some((self.lsn, value));
+            Ok(ValueReconstructResult::Complete)
+        } else {
+            Ok(ValueReconstructResult::Missing)
+        }
+    }
+}
+
 /// A builder object for constructing a new image layer.
 ///
 /// Usage:
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -16,6 +16,7 @@ use anyhow::{ensure, Result};
 use pageserver_api::models::InMemoryLayerInfo;
 use std::cell::RefCell;
 use std::collections::HashMap;
+use std::sync::OnceLock;
 use tracing::*;
 use utils::{
    bin_ser::BeSer,
@@ -27,7 +28,7 @@ use utils::{
 // while being able to use std::fmt::Write's methods
 use std::fmt::Write as _;
 use std::ops::Range;
-use std::sync::RwLock;
+use tokio::sync::RwLock;

 use super::{DeltaLayer, DeltaLayerWriter, Layer};

@@ -42,14 +43,16 @@ pub struct InMemoryLayer {
    tenant_id: TenantId,
    timeline_id: TimelineId,

-    ///
    /// This layer contains all the changes from 'start_lsn'. The
    /// start is inclusive.
-    ///
    start_lsn: Lsn,

-    /// The above fields never change. The parts that do change are in 'inner',
-    /// and protected by mutex.
+    /// Frozen layers have an exclusive end LSN.
+    /// Writes are only allowed when this is `None`.
+    end_lsn: OnceLock<Lsn>,
+
+    /// The above fields never change, except for `end_lsn`, which is only set once.
+    /// All other changing parts are in `inner`, and protected by a mutex.
    inner: RwLock<InMemoryLayerInner>,
 }

@@ -57,21 +60,16 @@ impl std::fmt::Debug for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("InMemoryLayer")
            .field("start_lsn", &self.start_lsn)
+            .field("end_lsn", &self.end_lsn)
            .field("inner", &self.inner)
            .finish()
    }
 }

 pub struct InMemoryLayerInner {
-    /// Frozen layers have an exclusive end LSN.
-    /// Writes are only allowed when this is None
-    end_lsn: Option<Lsn>,
-
-    ///
    /// All versions of all pages in the layer are kept here.  Indexed
    /// by block number and LSN. The value is an offset into the
    /// ephemeral file where the page version is stored.
-    ///
    index: HashMap<Key, VecMap<Lsn, u64>>,

    /// The values are stored in a serialized format in this file.
@@ -82,15 +80,7 @@ pub struct InMemoryLayerInner {

 impl std::fmt::Debug for InMemoryLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("InMemoryLayerInner")
-            .field("end_lsn", &self.end_lsn)
-            .finish()
-    }
-}
-
-impl InMemoryLayerInner {
-    fn assert_writeable(&self) {
-        assert!(self.end_lsn.is_none());
+        f.debug_struct("InMemoryLayerInner").finish()
    }
 }

@@ -101,13 +91,21 @@ impl InMemoryLayer {

    pub fn info(&self) -> InMemoryLayerInfo {
        let lsn_start = self.start_lsn;
-        let lsn_end = self.inner.read().unwrap().end_lsn;

-        match lsn_end {
-            Some(lsn_end) => InMemoryLayerInfo::Frozen { lsn_start, lsn_end },
-            None => InMemoryLayerInfo::Open { lsn_start },
+        if let Some(&lsn_end) = self.end_lsn.get() {
+            InMemoryLayerInfo::Frozen { lsn_start, lsn_end }
+        } else {
+            InMemoryLayerInfo::Open { lsn_start }
        }
    }
+
+    fn assert_writable(&self) {
+        assert!(self.end_lsn.get().is_none());
+    }
+
+    fn end_lsn_or_max(&self) -> Lsn {
+        self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
+    }
 }

 #[async_trait::async_trait]
@@ -117,14 +115,7 @@ impl Layer for InMemoryLayer {
    }

    fn get_lsn_range(&self) -> Range<Lsn> {
-        let inner = self.inner.read().unwrap();
-
-        let end_lsn = if let Some(end_lsn) = inner.end_lsn {
-            end_lsn
-        } else {
-            Lsn(u64::MAX)
-        };
-        self.start_lsn..end_lsn
+        self.start_lsn..self.end_lsn_or_max()
    }

    fn is_incremental(&self) -> bool {
@@ -134,13 +125,9 @@ impl Layer for InMemoryLayer {

    /// debugging function to print out the contents of the layer
    async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
-        let inner = self.inner.read().unwrap();
+        let inner = self.inner.read().await;

-        let end_str = inner
-            .end_lsn
-            .as_ref()
-            .map(Lsn::to_string)
-            .unwrap_or_default();
+        let end_str = self.end_lsn_or_max();

        println!(
            "----- in-memory layer for tli {} LSNs {}-{} ----",
@@ -156,7 +143,7 @@ impl Layer for InMemoryLayer {
        for (key, vec_map) in inner.index.iter() {
            for (lsn, pos) in vec_map.as_slice() {
                let mut desc = String::new();
-                cursor.read_blob_into_buf(*pos, &mut buf)?;
+                cursor.read_blob_into_buf(*pos, &mut buf).await?;
                let val = Value::des(&buf);
                match val {
                    Ok(Value::Image(img)) => {
@@ -194,7 +181,7 @@ impl Layer for InMemoryLayer {
        ensure!(lsn_range.start >= self.start_lsn);
        let mut need_image = true;

-        let inner = self.inner.read().unwrap();
+        let inner = self.inner.read().await;

        let reader = inner.file.block_cursor();

@@ -202,7 +189,7 @@ impl Layer for InMemoryLayer {
        if let Some(vec_map) = inner.index.get(&key) {
            let slice = vec_map.slice_range(lsn_range);
            for (entry_lsn, pos) in slice.iter().rev() {
-                let buf = reader.read_blob(*pos)?;
+                let buf = reader.read_blob(*pos).await?;
                let value = Value::des(&buf)?;
                match value {
                    Value::Image(img) => {
@@ -236,9 +223,7 @@ impl Layer for InMemoryLayer {

 impl std::fmt::Display for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let inner = self.inner.read().unwrap();
-
-        let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
+        let end_lsn = self.end_lsn_or_max();
        write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
    }
 }
@@ -247,8 +232,8 @@ impl InMemoryLayer {
    ///
    /// Get layer size on the disk
    ///
-    pub fn size(&self) -> Result<u64> {
-        let inner = self.inner.read().unwrap();
+    pub async fn size(&self) -> Result<u64> {
+        let inner = self.inner.read().await;
        Ok(inner.file.size)
    }

@@ -270,8 +255,8 @@ impl InMemoryLayer {
            timeline_id,
            tenant_id,
            start_lsn,
+            end_lsn: OnceLock::new(),
            inner: RwLock::new(InMemoryLayerInner {
-                end_lsn: None,
                index: HashMap::new(),
                file,
            }),
@@ -282,10 +267,10 @@ impl InMemoryLayer {

    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
-    pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
+    pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
-        let mut inner = self.inner.write().unwrap();
-        inner.assert_writeable();
+        let mut inner = self.inner.write().await;
+        self.assert_writable();

        let off = {
            SER_BUFFER.with(|x| -> Result<_> {
@@ -316,11 +301,11 @@ impl InMemoryLayer {
    /// Make the layer non-writeable. Only call once.
    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is exclusive
-    pub fn freeze(&self, end_lsn: Lsn) {
-        let mut inner = self.inner.write().unwrap();
+    pub async fn freeze(&self, end_lsn: Lsn) {
+        let inner = self.inner.write().await;

        assert!(self.start_lsn < end_lsn);
-        inner.end_lsn = Some(end_lsn);
+        self.end_lsn.set(end_lsn).expect("end_lsn set only once");

        for vec_map in inner.index.values() {
            for (lsn, _pos) in vec_map.as_slice() {
@@ -332,7 +317,7 @@ impl InMemoryLayer {
    /// Write this frozen in-memory layer to disk.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
-    pub fn write_to_disk(&self) -> Result<DeltaLayer> {
+    pub async fn write_to_disk(&self) -> Result<DeltaLayer> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -342,14 +327,16 @@ impl InMemoryLayer {
        // lock, it will see that it's not writeable anymore and retry, but it
        // would have to wait until we release it. That race condition is very
        // rare though, so we just accept the potential latency hit for now.
-        let inner = self.inner.read().unwrap();
+        let inner = self.inner.read().await;
+
+        let end_lsn = *self.end_lsn.get().unwrap();

        let mut delta_layer_writer = DeltaLayerWriter::new(
            self.conf,
            self.timeline_id,
            self.tenant_id,
            Key::MIN,
-            self.start_lsn..inner.end_lsn.unwrap(),
+            self.start_lsn..end_lsn,
        )?;

        let mut buf = Vec::new();
@@ -363,7 +350,7 @@ impl InMemoryLayer {
            let key = **key;
            // Write all page versions
            for (lsn, pos) in vec_map.as_slice() {
-                cursor.read_blob_into_buf(*pos, &mut buf)?;
+                cursor.read_blob_into_buf(*pos, &mut buf).await?;
                let will_init = Value::des(&buf)?.will_init();
                delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?;
            }
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -73,17 +73,13 @@ pub fn start_background_loops(
 ///
 async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    let wait_duration = Duration::from_secs(2);
-    info!("starting");
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
        let mut first = true;
        loop {
-            trace!("waking up");
-
            tokio::select! {
                _ = cancel.cancelled() => {
-                    info!("received cancellation request");
                    return;
                },
                tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
@@ -126,15 +122,12 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                .await
                .is_ok()
            {
-                info!("received cancellation request during idling");
                break;
            }
        }
    }
    .await;
    TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
-
-    trace!("compaction loop stopped.");
 }

 ///
@@ -142,7 +135,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
 ///
 async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    let wait_duration = Duration::from_secs(2);
-    info!("starting");
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        // GC might require downloading, to find the cutoff LSN that corresponds to the
@@ -151,11 +143,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
        let mut first = true;
        loop {
-            trace!("waking up");
-
            tokio::select! {
                _ = cancel.cancelled() => {
-                    info!("received cancellation request");
                    return;
                },
                tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
@@ -200,14 +189,12 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                .await
                .is_ok()
            {
-                info!("received cancellation request during idling");
                break;
            }
        }
    }
    .await;
    TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
-    trace!("GC loop stopped.");
 }

 async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
@@ -232,7 +219,6 @@ async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
                    }
                }
                Err(_sender_dropped_error) => {
-                    info!("Tenant dropped the state updates sender, quitting waiting for tenant and the task loop");
                    return ControlFlow::Break(());
                }
            }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -19,6 +19,7 @@ use pageserver_api::models::{
 use remote_storage::GenericRemoteStorage;
 use serde_with::serde_as;
 use storage_broker::BrokerClientChannel;
+use tokio::runtime::Handle;
 use tokio::sync::{oneshot, watch, TryAcquireError};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -34,7 +35,9 @@ use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};

-use crate::context::{DownloadBehavior, RequestContext};
+use crate::context::{
+    AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
+};
 use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
 use crate::tenant::storage_layer::{
    DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer,
@@ -697,6 +700,9 @@ impl Timeline {
                Err(CompactionError::DownloadRequired(rls)) => {
                    anyhow::bail!("Compaction requires downloading multiple times (last was {} layers), possibly battling against eviction", rls.len())
                }
+                Err(CompactionError::ShuttingDown) => {
+                    return Ok(());
+                }
                Err(CompactionError::Other(e)) => {
                    return Err(e);
                }
@@ -778,7 +784,8 @@ impl Timeline {
        let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
        // Is the timeline being deleted?
        if self.is_stopping() {
-            return Err(anyhow::anyhow!("timeline is Stopping").into());
+            trace!("Dropping out of compaction on timeline shutdown");
+            return Err(CompactionError::ShuttingDown);
        }

        let target_file_size = self.get_checkpoint_distance();
@@ -794,10 +801,15 @@ impl Timeline {
            .await
        {
            Ok((partitioning, lsn)) => {
+                // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
+                let image_ctx = RequestContextBuilder::extend(ctx)
+                    .access_stats_behavior(AccessStatsBehavior::Skip)
+                    .build();
+
                // 2. Create new image layers for partitions that have been modified
                // "enough".
                let layer_paths_to_upload = self
-                    .create_image_layers(&partitioning, lsn, false, ctx)
+                    .create_image_layers(&partitioning, lsn, false, &image_ctx)
                    .await
                    .map_err(anyhow::Error::from)?;
                if let Some(remote_client) = &self.remote_client {
@@ -870,7 +882,7 @@ impl Timeline {
            let Some(open_layer) = layers.open_layer.as_ref() else {
                return Ok(());
            };
-            open_layer.size()?
+            open_layer.size().await?
        };
        let last_freeze_at = self.last_freeze_at.load();
        let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
@@ -914,7 +926,7 @@ impl Timeline {
    pub fn set_state(&self, new_state: TimelineState) {
        match (self.current_state(), new_state) {
            (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
-                warn!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
+                info!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
            }
            (st, TimelineState::Loading) => {
                error!("ignoring transition from {st:?} into Loading state");
@@ -1155,7 +1167,7 @@ impl Timeline {
            return Err(EvictionError::CannotEvictRemoteLayer);
        }

-        let layer_file_size = local_layer.file_size();
+        let layer_file_size = local_layer.layer_desc().file_size;

        let local_layer_mtime = local_layer
            .local_path()
@@ -1585,7 +1597,6 @@ impl Timeline {
    ///
    pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
        let mut guard = self.layers.write().await;
-        let mut num_layers = 0;

        let timer = self.metrics.load_layer_map_histo.start_timer();

@@ -1603,12 +1614,12 @@ impl Timeline {
            let fname = direntry.file_name();
            let fname = fname.to_string_lossy();

-            if let Some(imgfilename) = ImageFileName::parse_str(&fname) {
+            if let Some(filename) = ImageFileName::parse_str(&fname) {
                // create an ImageLayer struct for each image file.
-                if imgfilename.lsn > disk_consistent_lsn {
+                if filename.lsn > disk_consistent_lsn {
                    info!(
                        "found future image layer {} on timeline {} disk_consistent_lsn is {}",
-                        imgfilename, self.timeline_id, disk_consistent_lsn
+                        filename, self.timeline_id, disk_consistent_lsn
                    );

                    rename_to_backup(&direntry_path)?;
@@ -1616,31 +1627,31 @@ impl Timeline {
                }

                let file_size = direntry_path.metadata()?.len();
+                let stats =
+                    LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);

                let layer = ImageLayer::new(
                    self.conf,
                    self.timeline_id,
                    self.tenant_id,
-                    &imgfilename,
+                    &filename,
                    file_size,
-                    LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident),
+                    stats,
                );

-                trace!("found layer {}", layer.path().display());
                total_physical_size += file_size;
                loaded_layers.push(Arc::new(layer));
-                num_layers += 1;
-            } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
+            } else if let Some(filename) = DeltaFileName::parse_str(&fname) {
                // Create a DeltaLayer struct for each delta file.
                // The end-LSN is exclusive, while disk_consistent_lsn is
                // inclusive. For example, if disk_consistent_lsn is 100, it is
                // OK for a delta layer to have end LSN 101, but if the end LSN
                // is 102, then it might not have been fully flushed to disk
                // before crash.
-                if deltafilename.lsn_range.end > disk_consistent_lsn + 1 {
+                if filename.lsn_range.end > disk_consistent_lsn + 1 {
                    info!(
                        "found future delta layer {} on timeline {} disk_consistent_lsn is {}",
-                        deltafilename, self.timeline_id, disk_consistent_lsn
+                        filename, self.timeline_id, disk_consistent_lsn
                    );

                    rename_to_backup(&direntry_path)?;
@@ -1648,20 +1659,20 @@ impl Timeline {
                }

                let file_size = direntry_path.metadata()?.len();
+                let stats =
+                    LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);

                let layer = DeltaLayer::new(
                    self.conf,
                    self.timeline_id,
                    self.tenant_id,
-                    &deltafilename,
+                    &filename,
                    file_size,
-                    LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident),
+                    stats,
                );

-                trace!("found layer {}", layer.path().display());
                total_physical_size += file_size;
                loaded_layers.push(Arc::new(layer));
-                num_layers += 1;
            } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
                // ignore these
            } else if remote_timeline_client::is_temp_download_file(&direntry_path) {
@@ -1686,6 +1697,7 @@ impl Timeline {
            }
        }

+        let num_layers = loaded_layers.len();
        guard.initialize_local_layers(loaded_layers, Lsn(disk_consistent_lsn.0) + 1);

        info!(
@@ -1786,13 +1798,15 @@ impl Timeline {
                    );
                        continue;
                    }
+                    let stats =
+                        LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);

                    let remote_layer = RemoteLayer::new_img(
                        self.tenant_id,
                        self.timeline_id,
                        imgfilename,
                        &remote_layer_metadata,
-                        LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted),
+                        stats,
                    );
                    let remote_layer = Arc::new(remote_layer);
                    added_remote_layers.push(remote_layer);
@@ -1811,12 +1825,15 @@ impl Timeline {
                        );
                        continue;
                    }
+                    let stats =
+                        LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);
+
                    let remote_layer = RemoteLayer::new_delta(
                        self.tenant_id,
                        self.timeline_id,
                        deltafilename,
                        &remote_layer_metadata,
-                        LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted),
+                        stats,
                    );
                    let remote_layer = Arc::new(remote_layer);
                    added_remote_layers.push(remote_layer);
@@ -2264,15 +2281,16 @@ trait TraversalLayerExt {

 impl TraversalLayerExt for Arc<dyn PersistentLayer> {
    fn traversal_id(&self) -> TraversalId {
+        let timeline_id = self.layer_desc().timeline_id;
        match self.local_path() {
            Some(local_path) => {
-                debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", self.get_timeline_id())),
+                debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", timeline_id)),
                    "need timeline ID to uniquely identify the layer when traversal crosses ancestor boundary",
                );
                format!("{}", local_path.display())
            }
            None => {
-                format!("remote {}/{self}", self.get_timeline_id())
+                format!("remote {}/{self}", timeline_id)
            }
        }
    }
@@ -2636,7 +2654,7 @@ impl Timeline {
    async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> {
        //info!("PUT: key {} at {}", key, lsn);
        let layer = self.get_layer_for_write(lsn).await?;
-        layer.put_value(key, lsn, val)?;
+        layer.put_value(key, lsn, val).await?;
        Ok(())
    }

@@ -2662,7 +2680,9 @@ impl Timeline {
            Some(self.write_lock.lock().await)
        };
        let mut guard = self.layers.write().await;
-        guard.try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at);
+        guard
+            .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at)
+            .await;
    }

    /// Layer flusher task's main loop.
@@ -2808,7 +2828,10 @@ impl Timeline {
                // We will remove frozen layer and add delta layer in one atomic operation later.
                let layer = self.create_delta_layer(&frozen_layer).await?;
                (
-                    HashMap::from([(layer.filename(), LayerFileMetadata::new(layer.file_size()))]),
+                    HashMap::from([(
+                        layer.filename(),
+                        LayerFileMetadata::new(layer.layer_desc().file_size),
+                    )]),
                    Some(layer),
                )
            };
@@ -2828,7 +2851,7 @@ impl Timeline {
                );

                // update metrics
-                let sz = l.file_size();
+                let sz = l.layer_desc().file_size;
                self.metrics.resident_physical_size_gauge.add(sz);
                self.metrics.num_persistent_files_created.inc_by(1);
                self.metrics.persistent_bytes_written.inc_by(sz);
@@ -2941,7 +2964,11 @@ impl Timeline {
            let frozen_layer = Arc::clone(frozen_layer);
            move || {
                // Write it out
-                let new_delta = frozen_layer.write_to_disk()?;
+                // Keep this inside `spawn_blocking` and `Handle::current`
+                // as long as the write path is still sync and the read impl
+                // is still not fully async. Otherwise executor threads would
+                // be blocked.
+                let new_delta = Handle::current().block_on(frozen_layer.write_to_disk())?;
                let new_delta_path = new_delta.path();

                // Sync it to disk.
@@ -3235,6 +3262,8 @@ enum CompactionError {
    /// This should not happen repeatedly, but will be retried once by top-level
    /// `Timeline::compact`.
    DownloadRequired(Vec<Arc<RemoteLayer>>),
+    /// The timeline or pageserver is shutting down
+    ShuttingDown,
    /// Compaction cannot be done right now; page reconstruction and so on.
    Other(anyhow::Error),
 }
@@ -3445,14 +3474,14 @@ impl Timeline {
        // "gaps" in the sequence of level 0 files should only happen in case
        // of a crash, partial download from cloud storage, or something like
        // that, so it's not a big deal in practice.
-        level0_deltas.sort_by_key(|l| l.get_lsn_range().start);
+        level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start);
        let mut level0_deltas_iter = level0_deltas.iter();

        let first_level0_delta = level0_deltas_iter.next().unwrap();
-        let mut prev_lsn_end = first_level0_delta.get_lsn_range().end;
+        let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
        let mut deltas_to_compact = vec![Arc::clone(first_level0_delta)];
        for l in level0_deltas_iter {
-            let lsn_range = l.get_lsn_range();
+            let lsn_range = &l.layer_desc().lsn_range;

            if lsn_range.start != prev_lsn_end {
                break;
@@ -3461,8 +3490,13 @@ impl Timeline {
            prev_lsn_end = lsn_range.end;
        }
        let lsn_range = Range {
-            start: deltas_to_compact.first().unwrap().get_lsn_range().start,
-            end: deltas_to_compact.last().unwrap().get_lsn_range().end,
+            start: deltas_to_compact
+                .first()
+                .unwrap()
+                .layer_desc()
+                .lsn_range
+                .start,
+            end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end,
        };

        let remotes = deltas_to_compact
@@ -3512,16 +3546,30 @@ impl Timeline {
        // min-heap (reserve space for one more element added before eviction)
        let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
        let mut prev: Option<Key> = None;
-        for (next_key, _next_lsn, _size) in itertools::process_results(
-            deltas_to_compact.iter().map(|l| -> Result<_> {
-                Ok(l.clone()
-                    .downcast_delta_layer()
-                    .expect("delta layer")
-                    .load_keys(ctx)?
-                    .into_iter())
-            }),
-            |iter_iter| iter_iter.kmerge_by(|a, b| a.0 < b.0),
-        )? {
+
+        let mut all_value_refs = Vec::new();
+        let mut all_keys = Vec::new();
+
+        for l in deltas_to_compact.iter() {
+            // TODO: replace this with an await once we fully go async
+            let delta = l.clone().downcast_delta_layer().expect("delta layer");
+            Handle::current().block_on(async {
+                all_value_refs.extend(delta.load_val_refs(ctx).await?);
+                all_keys.extend(delta.load_keys(ctx).await?);
+                anyhow::Ok(())
+            })?;
+        }
+
+        // The current stdlib sorting implementation is designed in a way where it is
+        // particularly fast where the slice is made up of sorted sub-ranges.
+        all_value_refs.sort_by_key(|(key, lsn, _value_ref)| (*key, *lsn));
+
+        // The current stdlib sorting implementation is designed in a way where it is
+        // particularly fast where the slice is made up of sorted sub-ranges.
+        all_keys.sort_by_key(|(key, lsn, _size)| (*key, *lsn));
+
+        for (next_key, _next_lsn, _size) in all_keys.iter() {
+            let next_key = *next_key;
            if let Some(prev_key) = prev {
                // just first fast filter
                if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
@@ -3554,40 +3602,10 @@ impl Timeline {

        // This iterator walks through all key-value pairs from all the layers
        // we're compacting, in key, LSN order.
-        let all_values_iter = itertools::process_results(
-            deltas_to_compact.iter().map(|l| -> Result<_> {
-                Ok(l.clone()
-                    .downcast_delta_layer()
-                    .expect("delta layer")
-                    .load_val_refs(ctx)?
-                    .into_iter())
-            }),
-            |iter_iter| {
-                iter_iter.kmerge_by(|a, b| {
-                    let (a_key, a_lsn, _) = a;
-                    let (b_key, b_lsn, _) = b;
-                    (a_key, a_lsn) < (b_key, b_lsn)
-                })
-            },
-        )?;
+        let all_values_iter = all_value_refs.into_iter();

        // This iterator walks through all keys and is needed to calculate size used by each key
-        let mut all_keys_iter = itertools::process_results(
-            deltas_to_compact.iter().map(|l| -> Result<_> {
-                Ok(l.clone()
-                    .downcast_delta_layer()
-                    .expect("delta layer")
-                    .load_keys(ctx)?
-                    .into_iter())
-            }),
-            |iter_iter| {
-                iter_iter.kmerge_by(|a, b| {
-                    let (a_key, a_lsn, _) = a;
-                    let (b_key, b_lsn, _) = b;
-                    (a_key, a_lsn) < (b_key, b_lsn)
-                })
-            },
-        )?;
+        let mut all_keys_iter = all_keys.into_iter();

        stats.prepare_iterators_micros = stats.read_lock_drop_micros.till_now();

@@ -3641,98 +3659,105 @@ impl Timeline {
        let mut key_values_total_size = 0u64;
        let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
        let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
-        for (key, lsn, value_ref) in all_values_iter {
-            let value = value_ref.load()?;
-            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
-            // We need to check key boundaries once we reach next key or end of layer with the same key
-            if !same_key || lsn == dup_end_lsn {
-                let mut next_key_size = 0u64;
-                let is_dup_layer = dup_end_lsn.is_valid();
-                dup_start_lsn = Lsn::INVALID;
-                if !same_key {
-                    dup_end_lsn = Lsn::INVALID;
+
+        // TODO remove this block_on wrapper once we fully go async
+        Handle::current().block_on(async {
+            for (key, lsn, value_ref) in all_values_iter {
+                let value = value_ref.load().await?;
+                let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
+                // We need to check key boundaries once we reach next key or end of layer with the same key
+                if !same_key || lsn == dup_end_lsn {
+                    let mut next_key_size = 0u64;
+                    let is_dup_layer = dup_end_lsn.is_valid();
+                    dup_start_lsn = Lsn::INVALID;
+                    if !same_key {
+                        dup_end_lsn = Lsn::INVALID;
+                    }
+                    // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
+                    for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
+                        next_key_size = next_size;
+                        if key != next_key {
+                            if dup_end_lsn.is_valid() {
+                                // We are writting segment with duplicates:
+                                // place all remaining values of this key in separate segment
+                                dup_start_lsn = dup_end_lsn; // new segments starts where old stops
+                                dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
+                            }
+                            break;
+                        }
+                        key_values_total_size += next_size;
+                        // Check if it is time to split segment: if total keys size is larger than target file size.
+                        // We need to avoid generation of empty segments if next_size > target_file_size.
+                        if key_values_total_size > target_file_size && lsn != next_lsn {
+                            // Split key between multiple layers: such layer can contain only single key
+                            dup_start_lsn = if dup_end_lsn.is_valid() {
+                                dup_end_lsn // new segment with duplicates starts where old one stops
+                            } else {
+                                lsn // start with the first LSN for this key
+                            };
+                            dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
+                            break;
+                        }
+                    }
+                    // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
+                    if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
+                        dup_start_lsn = dup_end_lsn;
+                        dup_end_lsn = lsn_range.end;
+                    }
+                    if writer.is_some() {
+                        let written_size = writer.as_mut().unwrap().size();
+                        let contains_hole =
+                            next_hole < holes.len() && key >= holes[next_hole].key_range.end;
+                        // check if key cause layer overflow or contains hole...
+                        if is_dup_layer
+                            || dup_end_lsn.is_valid()
+                            || written_size + key_values_total_size > target_file_size
+                            || contains_hole
+                        {
+                            // ... if so, flush previous layer and prepare to write new one
+                            new_layers.push(Arc::new(
+                                writer.take().unwrap().finish(prev_key.unwrap().next())?,
+                            ));
+                            writer = None;
+
+                            if contains_hole {
+                                // skip hole
+                                next_hole += 1;
+                            }
+                        }
+                    }
+                    // Remember size of key value because at next iteration we will access next item
+                    key_values_total_size = next_key_size;
                }
-                // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
-                for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
-                    next_key_size = next_size;
-                    if key != next_key {
+                if writer.is_none() {
+                    // Create writer if not initiaized yet
+                    writer = Some(DeltaLayerWriter::new(
+                        self.conf,
+                        self.timeline_id,
+                        self.tenant_id,
+                        key,
                        if dup_end_lsn.is_valid() {
-                            // We are writting segment with duplicates:
-                            // place all remaining values of this key in separate segment
-                            dup_start_lsn = dup_end_lsn; // new segments starts where old stops
-                            dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
-                        }
-                        break;
-                    }
-                    key_values_total_size += next_size;
-                    // Check if it is time to split segment: if total keys size is larger than target file size.
-                    // We need to avoid generation of empty segments if next_size > target_file_size.
-                    if key_values_total_size > target_file_size && lsn != next_lsn {
-                        // Split key between multiple layers: such layer can contain only single key
-                        dup_start_lsn = if dup_end_lsn.is_valid() {
-                            dup_end_lsn // new segment with duplicates starts where old one stops
+                            // this is a layer containing slice of values of the same key
+                            debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
+                            dup_start_lsn..dup_end_lsn
                        } else {
-                            lsn // start with the first LSN for this key
-                        };
-                        dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
-                        break;
-                    }
+                            debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
+                            lsn_range.clone()
+                        },
+                    )?);
                }
-                // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
-                if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
-                    dup_start_lsn = dup_end_lsn;
-                    dup_end_lsn = lsn_range.end;
-                }
-                if writer.is_some() {
-                    let written_size = writer.as_mut().unwrap().size();
-                    let contains_hole =
-                        next_hole < holes.len() && key >= holes[next_hole].key_range.end;
-                    // check if key cause layer overflow or contains hole...
-                    if is_dup_layer
-                        || dup_end_lsn.is_valid()
-                        || written_size + key_values_total_size > target_file_size
-                        || contains_hole
-                    {
-                        // ... if so, flush previous layer and prepare to write new one
-                        new_layers.push(Arc::new(
-                            writer.take().unwrap().finish(prev_key.unwrap().next())?,
-                        ));
-                        writer = None;

-                        if contains_hole {
-                            // skip hole
-                            next_hole += 1;
-                        }
-                    }
-                }
-                // Remember size of key value because at next iteration we will access next item
-                key_values_total_size = next_key_size;
+                fail_point!("delta-layer-writer-fail-before-finish", |_| {
+                    Result::<_>::Err(anyhow::anyhow!(
+                        "failpoint delta-layer-writer-fail-before-finish"
+                    ))
+                });
+
+                writer.as_mut().unwrap().put_value(key, lsn, value)?;
+                prev_key = Some(key);
            }
-            if writer.is_none() {
-                // Create writer if not initiaized yet
-                writer = Some(DeltaLayerWriter::new(
-                    self.conf,
-                    self.timeline_id,
-                    self.tenant_id,
-                    key,
-                    if dup_end_lsn.is_valid() {
-                        // this is a layer containing slice of values of the same key
-                        debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
-                        dup_start_lsn..dup_end_lsn
-                    } else {
-                        debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
-                        lsn_range.clone()
-                    },
-                )?);
-            }
-
-            fail_point!("delta-layer-writer-fail-before-finish", |_| {
-                Err(anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into())
-            });
-
-            writer.as_mut().unwrap().put_value(key, lsn, value)?;
-            prev_key = Some(key);
-        }
+            Ok(())
+        })?;
        if let Some(writer) = writer {
            new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
        }
@@ -4654,7 +4679,7 @@ impl std::fmt::Debug for LocalLayerInfoForDiskUsageEviction {

 impl LocalLayerInfoForDiskUsageEviction {
    pub fn file_size(&self) -> u64 {
-        self.layer.file_size()
+        self.layer.layer_desc().file_size
    }
 }

--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -219,27 +219,13 @@ async fn delete_local_layer_files(
            }
        };

-        let r = if metadata.is_dir() {
-            // There shouldnt be any directories inside timeline dir as of current layout.
+        if metadata.is_dir() {
+            warn!(path=%entry.path().display(), "unexpected directory under timeline dir");
            tokio::fs::remove_dir(entry.path()).await
        } else {
            tokio::fs::remove_file(entry.path()).await
-        };
-
-        if let Err(e) = r {
-            if e.kind() == std::io::ErrorKind::NotFound {
-                warn!(
-                    timeline_dir=?local_timeline_directory,
-                    path=?entry.path().display(),
-                    "got not found err while removing timeline dir, proceeding anyway"
-                );
-                continue;
-            }
-            anyhow::bail!(anyhow::anyhow!(
-                "Failed to remove: {}. Error: {e}",
-                entry.path().display()
-            ));
        }
+        .with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
    }

    info!("finished deleting layer files, releasing layer_removal_cs.lock()");
@@ -293,6 +279,17 @@ async fn cleanup_remaining_timeline_fs_traces(
        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
    });

+    // Make sure previous deletions are ordered before mark removal.
+    // Otherwise there is no guarantee that they reach the disk before mark deletion.
+    // So its possible for mark to reach disk first and for other deletions
+    // to be reordered later and thus missed if a crash occurs.
+    // Note that we dont need to sync after mark file is removed
+    // because we can tolerate the case when mark file reappears on startup.
+    let timeline_path = conf.timelines_path(&tenant_id);
+    crashsafe::fsync_async(timeline_path)
+        .await
+        .context("fsync_pre_mark_remove")?;
+
    // Remove delete mark
    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
        .await
@@ -359,10 +356,11 @@ impl DeleteTimelineFlow {
    // NB: If this fails half-way through, and is retried, the retry will go through
    // all the same steps again. Make sure the code here is idempotent, and don't
    // error out if some of the shutdown tasks have already been completed!
-    #[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))]
+    #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_id))]
    pub async fn run(
        tenant: &Arc<Tenant>,
        timeline_id: TimelineId,
+        inplace: bool,
    ) -> Result<(), DeleteTimelineError> {
        let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;

@@ -380,7 +378,11 @@ impl DeleteTimelineFlow {
            ))?
        });

-        Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        if inplace {
+            Self::background(guard, tenant.conf, tenant, &timeline).await?
+        } else {
+            Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        }

        Ok(())
    }
@@ -398,6 +400,8 @@ impl DeleteTimelineFlow {
    }

    /// Shortcut to create Timeline in stopping state and spawn deletion task.
+    /// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
+    #[instrument(skip_all, fields(%timeline_id))]
    pub async fn resume_deletion(
        tenant: Arc<Tenant>,
        timeline_id: TimelineId,
@@ -444,11 +448,15 @@ impl DeleteTimelineFlow {
        Ok(())
    }

+    #[instrument(skip_all, fields(%timeline_id))]
    pub async fn cleanup_remaining_timeline_fs_traces(
        tenant: &Tenant,
        timeline_id: TimelineId,
    ) -> anyhow::Result<()> {
-        cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await
+        let r =
+            cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await;
+        info!("Done");
+        r
    }

    fn prepare(
@@ -494,11 +502,17 @@ impl DeleteTimelineFlow {
        // At the end of the operation we're holding the guard and need to lock timelines map
        // to remove the timeline from it.
        // Always if you have two locks that are taken in different order this can result in a deadlock.
-        let delete_lock_guard = DeletionGuard(
-            Arc::clone(&timeline.delete_progress)
-                .try_lock_owned()
-                .map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
-        );
+
+        let delete_progress = Arc::clone(&timeline.delete_progress);
+        let delete_lock_guard = match delete_progress.try_lock_owned() {
+            Ok(guard) => DeletionGuard(guard),
+            Err(_) => {
+                // Unfortunately if lock fails arc is consumed.
+                return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
+                    &timeline.delete_progress,
+                )));
+            }
+        };

        timeline.set_state(TimelineState::Stopping);

@@ -553,10 +567,14 @@ impl DeleteTimelineFlow {

        remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;

-        *guard.0 = Self::Finished;
+        *guard = Self::Finished;

        Ok(())
    }
+
+    pub(crate) fn is_finished(&self) -> bool {
+        matches!(self, Self::Finished)
+    }
 }

 struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -78,9 +78,6 @@ impl Timeline {

    #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
    async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
-        scopeguard::defer! {
-            info!("eviction task finishing");
-        }
        use crate::tenant::tasks::random_init_delay;
        {
            let policy = self.get_eviction_policy();
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -120,10 +120,9 @@ impl LayerManager {

        ensure!(
            lsn > last_record_lsn,
-            "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})\n{}",
+            "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
            lsn,
            last_record_lsn,
-            std::backtrace::Backtrace::force_capture(),
        );

        // Do we have a layer open for writing already?
@@ -164,7 +163,7 @@ impl LayerManager {
    }

    /// Called from `freeze_inmem_layer`, returns true if successfully frozen.
-    pub fn try_freeze_in_memory_layer(
+    pub async fn try_freeze_in_memory_layer(
        &mut self,
        Lsn(last_record_lsn): Lsn,
        last_freeze_at: &AtomicLsn,
@@ -174,7 +173,7 @@ impl LayerManager {
        if let Some(open_layer) = &self.layer_map.open_layer {
            let open_layer_rc = Arc::clone(open_layer);
            // Does this layer need freezing?
-            open_layer.freeze(end_lsn);
+            open_layer.freeze(end_lsn).await;

            // The layer is no longer open, update the layer map to reflect this.
            // We will replace it with on-disk historics below.
@@ -278,7 +277,7 @@ impl LayerManager {
        updates: &mut BatchedUpdates<'_>,
        mapping: &mut LayerFileManager,
    ) {
-        updates.remove_historic(layer.layer_desc().clone());
+        updates.remove_historic(layer.layer_desc());
        mapping.remove(layer);
    }

@@ -292,10 +291,10 @@ impl LayerManager {
        metrics: &TimelineMetrics,
        mapping: &mut LayerFileManager,
    ) -> anyhow::Result<()> {
+        let desc = layer.layer_desc();
        if !layer.is_remote_layer() {
            layer.delete_resident_layer_file()?;
-            let layer_file_size = layer.file_size();
-            metrics.resident_physical_size_gauge.sub(layer_file_size);
+            metrics.resident_physical_size_gauge.sub(desc.file_size);
        }

        // TODO Removing from the bottom of the layer map is expensive.
@@ -303,7 +302,7 @@ impl LayerManager {
        //      won't be needed for page reconstruction for this timeline,
        //      and mark what we can't delete yet as deleted from the layer
        //      map index without actually rebuilding the index.
-        updates.remove_historic(layer.layer_desc().clone());
+        updates.remove_historic(desc);
        mapping.remove(layer);

        Ok(())
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -31,14 +31,19 @@ use storage_broker::Streaming;
 use tokio::select;
 use tracing::*;

-use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use postgres_connection::{parse_host_port, PgConnectionConfig};
+use utils::backoff::{
+    exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
+};
 use utils::{
    id::{NodeId, TenantTimelineId},
    lsn::Lsn,
 };

-use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle};
+use super::{
+    walreceiver_connection::WalConnectionStatus, walreceiver_connection::WalReceiverError,
+    TaskEvent, TaskHandle,
+};

 /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
 /// Based on the updates, desides whether to start, keep or stop a WAL receiver task.
@@ -419,13 +424,19 @@ impl ConnectionManagerState {
                match res {
                    Ok(()) => Ok(()),
                    Err(e) => {
-                        use super::walreceiver_connection::ExpectedError;
-                        if e.is_expected() {
-                            info!("walreceiver connection handling ended: {e:#}");
-                            Ok(())
-                        } else {
-                            // give out an error to have task_mgr give it a really verbose logging
-                            Err(e).context("walreceiver connection handling failure")
+                        match e {
+                            WalReceiverError::SuccessfulCompletion(msg) => {
+                                info!("walreceiver connection handling ended with success: {msg}");
+                                Ok(())
+                            }
+                            WalReceiverError::ExpectedSafekeeperError(e) => {
+                                info!("walreceiver connection handling ended: {e}");
+                                Ok(())
+                            }
+                            WalReceiverError::Other(e) => {
+                                // give out an error to have task_mgr give it a really verbose logging
+                                Err(e).context("walreceiver connection handling failure")
+                            }
                        }
                    }
                }
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -8,14 +8,14 @@ use std::{
    time::{Duration, SystemTime},
 };

-use anyhow::{bail, ensure, Context};
+use anyhow::{anyhow, Context};
 use bytes::BytesMut;
 use chrono::{NaiveDateTime, Utc};
 use fail::fail_point;
 use futures::StreamExt;
 use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow};
-use postgres_ffi::v14::xlog_utils::normalize_lsn;
 use postgres_ffi::WAL_SEGMENT_SIZE;
+use postgres_ffi::{v14::xlog_utils::normalize_lsn, waldecoder::WalDecodeError};
 use postgres_protocol::message::backend::ReplicationMessage;
 use postgres_types::PgLsn;
 use tokio::{select, sync::watch, time};
@@ -60,6 +60,50 @@ pub(super) struct WalConnectionStatus {
    pub node: NodeId,
 }

+pub(super) enum WalReceiverError {
+    /// An error of a type that does not indicate an issue, e.g. a connection closing
+    ExpectedSafekeeperError(postgres::Error),
+    /// An "error" message that carries a SUCCESSFUL_COMPLETION status code.  Carries
+    /// the message part of the original postgres error
+    SuccessfulCompletion(String),
+    /// Generic error
+    Other(anyhow::Error),
+}
+
+impl From<tokio_postgres::Error> for WalReceiverError {
+    fn from(err: tokio_postgres::Error) -> Self {
+        if let Some(dberror) = err.as_db_error().filter(|db_error| {
+            db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
+                && db_error.message().contains("ending streaming")
+        }) {
+            // Strip the outer DbError, which carries a misleading "error" severity
+            Self::SuccessfulCompletion(dberror.message().to_string())
+        } else if err.is_closed()
+            || err
+                .source()
+                .and_then(|source| source.downcast_ref::<std::io::Error>())
+                .map(is_expected_io_error)
+                .unwrap_or(false)
+        {
+            Self::ExpectedSafekeeperError(err)
+        } else {
+            Self::Other(anyhow::Error::new(err))
+        }
+    }
+}
+
+impl From<anyhow::Error> for WalReceiverError {
+    fn from(err: anyhow::Error) -> Self {
+        Self::Other(err)
+    }
+}
+
+impl From<WalDecodeError> for WalReceiverError {
+    fn from(err: WalDecodeError) -> Self {
+        Self::Other(anyhow::Error::new(err))
+    }
+}
+
 /// Open a connection to the given safekeeper and receive WAL, sending back progress
 /// messages as we go.
 pub(super) async fn handle_walreceiver_connection(
@@ -70,7 +114,7 @@ pub(super) async fn handle_walreceiver_connection(
    connect_timeout: Duration,
    ctx: RequestContext,
    node: NodeId,
-) -> anyhow::Result<()> {
+) -> Result<(), WalReceiverError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

    WALRECEIVER_STARTED_CONNECTIONS.inc();
@@ -130,11 +174,15 @@ pub(super) async fn handle_walreceiver_connection(
                connection_result = connection => match connection_result {
                    Ok(()) => debug!("Walreceiver db connection closed"),
                    Err(connection_error) => {
-                        if connection_error.is_expected() {
-                            // silence, because most likely we've already exited the outer call
-                            // with a similar error.
-                        } else {
-                            warn!("Connection aborted: {connection_error:#}")
+                        match WalReceiverError::from(connection_error) {
+                            WalReceiverError::ExpectedSafekeeperError(_) => {
+                                // silence, because most likely we've already exited the outer call
+                                // with a similar error.
+                            },
+                            WalReceiverError::SuccessfulCompletion(_) => {}
+                            WalReceiverError::Other(err) => {
+                                warn!("Connection aborted: {err:#}")
+                            }
                        }
                    }
                },
@@ -180,7 +228,7 @@ pub(super) async fn handle_walreceiver_connection(
    let mut startpoint = last_rec_lsn;

    if startpoint == Lsn(0) {
-        bail!("No previous WAL position");
+        return Err(WalReceiverError::Other(anyhow!("No previous WAL position")));
    }

    // There might be some padding after the last full record, skip it.
@@ -262,7 +310,9 @@ pub(super) async fn handle_walreceiver_connection(
                        // It is important to deal with the aligned records as lsn in getPage@LSN is
                        // aligned and can be several bytes bigger. Without this alignment we are
                        // at risk of hitting a deadlock.
-                        ensure!(lsn.is_aligned());
+                        if !lsn.is_aligned() {
+                            return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
+                        }

                        walingest
                            .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
@@ -419,51 +469,3 @@ async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem>
        Err(IdentifyError.into())
    }
 }
-
-/// Trait for avoid reporting walreceiver specific expected or "normal" or "ok" errors.
-pub(super) trait ExpectedError {
-    /// Test if this error is an ok error.
-    ///
-    /// We don't want to report connectivity problems as real errors towards connection manager because
-    /// 1. they happen frequently enough to make server logs hard to read and
-    /// 2. the connection manager can retry other safekeeper.
-    ///
-    /// If this function returns `true`, it's such an error.
-    /// The caller should log it at info level and then report to connection manager that we're done handling this connection.
-    /// Connection manager will then handle reconnections.
-    ///
-    /// If this function returns an `false` the error should be propagated and the connection manager
-    /// will log the error at ERROR level.
-    fn is_expected(&self) -> bool;
-}
-
-impl ExpectedError for postgres::Error {
-    fn is_expected(&self) -> bool {
-        self.is_closed()
-            || self
-                .source()
-                .and_then(|source| source.downcast_ref::<std::io::Error>())
-                .map(is_expected_io_error)
-                .unwrap_or(false)
-            || self
-                .as_db_error()
-                .filter(|db_error| {
-                    db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
-                        && db_error.message().contains("ending streaming")
-                })
-                .is_some()
-    }
-}
-
-impl ExpectedError for anyhow::Error {
-    fn is_expected(&self) -> bool {
-        let head = self.downcast_ref::<postgres::Error>();
-
-        let tail = self
-            .chain()
-            .filter_map(|e| e.downcast_ref::<postgres::Error>());
-
-        // check if self or any of the chained/sourced errors are expected
-        head.into_iter().chain(tail).any(|e| e.is_expected())
-    }
-}
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -53,6 +53,9 @@ pub struct VirtualFile {
    pub path: PathBuf,
    open_options: OpenOptions,

+    // These are strings becase we only use them for metrics, and those expect strings.
+    // It makes no sense for us to constantly turn the `TimelineId` and `TenantId` into
+    // strings.
    tenant_id: String,
    timeline_id: String,
 }
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -450,15 +450,6 @@ impl<'a> WalIngest<'a> {
            let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
            if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
                let xlrec = XlHeapMultiInsert::decode(buf);
-
-                let offset_array_len = if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
-                    // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
-                    0
-                } else {
-                    std::mem::size_of::<u16>() * xlrec.ntuples as usize
-                };
-                assert_eq!(offset_array_len, buf.remaining());
-
                if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
                    new_heap_blkno = Some(decoded.blocks[0].blkno);
                }
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -270,13 +270,67 @@ pub struct XlHeapDelete {

 impl XlHeapDelete {
    pub fn decode(buf: &mut Bytes) -> XlHeapDelete {
+        let neon_format = buf.remaining() == pg_constants::SIZE_OF_HEAP_DELETE;
+        let xmax = buf.get_u32_le();
+        let offnum = buf.get_u16_le();
+        let _padding;
+        let t_cid;
+        if neon_format {
+            _padding = buf.get_u16_le();
+            t_cid = buf.get_u32_le();
+        } else {
+            _padding = 0;
+            t_cid = 0;
+        }
+        let infobits_set = buf.get_u8();
+        let flags = buf.get_u8();
+        assert!(((flags & pg_constants::XLH_DELETE_STORE_CID) == 0) ^ neon_format);
        XlHeapDelete {
-            xmax: buf.get_u32_le(),
-            offnum: buf.get_u16_le(),
-            _padding: buf.get_u16_le(),
-            t_cid: buf.get_u32_le(),
-            infobits_set: buf.get_u8(),
-            flags: buf.get_u8(),
+            xmax,
+            offnum,
+            _padding,
+            t_cid,
+            infobits_set,
+            flags,
+        }
+    }
+}
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct XlHeapLock {
+    pub locking_xid: TransactionId,
+    pub offnum: OffsetNumber,
+    pub _padding: u16,
+    pub t_cid: u32,
+    pub infobits_set: u8,
+    pub flags: u8,
+}
+
+impl XlHeapLock {
+    pub fn decode(buf: &mut Bytes) -> XlHeapLock {
+        let neon_format = buf.remaining() == pg_constants::SIZE_OF_HEAP_LOCK;
+        let locking_xid = buf.get_u32_le();
+        let offnum = buf.get_u16_le();
+        let _padding;
+        let t_cid;
+        if neon_format {
+            _padding = buf.get_u16_le();
+            t_cid = buf.get_u32_le();
+        } else {
+            _padding = 0;
+            t_cid = 0;
+        }
+        let infobits_set = buf.get_u8();
+        let flags = buf.get_u8();
+        assert!(((flags & pg_constants::XLH_LOCK_STORE_CID) == 0) ^ neon_format);
+        XlHeapLock {
+            locking_xid,
+            offnum,
+            _padding,
+            t_cid,
+            infobits_set,
+            flags,
        }
    }
 }
@@ -295,12 +349,21 @@ pub struct XlHeapUpdate {

 impl XlHeapUpdate {
    pub fn decode(buf: &mut Bytes) -> XlHeapUpdate {
+        let old_xmax = buf.get_u32_le();
+        let old_offnum = buf.get_u16_le();
+        let old_infobits_set = buf.get_u8();
+        let flags = buf.get_u8();
+        let t_cid = if (flags & pg_constants::XLH_UPDATE_STORE_CID) != 0 {
+            buf.get_u32()
+        } else {
+            0
+        };
        XlHeapUpdate {
-            old_xmax: buf.get_u32_le(),
-            old_offnum: buf.get_u16_le(),
-            old_infobits_set: buf.get_u8(),
-            flags: buf.get_u8(),
-            t_cid: buf.get_u32(),
+            old_xmax,
+            old_offnum,
+            old_infobits_set,
+            flags,
+            t_cid,
            new_xmax: buf.get_u32_le(),
            new_offnum: buf.get_u16_le(),
        }
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -4,6 +4,7 @@
 MODULE_big = neon
 OBJS = \
 	$(WIN32RES) \
+	extension_server.o \
 	file_cache.o \
 	libpagestore.o \
 	libpqwalproposer.o \
--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -0,0 +1,103 @@
+
+/*-------------------------------------------------------------------------
+ *
+ * extension_server.c
+ *	  Request compute_ctl to download extension files.
+ *
+ * IDENTIFICATION
+ *	 contrib/neon/extension_server.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "tcop/pquery.h"
+#include "tcop/utility.h"
+#include "access/xact.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+#include "commands/defrem.h"
+#include "miscadmin.h"
+#include "utils/acl.h"
+#include "fmgr.h"
+#include "utils/guc.h"
+#include "port.h"
+#include "fmgr.h"
+
+#include <curl/curl.h>
+
+static int extension_server_port = 0;
+
+static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
+
+// to download all SQL (and data) files for an extension:
+// curl -X POST http://localhost:8080/extension_server/postgis
+// it covers two possible extension files layouts:
+// 1. extension_name--version--platform.sql
+// 2. extension_name/extension_name--version.sql
+//    extension_name/extra_files.csv
+//
+// to download specific library file:
+// curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
+static bool
+neon_download_extension_file_http(const char *filename, bool is_library)
+{
+    CURL *curl;
+    CURLcode res;
+    char *compute_ctl_url;
+    char *postdata;
+    bool ret = false;
+
+    if ((curl = curl_easy_init()) == NULL)
+    {
+        elog(ERROR, "Failed to initialize curl handle");
+    }
+
+    compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
+                               extension_server_port, filename, is_library ? "?is_library=true" : "");
+
+    elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
+
+    curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
+    curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
+    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */);
+
+    if (curl)
+    {
+        /* Perform the request, res will get the return code */
+        res = curl_easy_perform(curl);
+        /* Check for errors */
+        if (res == CURLE_OK)
+        {
+            ret = true;
+        }
+        else
+        {
+            // Don't error here because postgres will try to find the file
+            // and will fail with some proper error message if it's not found.
+            elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
+        }
+
+        /* always cleanup */
+        curl_easy_cleanup(curl);
+    }
+
+    return ret;
+}
+
+void pg_init_extension_server()
+{
+    // Port to connect to compute_ctl on localhost
+    // to request extension files.
+    DefineCustomIntVariable("neon.extension_server_port",
+                            "connection string to the compute_ctl",
+                            NULL,
+                            &extension_server_port,
+                            0, 0, INT_MAX,
+                            PGC_POSTMASTER,
+                            0, /* no flags required */
+                            NULL, NULL, NULL);
+
+    // set download_extension_file_hook
+    prev_download_extension_file_hook = download_extension_file_hook;
+    download_extension_file_hook = neon_download_extension_file_http;
+}
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -172,7 +172,7 @@ lfc_change_limit_hook(int newval, void *extra)
 	{
 		lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
 		if (lfc_desc < 0) {
-			elog(LOG, "Failed to open file cache %s: %m", lfc_path);
+			elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path);
 			lfc_size_limit = 0; /* disable file cache */
 			return;
 		}
@@ -557,7 +557,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 			Assert(victim->access_count == 0);
 			entry->offset = victim->offset; /* grab victim's chunk */
 			hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
-			elog(LOG, "Swap file cache page");
+			elog(DEBUG2, "Swap file cache page");
 		}
 		else
 		{
@@ -574,7 +574,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 	{
 		lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
 		if (lfc_desc < 0) {
-			elog(LOG, "Failed to open file cache %s: %m", lfc_path);
+			elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path);
 			lfc_size_limit = 0; /* disable file cache */
 		}
 	}
@@ -583,7 +583,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 		rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
 		if (rc != BLCKSZ)
 		{
-			elog(INFO, "Failed to write file cache: %m");
+			elog(WARNING, "Failed to write file cache: %m, disabling file cache");
 			lfc_size_limit = 0; /* disable file cache */
 		}
 	}
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -35,8 +35,11 @@ _PG_init(void)
 {
 	pg_init_libpagestore();
 	pg_init_walproposer();
+
 	InitControlPlaneConnector();

+	pg_init_extension_server();
+
        // Important: This must happen after other parts of the extension
        // are loaded, otherwise any settings to GUCs that were set before
        // the extension was loaded will be removed.
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -21,6 +21,8 @@ extern char *neon_tenant;
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);

+extern void pg_init_extension_server(void);
+
 /*
 * Returns true if we shouldn't do REDO on that block in record indicated by
 * block_id; false otherwise.
--- a/pgxn/neon/walproposer_utils.c
+++ b/pgxn/neon/walproposer_utils.c
@@ -37,68 +37,14 @@ static XLogSegNo walpropSegNo = 0;

 /* START cloned file-local variables and functions from walsender.c */

-/*
- * xlogreader used for replication.  Note that a WAL sender doing physical
- * replication does not need xlogreader to read WAL, but it needs one to
- * keep a state of its work.
- */
-static XLogReaderState *xlogreader = NULL;
-
-/*
- * These variables keep track of the state of the timeline we're currently
- * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric,
- * the timeline is not the latest timeline on this server, and the server's
- * history forked off from that timeline at sendTimeLineValidUpto.
- */
-static TimeLineID sendTimeLine = 0;
-static TimeLineID sendTimeLineNextTLI = 0;
-static bool sendTimeLineIsHistoric = false;
-static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr;
-
-/*
- * Timestamp of last ProcessRepliesIfAny() that saw a reply from the
- * standby. Set to 0 if wal_sender_timeout doesn't need to be active.
- */
-static TimestampTz last_reply_timestamp = 0;
-
-/* Have we sent a heartbeat message asking for reply, since last reply? */
-static bool waiting_for_ping_response = false;
-
-static bool streamingDoneSending;
-static bool streamingDoneReceiving;
-
-/* Are we there yet? */
-static bool WalSndCaughtUp = false;
-
-/* Flags set by signal handlers for later service in main loop */
-static volatile sig_atomic_t got_STOPPING = false;
-
 /*
 * How far have we sent WAL already? This is also advertised in
 * MyWalSnd->sentPtr.  (Actually, this is the next WAL location to send.)
 */
 static XLogRecPtr sentPtr = InvalidXLogRecPtr;

-/*
- * This is set while we are streaming. When not set
- * PROCSIG_WALSND_INIT_STOPPING signal will be handled like SIGTERM. When set,
- * the main loop is responsible for checking got_STOPPING and terminating when
- * it's set (after streaming any remaining WAL).
- */
-static volatile sig_atomic_t replication_active = false;
-
-typedef void (*WalSndSendDataCallback) (void);
-static void WalSndLoop(WalSndSendDataCallback send_data);
-static void XLogSendPhysical(void);
-#if PG_VERSION_NUM >= 150000
-static XLogRecPtr GetStandbyFlushRecPtr(TimeLineID *tli);
-#else
-static XLogRecPtr GetStandbyFlushRecPtr(void);
-#endif
-
-static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
-							  TimeLineID *tli_p);
-
+static void WalSndLoop(void);
+static void XLogBroadcastWalProposer(void);
 /* END cloned file-level variables and functions from walsender.c */

 int
@@ -506,7 +452,7 @@ XLogWalPropClose(XLogRecPtr recptr)
 /* START of cloned functions from walsender.c */

 /*
- * Handle START_REPLICATION command.
+ * Subscribe for new WAL and stream it in the loop to safekeepers.
 *
 * At the moment, this never returns, but an ereport(ERROR) will take us back
 * to the main loop.
@@ -524,18 +470,6 @@ StartProposerReplication(StartReplicationCmd *cmd)
 				 errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION")));
 #endif

-	/* create xlogreader for physical replication */
-	xlogreader =
-		XLogReaderAllocate(wal_segment_size, NULL,
-						   XL_ROUTINE(.segment_open = WalSndSegmentOpen,
-									  .segment_close = wal_segment_close),
-						   NULL);
-
-	if (!xlogreader)
-		ereport(ERROR,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("out of memory")));
-
 	/*
 	 * We assume here that we're logging enough information in the WAL for
 	 * log-shipping, since this is checked in PostmasterMain().
@@ -569,341 +503,61 @@ StartProposerReplication(StartReplicationCmd *cmd)
 	 * we keep this code around to lighten the load for when we need it.
 	 */
 #if PG_VERSION_NUM >= 150000
-	if (am_cascading_walsender)
-	{
-		/* this also updates ThisTimeLineID */
-		FlushPtr = GetStandbyFlushRecPtr(&currTLI);
-	}
-	else
-		FlushPtr = GetFlushRecPtr(&currTLI);
+	FlushPtr = GetFlushRecPtr(&currTLI);
 #else
-	if (am_cascading_walsender)
-	{
-		/* this also updates ThisTimeLineID */
-		FlushPtr = GetStandbyFlushRecPtr();
-	}
-	else
-		FlushPtr = GetFlushRecPtr();
-
+	FlushPtr = GetFlushRecPtr();
 	currTLI = ThisTimeLineID;
 #endif

+	/*
+	 * When we first start replication the standby will be behind the
+	 * primary. For some applications, for example synchronous
+	 * replication, it is important to have a clear state for this initial
+	 * catchup mode, so we can trigger actions when we change streaming
+	 * state later. We may stay in this state for a long time, which is
+	 * exactly why we want to be able to monitor whether or not we are
+	 * still here.
+	 */
+	WalSndSetState(WALSNDSTATE_CATCHUP);

-	if (cmd->timeline != 0)
+	/*
+	 * Don't allow a request to stream from a future point in WAL that
+	 * hasn't been flushed to disk in this server yet.
+	 */
+	if (FlushPtr < cmd->startpoint)
 	{
-		XLogRecPtr	switchpoint;
-
-		sendTimeLine = cmd->timeline;
-		if (sendTimeLine == currTLI)
-		{
-			sendTimeLineIsHistoric = false;
-			sendTimeLineValidUpto = InvalidXLogRecPtr;
-		}
-		else
-		{
-			List	   *timeLineHistory;
-
-			sendTimeLineIsHistoric = true;
-
-			/*
-			 * Check that the timeline the client requested exists, and the
-			 * requested start location is on that timeline.
-			 */
-			timeLineHistory = readTimeLineHistory(currTLI);
-			switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory,
-										 &sendTimeLineNextTLI);
-			list_free_deep(timeLineHistory);
-
-			/*
-			 * Found the requested timeline in the history. Check that
-			 * requested startpoint is on that timeline in our history.
-			 *
-			 * This is quite loose on purpose. We only check that we didn't
-			 * fork off the requested timeline before the switchpoint. We
-			 * don't check that we switched *to* it before the requested
-			 * starting point. This is because the client can legitimately
-			 * request to start replication from the beginning of the WAL
-			 * segment that contains switchpoint, but on the new timeline, so
-			 * that it doesn't end up with a partial segment. If you ask for
-			 * too old a starting point, you'll get an error later when we
-			 * fail to find the requested WAL segment in pg_wal.
-			 *
-			 * XXX: we could be more strict here and only allow a startpoint
-			 * that's older than the switchpoint, if it's still in the same
-			 * WAL segment.
-			 */
-			if (!XLogRecPtrIsInvalid(switchpoint) &&
-				switchpoint < cmd->startpoint)
-			{
-				ereport(ERROR,
-						(errmsg("requested starting point %X/%X on timeline %u is not in this server's history",
-								LSN_FORMAT_ARGS(cmd->startpoint),
-								cmd->timeline),
-						 errdetail("This server's history forked from timeline %u at %X/%X.",
-								   cmd->timeline,
-								   LSN_FORMAT_ARGS(switchpoint))));
-			}
-			sendTimeLineValidUpto = switchpoint;
-		}
-	}
-	else
-	{
-		sendTimeLine = currTLI;
-		sendTimeLineValidUpto = InvalidXLogRecPtr;
-		sendTimeLineIsHistoric = false;
+		ereport(ERROR,
+				(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
+						LSN_FORMAT_ARGS(cmd->startpoint),
+						LSN_FORMAT_ARGS(FlushPtr))));
 	}

-	streamingDoneSending = streamingDoneReceiving = false;
+	/* Start streaming from the requested point */
+	sentPtr = cmd->startpoint;

-	/* If there is nothing to stream, don't even enter COPY mode */
-	if (!sendTimeLineIsHistoric || cmd->startpoint < sendTimeLineValidUpto)
-	{
-		/*
-		 * When we first start replication the standby will be behind the
-		 * primary. For some applications, for example synchronous
-		 * replication, it is important to have a clear state for this initial
-		 * catchup mode, so we can trigger actions when we change streaming
-		 * state later. We may stay in this state for a long time, which is
-		 * exactly why we want to be able to monitor whether or not we are
-		 * still here.
-		 */
-		WalSndSetState(WALSNDSTATE_CATCHUP);
+	/* Initialize shared memory status, too */
+	SpinLockAcquire(&MyWalSnd->mutex);
+	MyWalSnd->sentPtr = sentPtr;
+	SpinLockRelease(&MyWalSnd->mutex);

-		/*
-		 * Don't allow a request to stream from a future point in WAL that
-		 * hasn't been flushed to disk in this server yet.
-		 */
-		if (FlushPtr < cmd->startpoint)
-		{
-			ereport(ERROR,
-					(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
-							LSN_FORMAT_ARGS(cmd->startpoint),
-							LSN_FORMAT_ARGS(FlushPtr))));
-		}
+	SyncRepInitConfig();

-		/* Start streaming from the requested point */
-		sentPtr = cmd->startpoint;
+	/* Infinite send loop, never returns */
+	WalSndLoop();

-		/* Initialize shared memory status, too */
-		SpinLockAcquire(&MyWalSnd->mutex);
-		MyWalSnd->sentPtr = sentPtr;
-		SpinLockRelease(&MyWalSnd->mutex);
-
-		SyncRepInitConfig();
-
-		/* Main loop of walsender */
-		replication_active = true;
-
-		WalSndLoop(XLogSendPhysical);
-
-		replication_active = false;
-		if (got_STOPPING)
-			proc_exit(0);
-		WalSndSetState(WALSNDSTATE_STARTUP);
-
-		Assert(streamingDoneSending && streamingDoneReceiving);
-	}
+	WalSndSetState(WALSNDSTATE_STARTUP);

 	if (cmd->slotname)
 		ReplicationSlotRelease();
-
-	/*
-	 * Copy is finished now. Send a single-row result set indicating the next
-	 * timeline.
-	 */
-	if (sendTimeLineIsHistoric)
-	{
-		char		startpos_str[8 + 1 + 8 + 1];
-		DestReceiver *dest;
-		TupOutputState *tstate;
-		TupleDesc	tupdesc;
-		Datum		values[2];
-		bool		nulls[2];
-
-		snprintf(startpos_str, sizeof(startpos_str), "%X/%X",
-				 LSN_FORMAT_ARGS(sendTimeLineValidUpto));
-
-		dest = CreateDestReceiver(DestRemoteSimple);
-		MemSet(nulls, false, sizeof(nulls));
-
-		/*
-		 * Need a tuple descriptor representing two columns. int8 may seem
-		 * like a surprising data type for this, but in theory int4 would not
-		 * be wide enough for this, as TimeLineID is unsigned.
-		 */
-		tupdesc = CreateTemplateTupleDesc(2);
-		TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "next_tli",
-								  INT8OID, -1, 0);
-		TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "next_tli_startpos",
-								  TEXTOID, -1, 0);
-
-		/* prepare for projection of tuple */
-		tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual);
-
-		values[0] = Int64GetDatum((int64) sendTimeLineNextTLI);
-		values[1] = CStringGetTextDatum(startpos_str);
-
-		/* send it to dest */
-		do_tup_output(tstate, values, nulls);
-
-		end_tup_output(tstate);
-	}
-
-	/* Send CommandComplete message */
-	EndReplicationCommand("START_STREAMING");
 }

-#if PG_VERSION_NUM >= 150000
-static XLogRecPtr
-GetStandbyFlushRecPtr(TimeLineID *tli)
-{
-	XLogRecPtr	replayPtr;
-	TimeLineID	replayTLI;
-	XLogRecPtr	receivePtr;
-	TimeLineID	receiveTLI;
-	XLogRecPtr	result;
-
-	/*
-	 * We can safely send what's already been replayed. Also, if walreceiver
-	 * is streaming WAL from the same timeline, we can send anything that it
-	 * has streamed, but hasn't been replayed yet.
-	 */
-
-	receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI);
-	replayPtr = GetXLogReplayRecPtr(&replayTLI);
-
-	*tli = replayTLI;
-
-	result = replayPtr;
-	if (receiveTLI == replayTLI && receivePtr > replayPtr)
-		result = receivePtr;
-
-	return result;
-}
-#else
 /*
- * Returns the latest point in WAL that has been safely flushed to disk, and
- * can be sent to the standby. This should only be called when in recovery,
- * ie. we're streaming to a cascaded standby.
- *
- * As a side-effect, ThisTimeLineID is updated to the TLI of the last
- * replayed WAL record.
+ * Main loop that waits for LSN updates and calls the walproposer.
+ * Synchronous replication sets latch in WalSndWakeup at walsender.c
 */
-static XLogRecPtr
-GetStandbyFlushRecPtr(void)
-{
-	XLogRecPtr	replayPtr;
-	TimeLineID	replayTLI;
-	XLogRecPtr	receivePtr;
-	TimeLineID	receiveTLI;
-	XLogRecPtr	result;
-
-	/*
-	 * We can safely send what's already been replayed. Also, if walreceiver
-	 * is streaming WAL from the same timeline, we can send anything that it
-	 * has streamed, but hasn't been replayed yet.
-	 */
-
-	receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI);
-	replayPtr = GetXLogReplayRecPtr(&replayTLI);
-
-	ThisTimeLineID = replayTLI;
-
-	result = replayPtr;
-	if (receiveTLI == ThisTimeLineID && receivePtr > replayPtr)
-		result = receivePtr;
-
-	return result;
-}
-#endif
-
-
-
-/* XLogReaderRoutine->segment_open callback */
 static void
-WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
-				  TimeLineID *tli_p)
+WalSndLoop(void)
 {
-	char		path[MAXPGPATH];
-
-	/*-------
-	 * When reading from a historic timeline, and there is a timeline switch
-	 * within this segment, read from the WAL segment belonging to the new
-	 * timeline.
-	 *
-	 * For example, imagine that this server is currently on timeline 5, and
-	 * we're streaming timeline 4. The switch from timeline 4 to 5 happened at
-	 * 0/13002088. In pg_wal, we have these files:
-	 *
-	 * ...
-	 * 000000040000000000000012
-	 * 000000040000000000000013
-	 * 000000050000000000000013
-	 * 000000050000000000000014
-	 * ...
-	 *
-	 * In this situation, when requested to send the WAL from segment 0x13, on
-	 * timeline 4, we read the WAL from file 000000050000000000000013. Archive
-	 * recovery prefers files from newer timelines, so if the segment was
-	 * restored from the archive on this server, the file belonging to the old
-	 * timeline, 000000040000000000000013, might not exist. Their contents are
-	 * equal up to the switchpoint, because at a timeline switch, the used
-	 * portion of the old segment is copied to the new file.  -------
-	 */
-	*tli_p = sendTimeLine;
-	if (sendTimeLineIsHistoric)
-	{
-		XLogSegNo	endSegNo;
-
-		XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize);
-		if (nextSegNo == endSegNo)
-			*tli_p = sendTimeLineNextTLI;
-	}
-
-	XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize);
-	state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
-	if (state->seg.ws_file >= 0)
-		return;
-
-	/*
-	 * If the file is not found, assume it's because the standby asked for a
-	 * too old WAL segment that has already been removed or recycled.
-	 */
-	if (errno == ENOENT)
-	{
-		char		xlogfname[MAXFNAMELEN];
-		int			save_errno = errno;
-
-		XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size);
-		errno = save_errno;
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("requested WAL segment %s has already been removed",
-						xlogfname)));
-	}
-	else
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not open file \"%s\": %m",
-						path)));
-}
-
-
-/* Main loop of walsender process that streams the WAL over Copy messages. */
-static void
-WalSndLoop(WalSndSendDataCallback send_data)
-{
-	/*
-	 * Initialize the last reply timestamp. That enables timeout processing
-	 * from hereon.
-	 */
-	last_reply_timestamp = GetCurrentTimestamp();
-	waiting_for_ping_response = false;
-
-	/*
-	 * Loop until we reach the end of this timeline or the client requests to
-	 * stop streaming.
-	 */
 	for (;;)
 	{
 		/* Clear any already-pending wakeups */
@@ -911,153 +565,41 @@ WalSndLoop(WalSndSendDataCallback send_data)

 		CHECK_FOR_INTERRUPTS();

-		/* Process any requests or signals received recently */
-		if (ConfigReloadPending)
-		{
-			ConfigReloadPending = false;
-			ProcessConfigFile(PGC_SIGHUP);
-			SyncRepInitConfig();
-		}
+		XLogBroadcastWalProposer();

-		/* always true */
-		if (am_wal_proposer)
-		{
-			send_data();
-			if (WalSndCaughtUp)
-			{
-				if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
-					WalSndSetState(WALSNDSTATE_STREAMING);
-				WalProposerPoll();
-				WalSndCaughtUp = false;
-			}
-			continue;
-		}
+		if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
+			WalSndSetState(WALSNDSTATE_STREAMING);
+		WalProposerPoll();
 	}
 }

 /*
- * Send out the WAL in its normal physical/stored form.
- *
- * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
- * but not yet sent to the client, and buffer it in the libpq output
- * buffer.
- *
- * If there is no unsent WAL remaining, WalSndCaughtUp is set to true,
- * otherwise WalSndCaughtUp is set to false.
+ * Notify walproposer about the new WAL position.
 */
 static void
-XLogSendPhysical(void)
+XLogBroadcastWalProposer(void)
 {
-	XLogRecPtr	SendRqstPtr;
 	XLogRecPtr	startptr;
 	XLogRecPtr	endptr;
-	Size		nbytes PG_USED_FOR_ASSERTS_ONLY;
-	TimeLineID	currTLI;

-	/* If requested switch the WAL sender to the stopping state. */
-	if (got_STOPPING)
-		WalSndSetState(WALSNDSTATE_STOPPING);
+	/* Start from the last sent position */
+	startptr = sentPtr;

-	if (streamingDoneSending)
-	{
-		WalSndCaughtUp = true;
-		return;
-	}
-
-	/* Figure out how far we can safely send the WAL. */
-	if (sendTimeLineIsHistoric)
-	{
-		/*
-		 * Streaming an old timeline that's in this server's history, but is
-		 * not the one we're currently inserting or replaying. It can be
-		 * streamed up to the point where we switched off that timeline.
-		 */
-		SendRqstPtr = sendTimeLineValidUpto;
-	}
-	else if (am_cascading_walsender)
-	{
-		/*
-		 * Streaming the latest timeline on a standby.
-		 *
-		 * Attempt to send all WAL that has already been replayed, so that we
-		 * know it's valid. If we're receiving WAL through streaming
-		 * replication, it's also OK to send any WAL that has been received
-		 * but not replayed.
-		 *
-		 * The timeline we're recovering from can change, or we can be
-		 * promoted. In either case, the current timeline becomes historic. We
-		 * need to detect that so that we don't try to stream past the point
-		 * where we switched to another timeline. We check for promotion or
-		 * timeline switch after calculating FlushPtr, to avoid a race
-		 * condition: if the timeline becomes historic just after we checked
-		 * that it was still current, it's still be OK to stream it up to the
-		 * FlushPtr that was calculated before it became historic.
-		 */
-		bool		becameHistoric = false;
+	/*
+	 * Streaming the current timeline on a primary.
+	 *
+	 * Attempt to send all data that's already been written out and
+	 * fsync'd to disk.  We cannot go further than what's been written out
+	 * given the current implementation of WALRead().  And in any case
+	 * it's unsafe to send WAL that is not securely down to disk on the
+	 * primary: if the primary subsequently crashes and restarts, standbys
+	 * must not have applied any WAL that got lost on the primary.
+	 */
 #if PG_VERSION_NUM >= 150000
-		SendRqstPtr = GetStandbyFlushRecPtr(&currTLI);
+	endptr = GetFlushRecPtr(NULL);
 #else
-		SendRqstPtr = GetStandbyFlushRecPtr();
-		currTLI = ThisTimeLineID;
+	endptr = GetFlushRecPtr();
 #endif
-		if (!RecoveryInProgress())
-		{
-			/*
-			 * We have been promoted. RecoveryInProgress() updated
-			 * ThisTimeLineID to the new current timeline.
-			 */
-			am_cascading_walsender = false;
-			becameHistoric = true;
-		}
-		else
-		{
-			/*
-			 * Still a cascading standby. But is the timeline we're sending
-			 * still the one recovery is recovering from? currTLI was updated
-			 * by the GetStandbyFlushRecPtr() call above.
-			 */
-			if (sendTimeLine != currTLI)
-				becameHistoric = true;
-		}
-
-		if (becameHistoric)
-		{
-			/*
-			 * The timeline we were sending has become historic. Read the
-			 * timeline history file of the new timeline to see where exactly
-			 * we forked off from the timeline we were sending.
-			 */
-			List	   *history;
-
-			history = readTimeLineHistory(currTLI);
-			sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI);
-
-			Assert(sendTimeLine < sendTimeLineNextTLI);
-			list_free_deep(history);
-
-			sendTimeLineIsHistoric = true;
-
-			SendRqstPtr = sendTimeLineValidUpto;
-		}
-	}
-	else
-	{
-		/*
-		 * Streaming the current timeline on a primary.
-		 *
-		 * Attempt to send all data that's already been written out and
-		 * fsync'd to disk.  We cannot go further than what's been written out
-		 * given the current implementation of WALRead().  And in any case
-		 * it's unsafe to send WAL that is not securely down to disk on the
-		 * primary: if the primary subsequently crashes and restarts, standbys
-		 * must not have applied any WAL that got lost on the primary.
-		 */
-#if PG_VERSION_NUM >= 150000
-		SendRqstPtr = GetFlushRecPtr(NULL);
-#else
-		SendRqstPtr = GetFlushRecPtr();
-#endif
-	}

 	/*
 	 * Record the current system time as an approximation of the time at which
@@ -1083,91 +625,14 @@ XLogSendPhysical(void)
 	 * that arbitrary LSN is eventually reported as written, flushed and
 	 * applied, so that it can measure the elapsed time.
 	 */
-	LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp());
-
-	/*
-	 * If this is a historic timeline and we've reached the point where we
-	 * forked to the next timeline, stop streaming.
-	 *
-	 * Note: We might already have sent WAL > sendTimeLineValidUpto. The
-	 * startup process will normally replay all WAL that has been received
-	 * from the primary, before promoting, but if the WAL streaming is
-	 * terminated at a WAL page boundary, the valid portion of the timeline
-	 * might end in the middle of a WAL record. We might've already sent the
-	 * first half of that partial WAL record to the cascading standby, so that
-	 * sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby can't
-	 * replay the partial WAL record either, so it can still follow our
-	 * timeline switch.
-	 */
-	if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr)
-	{
-		/* close the current file. */
-		if (xlogreader->seg.ws_file >= 0)
-			wal_segment_close(xlogreader);
-
-		/* Send CopyDone */
-		pq_putmessage_noblock('c', NULL, 0);
-		streamingDoneSending = true;
-
-		WalSndCaughtUp = true;
-
-		elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)",
-			 LSN_FORMAT_ARGS(sendTimeLineValidUpto),
-			 LSN_FORMAT_ARGS(sentPtr));
-		return;
-	}
+	LagTrackerWrite(endptr, GetCurrentTimestamp());

 	/* Do we have any work to do? */
-	Assert(sentPtr <= SendRqstPtr);
-	if (SendRqstPtr <= sentPtr)
-	{
-		WalSndCaughtUp = true;
+	Assert(startptr <= endptr);
+	if (endptr <= startptr)
 		return;
-	}

-	/*
-	 * Figure out how much to send in one message. If there's no more than
-	 * MAX_SEND_SIZE bytes to send, send everything. Otherwise send
-	 * MAX_SEND_SIZE bytes, but round back to logfile or page boundary.
-	 *
-	 * The rounding is not only for performance reasons. Walreceiver relies on
-	 * the fact that we never split a WAL record across two messages. Since a
-	 * long WAL record is split at page boundary into continuation records,
-	 * page boundary is always a safe cut-off point. We also assume that
-	 * SendRqstPtr never points to the middle of a WAL record.
-	 */
-	startptr = sentPtr;
-	endptr = startptr;
-	endptr += MAX_SEND_SIZE;
-
-	/* if we went beyond SendRqstPtr, back off */
-	if (SendRqstPtr <= endptr)
-	{
-		endptr = SendRqstPtr;
-		if (sendTimeLineIsHistoric)
-			WalSndCaughtUp = false;
-		else
-			WalSndCaughtUp = true;
-	}
-	else
-	{
-		/* round down to page boundary. */
-		endptr -= (endptr % XLOG_BLCKSZ);
-		WalSndCaughtUp = false;
-	}
-
-	nbytes = endptr - startptr;
-	Assert(nbytes <= MAX_SEND_SIZE);
-
-	/* always true */
-	if (am_wal_proposer)
-	{
-		WalProposerBroadcast(startptr, endptr);
-	}
-	else
-	{
-		/* code removed for brevity */
-	}
+	WalProposerBroadcast(startptr, endptr);
 	sentPtr = endptr;

 	/* Update shared memory status */
--- a/poetry.lock
+++ b/poetry.lock
@@ -887,34 +887,34 @@ files = [

 [[package]]
 name = "cryptography"
-version = "41.0.2"
+version = "41.0.3"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "cryptography-41.0.2-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:01f1d9e537f9a15b037d5d9ee442b8c22e3ae11ce65ea1f3316a41c78756b711"},
-    {file = "cryptography-41.0.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:079347de771f9282fbfe0e0236c716686950c19dee1b76240ab09ce1624d76d7"},
-    {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:439c3cc4c0d42fa999b83ded80a9a1fb54d53c58d6e59234cfe97f241e6c781d"},
-    {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f14ad275364c8b4e525d018f6716537ae7b6d369c094805cae45300847e0894f"},
-    {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:84609ade00a6ec59a89729e87a503c6e36af98ddcd566d5f3be52e29ba993182"},
-    {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:49c3222bb8f8e800aead2e376cbef687bc9e3cb9b58b29a261210456a7783d83"},
-    {file = "cryptography-41.0.2-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:d73f419a56d74fef257955f51b18d046f3506270a5fd2ac5febbfa259d6c0fa5"},
-    {file = "cryptography-41.0.2-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:2a034bf7d9ca894720f2ec1d8b7b5832d7e363571828037f9e0c4f18c1b58a58"},
-    {file = "cryptography-41.0.2-cp37-abi3-win32.whl", hash = "sha256:d124682c7a23c9764e54ca9ab5b308b14b18eba02722b8659fb238546de83a76"},
-    {file = "cryptography-41.0.2-cp37-abi3-win_amd64.whl", hash = "sha256:9c3fe6534d59d071ee82081ca3d71eed3210f76ebd0361798c74abc2bcf347d4"},
-    {file = "cryptography-41.0.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a719399b99377b218dac6cf547b6ec54e6ef20207b6165126a280b0ce97e0d2a"},
-    {file = "cryptography-41.0.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:182be4171f9332b6741ee818ec27daff9fb00349f706629f5cbf417bd50e66fd"},
-    {file = "cryptography-41.0.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7a9a3bced53b7f09da251685224d6a260c3cb291768f54954e28f03ef14e3766"},
-    {file = "cryptography-41.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f0dc40e6f7aa37af01aba07277d3d64d5a03dc66d682097541ec4da03cc140ee"},
-    {file = "cryptography-41.0.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:674b669d5daa64206c38e507808aae49904c988fa0a71c935e7006a3e1e83831"},
-    {file = "cryptography-41.0.2-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7af244b012711a26196450d34f483357e42aeddb04128885d95a69bd8b14b69b"},
-    {file = "cryptography-41.0.2-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9b6d717393dbae53d4e52684ef4f022444fc1cce3c48c38cb74fca29e1f08eaa"},
-    {file = "cryptography-41.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:192255f539d7a89f2102d07d7375b1e0a81f7478925b3bc2e0549ebf739dae0e"},
-    {file = "cryptography-41.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f772610fe364372de33d76edcd313636a25684edb94cee53fd790195f5989d14"},
-    {file = "cryptography-41.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:b332cba64d99a70c1e0836902720887fb4529ea49ea7f5462cf6640e095e11d2"},
-    {file = "cryptography-41.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9a6673c1828db6270b76b22cc696f40cde9043eb90373da5c2f8f2158957f42f"},
-    {file = "cryptography-41.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:342f3767e25876751e14f8459ad85e77e660537ca0a066e10e75df9c9e9099f0"},
-    {file = "cryptography-41.0.2.tar.gz", hash = "sha256:7d230bf856164de164ecb615ccc14c7fc6de6906ddd5b491f3af90d3514c925c"},
+    {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:652627a055cb52a84f8c448185922241dd5217443ca194d5739b44612c5e6507"},
+    {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8f09daa483aedea50d249ef98ed500569841d6498aa9c9f4b0531b9964658922"},
+    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fd871184321100fb400d759ad0cddddf284c4b696568204d281c902fc7b0d81"},
+    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84537453d57f55a50a5b6835622ee405816999a7113267739a1b4581f83535bd"},
+    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3fb248989b6363906827284cd20cca63bb1a757e0a2864d4c1682a985e3dca47"},
+    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:42cb413e01a5d36da9929baa9d70ca90d90b969269e5a12d39c1e0d475010116"},
+    {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:aeb57c421b34af8f9fe830e1955bf493a86a7996cc1338fe41b30047d16e962c"},
+    {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6af1c6387c531cd364b72c28daa29232162010d952ceb7e5ca8e2827526aceae"},
+    {file = "cryptography-41.0.3-cp37-abi3-win32.whl", hash = "sha256:0d09fb5356f975974dbcb595ad2d178305e5050656affb7890a1583f5e02a306"},
+    {file = "cryptography-41.0.3-cp37-abi3-win_amd64.whl", hash = "sha256:a983e441a00a9d57a4d7c91b3116a37ae602907a7618b882c8013b5762e80574"},
+    {file = "cryptography-41.0.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5259cb659aa43005eb55a0e4ff2c825ca111a0da1814202c64d28a985d33b087"},
+    {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:67e120e9a577c64fe1f611e53b30b3e69744e5910ff3b6e97e935aeb96005858"},
+    {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7efe8041897fe7a50863e51b77789b657a133c75c3b094e51b5e4b5cec7bf906"},
+    {file = "cryptography-41.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce785cf81a7bdade534297ef9e490ddff800d956625020ab2ec2780a556c313e"},
+    {file = "cryptography-41.0.3-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:57a51b89f954f216a81c9d057bf1a24e2f36e764a1ca9a501a6964eb4a6800dd"},
+    {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c2f0d35703d61002a2bbdcf15548ebb701cfdd83cdc12471d2bae80878a4207"},
+    {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:23c2d778cf829f7d0ae180600b17e9fceea3c2ef8b31a99e3c694cbbf3a24b84"},
+    {file = "cryptography-41.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:95dd7f261bb76948b52a5330ba5202b91a26fbac13ad0e9fc8a3ac04752058c7"},
+    {file = "cryptography-41.0.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:41d7aa7cdfded09b3d73a47f429c298e80796c8e825ddfadc84c8a7f12df212d"},
+    {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d0d651aa754ef58d75cec6edfbd21259d93810b73f6ec246436a21b7841908de"},
+    {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ab8de0d091acbf778f74286f4989cf3d1528336af1b59f3e5d2ebca8b5fe49e1"},
+    {file = "cryptography-41.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a74fbcdb2a0d46fe00504f571a2a540532f4c188e6ccf26f1f178480117b33c4"},
+    {file = "cryptography-41.0.3.tar.gz", hash = "sha256:6d192741113ef5e30d89dcb5b956ef4e1578f304708701b8b73d38e3e1461f34"},
 ]

 [package.dependencies]
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -5,12 +5,12 @@ use crate::{
    auth::{self, AuthFlow, ClientCredentials},
    compute,
    console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
-    proxy::handle_try_wake,
+    proxy::{handle_try_wake, retry_after},
    sasl, scram,
    stream::PqStream,
 };
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::info;
+use tracing::{error, info, warn};

 pub(super) async fn authenticate(
    api: &impl console::Api,
@@ -55,11 +55,20 @@ pub(super) async fn authenticate(
    let mut num_retries = 0;
    let mut node = loop {
        let wake_res = api.wake_compute(extra, creds).await;
-        match handle_try_wake(wake_res, num_retries)? {
-            ControlFlow::Continue(_) => num_retries += 1,
-            ControlFlow::Break(n) => break n,
+        match handle_try_wake(wake_res, num_retries) {
+            Err(e) => {
+                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
+                return Err(e.into());
+            }
+            Ok(ControlFlow::Continue(e)) => {
+                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
+            }
+            Ok(ControlFlow::Break(n)) => break n,
        }
-        info!(num_retries, "retrying wake compute");
+
+        let wait_duration = retry_after(num_retries);
+        num_retries += 1;
+        tokio::time::sleep(wait_duration).await;
    };
    if let Some(keys) = scram_keys {
        use tokio_postgres::config::AuthKeys;
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -230,7 +230,8 @@ pub struct PostgresConnection {
 }

 impl ConnCfg {
-    async fn do_connect(
+    /// Connect to a corresponding compute node.
+    pub async fn connect(
        &self,
        allow_self_signed_compute: bool,
        timeout: Duration,
@@ -270,20 +271,6 @@ impl ConnCfg {

        Ok(connection)
    }
-
-    /// Connect to a corresponding compute node.
-    pub async fn connect(
-        &self,
-        allow_self_signed_compute: bool,
-        timeout: Duration,
-    ) -> Result<PostgresConnection, ConnectionError> {
-        self.do_connect(allow_self_signed_compute, timeout)
-            .inspect_err(|err| {
-                // Immediately log the error we have at our disposal.
-                error!("couldn't connect to compute node: {err}");
-            })
-            .await
-    }
 }

 /// Retrieve `options` from a startup message, dropping all proxy-secific flags.
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -8,6 +8,7 @@ use super::{
 use crate::{auth::ClientCredentials, compute, http, scram};
 use async_trait::async_trait;
 use futures::TryFutureExt;
+use tokio::time::Instant;
 use tokio_postgres::config::SslMode;
 use tracing::{error, info, info_span, warn, Instrument};

@@ -47,7 +48,9 @@ impl Api {
                .build()?;

            info!(url = request.url().as_str(), "sending http request");
+            let start = Instant::now();
            let response = self.endpoint.execute(request).await?;
+            info!(duration = ?start.elapsed(), "received http response");
            let body = match parse_body::<GetRoleSecret>(response).await {
                Ok(body) => body,
                // Error 404 is special: it's ok not to have a secret.
@@ -88,7 +91,9 @@ impl Api {
                .build()?;

            info!(url = request.url().as_str(), "sending http request");
+            let start = Instant::now();
            let response = self.endpoint.execute(request).await?;
+            info!(duration = ?start.elapsed(), "received http response");
            let body = parse_body::<WakeCompute>(response).await?;

            // Unfortunately, ownership won't let us use `Option::ok_or` here.
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -7,11 +7,14 @@ pub mod server;
 pub mod sql_over_http;
 pub mod websocket;

-use std::time::Duration;
+use std::{sync::Arc, time::Duration};

+use futures::FutureExt;
 pub use reqwest::{Request, Response, StatusCode};
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
+use tokio::time::Instant;
+use tracing::trace;

 use crate::url::ApiUrl;
 use reqwest_middleware::RequestBuilder;
@@ -20,13 +23,21 @@ use reqwest_middleware::RequestBuilder;
 /// because it takes care of observability (OpenTelemetry).
 /// We deliberately don't want to replace this with a public static.
 pub fn new_client() -> ClientWithMiddleware {
-    reqwest_middleware::ClientBuilder::new(reqwest::Client::new())
+    let client = reqwest::ClientBuilder::new()
+        .dns_resolver(Arc::new(GaiResolver::default()))
+        .connection_verbose(true)
+        .build()
+        .expect("Failed to create http client");
+
+    reqwest_middleware::ClientBuilder::new(client)
        .with(reqwest_tracing::TracingMiddleware::default())
        .build()
 }

 pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
    let timeout_client = reqwest::ClientBuilder::new()
+        .dns_resolver(Arc::new(GaiResolver::default()))
+        .connection_verbose(true)
        .timeout(default_timout)
        .build()
        .expect("Failed to create http client with timeout");
@@ -39,6 +50,10 @@ pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware
        // As per docs, "This middleware always errors when given requests with streaming bodies".
        // That's all right because we only use this client to send `serde_json::RawValue`, which
        // is not a stream.
+        //
+        // ex-maintainer note:
+        // this limitation can be fixed if streaming is necessary.
+        // retries will still not be performed, but it wont error immediately
        .with(RetryTransientMiddleware::new_with_policy(retry_policy))
        .build()
 }
@@ -81,6 +96,37 @@ impl Endpoint {
    }
 }

+/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html
+use hyper::{
+    client::connect::dns::{GaiResolver as HyperGaiResolver, Name},
+    service::Service,
+};
+use reqwest::dns::{Addrs, Resolve, Resolving};
+#[derive(Debug)]
+pub struct GaiResolver(HyperGaiResolver);
+
+impl Default for GaiResolver {
+    fn default() -> Self {
+        Self(HyperGaiResolver::new())
+    }
+}
+
+impl Resolve for GaiResolver {
+    fn resolve(&self, name: Name) -> Resolving {
+        let this = &mut self.0.clone();
+        let start = Instant::now();
+        Box::pin(
+            Service::<Name>::call(this, name.clone()).map(move |result| {
+                let resolve_duration = start.elapsed();
+                trace!(duration = ?resolve_duration, addr = %name, "resolve host complete");
+                result
+                    .map(|addrs| -> Addrs { Box::new(addrs) })
+                    .map_err(|err| -> Box<dyn std::error::Error + Send + Sync> { Box::new(err) })
+            }),
+        )
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -27,21 +27,26 @@ struct QueryData {
    params: Vec<serde_json::Value>,
 }

+#[derive(serde::Deserialize)]
+struct BatchQueryData {
+    queries: Vec<QueryData>,
+}
+
 #[derive(serde::Deserialize)]
 #[serde(untagged)]
 enum Payload {
    Single(QueryData),
-    Batch(Vec<QueryData>),
+    Batch(BatchQueryData),
 }

-pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
+pub const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MB
 const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB

 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
-static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in");
 static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level");
 static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only");
+static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrable");

 static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");

@@ -188,9 +193,9 @@ pub async fn handle(
    let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);

    // Allow connection pooling only if explicitly requested
-    let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
+    let allow_pool = false;

-    // isolation level and read only
+    // isolation level, read only and deferrable

    let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned();
    let txn_isolation_level = match txn_isolation_level_raw {
@@ -204,8 +209,8 @@ pub async fn handle(
        None => None,
    };

-    let txn_read_only_raw = headers.get(&TXN_READ_ONLY).cloned();
-    let txn_read_only = txn_read_only_raw.as_ref() == Some(&HEADER_VALUE_TRUE);
+    let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
+    let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);

    let request_content_length = match request.body().size_hint().upper() {
        Some(v) => v,
@@ -214,7 +219,7 @@ pub async fn handle(

    if request_content_length > MAX_REQUEST_SIZE {
        return Err(anyhow::anyhow!(
-            "request is too large (max {MAX_REQUEST_SIZE} bytes)"
+            "request is too large (max is {MAX_REQUEST_SIZE} bytes)"
        ));
    }

@@ -233,7 +238,7 @@ pub async fn handle(
        Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode)
            .await
            .map(|x| (x, HashMap::default())),
-        Payload::Batch(queries) => {
+        Payload::Batch(batch_query) => {
            let mut results = Vec::new();
            let mut builder = client.build_transaction();
            if let Some(isolation_level) = txn_isolation_level {
@@ -242,8 +247,11 @@ pub async fn handle(
            if txn_read_only {
                builder = builder.read_only(true);
            }
+            if txn_deferrable {
+                builder = builder.deferrable(true);
+            }
            let transaction = builder.start().await?;
-            for query in queries {
+            for query in batch_query.queries {
                let result = query_to_json(&transaction, query, raw_output, array_mode).await;
                match result {
                    Ok(r) => results.push(r),
@@ -255,12 +263,20 @@ pub async fn handle(
            }
            transaction.commit().await?;
            let mut headers = HashMap::default();
-            headers.insert(
-                TXN_READ_ONLY.clone(),
-                HeaderValue::try_from(txn_read_only.to_string())?,
-            );
-            if let Some(txn_isolation_level_raw) = txn_isolation_level_raw {
-                headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level_raw);
+            if txn_read_only {
+                headers.insert(
+                    TXN_READ_ONLY.clone(),
+                    HeaderValue::try_from(txn_read_only.to_string())?,
+                );
+            }
+            if txn_deferrable {
+                headers.insert(
+                    TXN_DEFERRABLE.clone(),
+                    HeaderValue::try_from(txn_deferrable.to_string())?,
+                );
+            }
+            if let Some(txn_isolation_level) = txn_isolation_level_raw {
+                headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
            }
            Ok((json!({ "results": results }), headers))
        }
@@ -292,13 +308,15 @@ async fn query_to_json<T: GenericClient>(
    // big.
    pin_mut!(row_stream);
    let mut rows: Vec<tokio_postgres::Row> = Vec::new();
-    let mut curret_size = 0;
+    let mut current_size = 0;
    while let Some(row) = row_stream.next().await {
        let row = row?;
-        curret_size += row.body_len();
+        current_size += row.body_len();
        rows.push(row);
-        if curret_size > MAX_RESPONSE_SIZE {
-            return Err(anyhow::anyhow!("response too large"));
+        if current_size > MAX_RESPONSE_SIZE {
+            return Err(anyhow::anyhow!(
+                "response is too large (max is {MAX_RESPONSE_SIZE} bytes)"
+            ));
        }
    }

--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -187,12 +187,16 @@ async fn ws_handler(
        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
            .map_err(|e| ApiError::BadRequest(e.into()))?;

-        tokio::spawn(async move {
-            if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
-            {
-                error!(session_id = ?session_id, "error in websocket connection: {e:?}");
+        tokio::spawn(
+            async move {
+                if let Err(e) =
+                    serve_websocket(websocket, config, &cancel_map, session_id, host).await
+                {
+                    error!(session_id = ?session_id, "error in websocket connection: {e:#}");
+                }
            }
-        });
+            .in_current_span(),
+        );

        // Return the response so the spawned future can continue.
        Ok(response)
@@ -217,6 +221,10 @@ async fn ws_handler(
                    },
                    None => Value::Null,
                };
+                error!(
+                    ?code,
+                    "sql-over-http per-client task finished with an error: {e:#}"
+                );
                (
                    json!({ "message": message, "code": code }),
                    HashMap::default(),
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -23,7 +23,7 @@ use tokio::{
    time,
 };
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, warn};
+use tracing::{error, info, info_span, warn, Instrument};
 use utils::measured_stream::MeasuredStream;

 /// Number of times we should retry the `/proxy_wake_compute` http request.
@@ -101,21 +101,20 @@ pub async fn task_main(
        tokio::select! {
            accept_result = listener.accept() => {
                let (socket, peer_addr) = accept_result?;
-                info!("accepted postgres client connection from {peer_addr}");

                let session_id = uuid::Uuid::new_v4();
                let cancel_map = Arc::clone(&cancel_map);
                connections.spawn(
                    async move {
-                        info!("spawned a task for {peer_addr}");
+                        info!("accepted postgres client connection");

                        socket
                            .set_nodelay(true)
                            .context("failed to set socket option")?;

-                        handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp)
-                        .await
+                        handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp).await
                    }
+                    .instrument(info_span!("handle_client", ?session_id, %peer_addr))
                    .unwrap_or_else(move |e| {
                        // Acknowledge that the task has finished with an error.
                        error!(?session_id, "per-client task finished with an error: {e:#}");
@@ -183,7 +182,6 @@ impl ClientMode {
    }
 }

-#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
 pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    config: &'static ProxyConfig,
    cancel_map: &CancelMap,
@@ -425,11 +423,17 @@ where
            auth::BackendType::Test(x) => x.wake_compute(),
        };

-        match handle_try_wake(wake_res, num_retries)? {
+        match handle_try_wake(wake_res, num_retries) {
+            Err(e) => {
+                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
+                return Err(e.into());
+            }
            // failed to wake up but we can continue to retry
-            ControlFlow::Continue(_) => {}
+            Ok(ControlFlow::Continue(e)) => {
+                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
+            }
            // successfully woke up a compute node and can break the wakeup loop
-            ControlFlow::Break(mut node_info) => {
+            Ok(ControlFlow::Break(mut node_info)) => {
                node_info.config.reuse_password(&config);
                mechanism.update_connect_config(&mut node_info.config);
                break node_info;
@@ -440,7 +444,6 @@ where
        num_retries += 1;

        time::sleep(wait_duration).await;
-        info!(num_retries, "retrying wake compute");
    };

    // now that we have a new node, try connect to it repeatedly.
@@ -451,10 +454,12 @@ where
        match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
            Ok(res) => return Ok(res),
            Err(e) => {
-                error!(error = ?e, "could not connect to compute node");
-                if !e.should_retry(num_retries) {
+                let retriable = e.should_retry(num_retries);
+                if !retriable {
+                    error!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
                    return Err(e.into());
                }
+                warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
            }
        }

@@ -462,7 +467,6 @@ where
        num_retries += 1;

        time::sleep(wait_duration).await;
-        info!(num_retries, "retrying connect_once");
    }
 }

@@ -541,7 +545,7 @@ impl ShouldRetry for compute::ConnectionError {
    }
 }

-fn retry_after(num_retries: u32) -> time::Duration {
+pub fn retry_after(num_retries: u32) -> time::Duration {
    // 1.5 seems to be an ok growth factor heuristic
    BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
 }
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -15,6 +15,7 @@ use toml_edit::Document;
 use std::fs::{self, File};
 use std::io::{ErrorKind, Write};
 use std::path::{Path, PathBuf};
+use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
 use storage_broker::Uri;
@@ -79,6 +80,10 @@ struct Args {
    /// Listen http endpoint for management and metrics in the form host:port.
    #[arg(long, default_value = DEFAULT_HTTP_LISTEN_ADDR)]
    listen_http: String,
+    /// Advertised endpoint for receiving/sending WAL in the form host:port. If not
+    /// specified, listen_pg is used to advertise instead.
+    #[arg(long, default_value = None)]
+    advertise_pg: Option<String>,
    /// Availability zone of the safekeeper.
    #[arg(long)]
    availability_zone: Option<String>,
@@ -118,9 +123,24 @@ struct Args {
    /// WAL backup horizon.
    #[arg(long)]
    disable_wal_backup: bool,
-    /// Path to a .pem public key which is used to check JWT tokens.
-    #[arg(long)]
-    auth_validation_public_key_path: Option<PathBuf>,
+    /// If given, enables auth on incoming connections to WAL service endpoint
+    /// (--listen-pg). Value specifies path to a .pem public key used for
+    /// validations of JWT tokens. Empty string is allowed and means disabling
+    /// auth.
+    #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
+    pg_auth_public_key_path: Option<PathBuf>,
+    /// If given, enables auth on incoming connections to tenant only WAL
+    /// service endpoint (--listen-pg-tenant-only). Value specifies path to a
+    /// .pem public key used for validations of JWT tokens. Empty string is
+    /// allowed and means disabling auth.
+    #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
+    pg_tenant_only_auth_public_key_path: Option<PathBuf>,
+    /// If given, enables auth on incoming connections to http management
+    /// service endpoint (--listen-http). Value specifies path to a .pem public
+    /// key used for validations of JWT tokens. Empty string is allowed and
+    /// means disabling auth.
+    #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
+    http_auth_public_key_path: Option<PathBuf>,
    /// Format for logging, either 'plain' or 'json'.
    #[arg(long, default_value = "plain")]
    log_format: String,
@@ -130,9 +150,39 @@ struct Args {
    current_thread_runtime: bool,
 }

+// Like PathBufValueParser, but allows empty string.
+fn opt_pathbuf_parser(s: &str) -> Result<PathBuf, String> {
+    Ok(PathBuf::from_str(s).unwrap())
+}
+
 #[tokio::main(flavor = "current_thread")]
 async fn main() -> anyhow::Result<()> {
-    let args = Args::parse();
+    // We want to allow multiple occurences of the same arg (taking the last) so
+    // that neon_local could generate command with defaults + overrides without
+    // getting 'argument cannot be used multiple times' error. This seems to be
+    // impossible with pure Derive API, so convert struct to Command, modify it,
+    // parse arguments, and then fill the struct back.
+    let cmd = <Args as clap::CommandFactory>::command().args_override_self(true);
+    let mut matches = cmd.get_matches();
+    let mut args = <Args as clap::FromArgMatches>::from_arg_matches_mut(&mut matches)?;
+
+    // I failed to modify opt_pathbuf_parser to return Option<PathBuf> in
+    // reasonable time, so turn empty string into option post factum.
+    if let Some(pb) = &args.pg_auth_public_key_path {
+        if pb.as_os_str().is_empty() {
+            args.pg_auth_public_key_path = None;
+        }
+    }
+    if let Some(pb) = &args.pg_tenant_only_auth_public_key_path {
+        if pb.as_os_str().is_empty() {
+            args.pg_tenant_only_auth_public_key_path = None;
+        }
+    }
+    if let Some(pb) = &args.http_auth_public_key_path {
+        if pb.as_os_str().is_empty() {
+            args.http_auth_public_key_path = None;
+        }
+    }

    if let Some(addr) = args.dump_control_file {
        let state = control_file::FileStorage::load_control_file(addr)?;
@@ -166,13 +216,40 @@ async fn main() -> anyhow::Result<()> {
        return Ok(());
    }

-    let auth = match args.auth_validation_public_key_path.as_ref() {
+    let pg_auth = match args.pg_auth_public_key_path.as_ref() {
        None => {
-            info!("auth is disabled");
+            info!("pg auth is disabled");
            None
        }
        Some(path) => {
-            info!("loading JWT auth key from {}", path.display());
+            info!("loading pg auth JWT key from {}", path.display());
+            Some(Arc::new(
+                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
+            ))
+        }
+    };
+    let pg_tenant_only_auth = match args.pg_tenant_only_auth_public_key_path.as_ref() {
+        None => {
+            info!("pg tenant only auth is disabled");
+            None
+        }
+        Some(path) => {
+            info!(
+                "loading pg tenant only auth JWT key from {}",
+                path.display()
+            );
+            Some(Arc::new(
+                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
+            ))
+        }
+    };
+    let http_auth = match args.http_auth_public_key_path.as_ref() {
+        None => {
+            info!("http auth is disabled");
+            None
+        }
+        Some(path) => {
+            info!("loading http auth JWT key from {}", path.display());
            Some(Arc::new(
                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
            ))
@@ -185,6 +262,7 @@ async fn main() -> anyhow::Result<()> {
        listen_pg_addr: args.listen_pg,
        listen_pg_addr_tenant_only: args.listen_pg_tenant_only,
        listen_http_addr: args.listen_http,
+        advertise_pg_addr: args.advertise_pg,
        availability_zone: args.availability_zone,
        no_sync: args.no_sync,
        broker_endpoint: args.broker_endpoint,
@@ -194,7 +272,9 @@ async fn main() -> anyhow::Result<()> {
        max_offloader_lag_bytes: args.max_offloader_lag,
        wal_backup_enabled: !args.disable_wal_backup,
        backup_parallel_jobs: args.wal_backup_parallel_jobs,
-        auth,
+        pg_auth,
+        pg_tenant_only_auth,
+        http_auth,
        current_thread_runtime: args.current_thread_runtime,
    };

@@ -283,7 +363,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
        .spawn(wal_service::task_main(
            conf_,
            pg_listener,
-            Some(Scope::SafekeeperData),
+            Scope::SafekeeperData,
        ))
        // wrap with task name for error reporting
        .map(|res| ("WAL service main".to_owned(), res));
@@ -297,7 +377,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
            .spawn(wal_service::task_main(
                conf_,
                pg_listener_tenant_only,
-                Some(Scope::Tenant),
+                Scope::Tenant,
            ))
            // wrap with task name for error reporting
            .map(|res| ("WAL service tenant only main".to_owned(), res));
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -2,8 +2,9 @@
 //! protocol commands.

 use anyhow::Context;
-use std::str;
 use std::str::FromStr;
+use std::str::{self};
+use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, info_span, Instrument};

@@ -11,6 +12,7 @@ use crate::auth::check_permission;
 use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};

 use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED};
+use crate::safekeeper::Term;
 use crate::timeline::TimelineError;
 use crate::wal_service::ConnectionId;
 use crate::{GlobalTimelines, SafeKeeperConf};
@@ -19,7 +21,7 @@ use postgres_backend::{self, PostgresBackend};
 use postgres_ffi::PG_TLI;
 use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
 use regex::Regex;
-use utils::auth::{Claims, Scope};
+use utils::auth::{Claims, JwtAuth, Scope};
 use utils::{
    id::{TenantId, TenantTimelineId, TimelineId},
    lsn::Lsn,
@@ -35,8 +37,8 @@ pub struct SafekeeperPostgresHandler {
    pub ttid: TenantTimelineId,
    /// Unique connection id is logged in spans for observability.
    pub conn_id: ConnectionId,
-    /// Auth scope allowed on the connections. None if auth is not configured.
-    allowed_auth_scope: Option<Scope>,
+    /// Auth scope allowed on the connections and public key used to check auth tokens. None if auth is not configured.
+    auth: Option<(Scope, Arc<JwtAuth>)>,
    claims: Option<Claims>,
    io_metrics: Option<TrafficMetrics>,
 }
@@ -44,7 +46,7 @@ pub struct SafekeeperPostgresHandler {
 /// Parsed Postgres command.
 enum SafekeeperPostgresCommand {
    StartWalPush,
-    StartReplication { start_lsn: Lsn },
+    StartReplication { start_lsn: Lsn, term: Option<Term> },
    IdentifySystem,
    TimelineStatus,
    JSONCtrl { cmd: AppendLogicalMessage },
@@ -55,15 +57,21 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
        Ok(SafekeeperPostgresCommand::StartWalPush)
    } else if cmd.starts_with("START_REPLICATION") {
        let re = Regex::new(
-            r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)",
+            // We follow postgres START_REPLICATION LOGICAL options to pass term.
+            r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)(?: \(term='(\d+)'\))?",
        )
        .unwrap();
-        let mut caps = re.captures_iter(cmd);
-        let start_lsn = caps
-            .next()
-            .map(|cap| Lsn::from_str(&cap[1]))
-            .context("parse start LSN from START_REPLICATION command")??;
-        Ok(SafekeeperPostgresCommand::StartReplication { start_lsn })
+        let caps = re
+            .captures(cmd)
+            .context(format!("failed to parse START_REPLICATION command {}", cmd))?;
+        let start_lsn =
+            Lsn::from_str(&caps[1]).context("parse start LSN from START_REPLICATION command")?;
+        let term = if let Some(m) = caps.get(2) {
+            Some(m.as_str().parse::<u64>().context("invalid term")?)
+        } else {
+            None
+        };
+        Ok(SafekeeperPostgresCommand::StartReplication { start_lsn, term })
    } else if cmd.starts_with("IDENTIFY_SYSTEM") {
        Ok(SafekeeperPostgresCommand::IdentifySystem)
    } else if cmd.starts_with("TIMELINE_STATUS") {
@@ -147,18 +155,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
    ) -> Result<(), QueryError> {
        // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
        // which requires auth to be present
-        let data = self
-            .conf
+        let (allowed_auth_scope, auth) = self
            .auth
            .as_ref()
-            .unwrap()
-            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
+            .expect("auth_type is configured but .auth of handler is missing");
+        let data =
+            auth.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;

-        let scope = self
-            .allowed_auth_scope
-            .expect("auth is enabled but scope is not configured");
        // The handler might be configured to allow only tenant scope tokens.
-        if matches!(scope, Scope::Tenant) && !matches!(data.claims.scope, Scope::Tenant) {
+        if matches!(allowed_auth_scope, Scope::Tenant)
+            && !matches!(data.claims.scope, Scope::Tenant)
+        {
            return Err(QueryError::Other(anyhow::anyhow!(
                "passed JWT token is for full access, but only tenant scope is allowed"
            )));
@@ -218,8 +225,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
                    .instrument(info_span!("WAL receiver", ttid = %span_ttid))
                    .await
            }
-            SafekeeperPostgresCommand::StartReplication { start_lsn } => {
-                self.handle_start_replication(pgb, start_lsn)
+            SafekeeperPostgresCommand::StartReplication { start_lsn, term } => {
+                self.handle_start_replication(pgb, start_lsn, term)
                    .instrument(info_span!("WAL sender", ttid = %span_ttid))
                    .await
            }
@@ -237,7 +244,7 @@ impl SafekeeperPostgresHandler {
        conf: SafeKeeperConf,
        conn_id: u32,
        io_metrics: Option<TrafficMetrics>,
-        allowed_auth_scope: Option<Scope>,
+        auth: Option<(Scope, Arc<JwtAuth>)>,
    ) -> Self {
        SafekeeperPostgresHandler {
            conf,
@@ -247,7 +254,7 @@ impl SafekeeperPostgresHandler {
            ttid: TenantTimelineId::empty(),
            conn_id,
            claims: None,
-            allowed_auth_scope,
+            auth,
            io_metrics,
        }
    }
@@ -255,7 +262,7 @@ impl SafekeeperPostgresHandler {
    // when accessing management api supply None as an argument
    // when using to authorize tenant pass corresponding tenant id
    fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
-        if self.conf.auth.is_none() {
+        if self.auth.is_none() {
            // auth is set to Trust, nothing to check so just return ok
            return Ok(());
        }
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -359,7 +359,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
 /// Safekeeper http router.
 pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
    let mut router = endpoint::make_router();
-    if conf.auth.is_some() {
+    if conf.http_auth.is_some() {
        router = router.middleware(auth_middleware(|request| {
            #[allow(clippy::mutable_key_type)]
            static ALLOWLIST_ROUTES: Lazy<HashSet<Uri>> =
@@ -375,7 +375,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>

    // NB: on any changes do not forget to update the OpenAPI spec
    // located nearby (/safekeeper/src/http/openapi_spec.yaml).
-    let auth = conf.auth.clone();
+    let auth = conf.http_auth.clone();
    router
        .data(Arc::new(conf))
        .data(auth)
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -55,6 +55,7 @@ pub struct SafeKeeperConf {
    pub listen_pg_addr: String,
    pub listen_pg_addr_tenant_only: Option<String>,
    pub listen_http_addr: String,
+    pub advertise_pg_addr: Option<String>,
    pub availability_zone: Option<String>,
    pub no_sync: bool,
    pub broker_endpoint: Uri,
@@ -64,7 +65,9 @@ pub struct SafeKeeperConf {
    pub max_offloader_lag_bytes: u64,
    pub backup_parallel_jobs: usize,
    pub wal_backup_enabled: bool,
-    pub auth: Option<Arc<JwtAuth>>,
+    pub pg_auth: Option<Arc<JwtAuth>>,
+    pub pg_tenant_only_auth: Option<Arc<JwtAuth>>,
+    pub http_auth: Option<Arc<JwtAuth>>,
    pub current_thread_runtime: bool,
 }

@@ -88,6 +91,7 @@ impl SafeKeeperConf {
            listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
            listen_pg_addr_tenant_only: None,
            listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
+            advertise_pg_addr: None,
            availability_zone: None,
            remote_storage: None,
            my_id: NodeId(0),
@@ -97,7 +101,9 @@ impl SafeKeeperConf {
            broker_keepalive_interval: Duration::from_secs(5),
            wal_backup_enabled: true,
            backup_parallel_jobs: 1,
-            auth: None,
+            pg_auth: None,
+            pg_tenant_only_auth: None,
+            http_auth: None,
            heartbeat_timeout: Duration::new(5, 0),
            max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
            current_thread_runtime: false,
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -2,6 +2,7 @@
 //! with the "START_REPLICATION" message, and registry of walsenders.

 use crate::handler::SafekeeperPostgresHandler;
+use crate::safekeeper::Term;
 use crate::timeline::Timeline;
 use crate::wal_service::ConnectionId;
 use crate::wal_storage::WalReader;
@@ -359,8 +360,12 @@ impl SafekeeperPostgresHandler {
        &mut self,
        pgb: &mut PostgresBackend<IO>,
        start_pos: Lsn,
+        term: Option<Term>,
    ) -> Result<(), QueryError> {
-        if let Err(end) = self.handle_start_replication_guts(pgb, start_pos).await {
+        if let Err(end) = self
+            .handle_start_replication_guts(pgb, start_pos, term)
+            .await
+        {
            // Log the result and probably send it to the client, closing the stream.
            pgb.handle_copy_stream_end(end).await;
        }
@@ -371,6 +376,7 @@ impl SafekeeperPostgresHandler {
        &mut self,
        pgb: &mut PostgresBackend<IO>,
        start_pos: Lsn,
+        term: Option<Term>,
    ) -> Result<(), CopyStreamHandlerEnd> {
        let appname = self.appname.clone();
        let tli =
@@ -440,6 +446,7 @@ impl SafekeeperPostgresHandler {
            start_pos,
            end_pos,
            stop_pos,
+            term,
            commit_lsn_watch_rx,
            ws_guard: ws_guard.clone(),
            wal_reader,
@@ -476,6 +483,10 @@ struct WalSender<'a, IO> {
    // If present, terminate after reaching this position; used by walproposer
    // in recovery.
    stop_pos: Option<Lsn>,
+    /// When streaming uncommitted part, the term the client acts as the leader
+    /// in. Streaming is stopped if local term changes to a different (higher)
+    /// value.
+    term: Option<Term>,
    commit_lsn_watch_rx: Receiver<Lsn>,
    ws_guard: Arc<WalSenderGuard>,
    wal_reader: WalReader,
@@ -518,8 +529,18 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
                .0 as usize;
            send_size = min(send_size, self.send_buf.len());
            let send_buf = &mut self.send_buf[..send_size];
-            // read wal into buffer
-            send_size = self.wal_reader.read(send_buf).await?;
+            let send_size: usize;
+            {
+                // If uncommitted part is being pulled, check that the term is
+                // still the expected one.
+                let _term_guard = if let Some(t) = self.term {
+                    Some(self.tli.acquire_term(t).await?)
+                } else {
+                    None
+                };
+                // read wal into buffer
+                send_size = self.wal_reader.read(send_buf).await?
+            };
            let send_buf = &send_buf[..send_size];

            // and send it
@@ -568,6 +589,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
            {
                if self.tli.should_walsender_stop(remote_consistent_lsn).await {
                    // Terminate if there is nothing more to send.
+                    // Note that "ending streaming" part of the string is used by
+                    // pageserver to identify WalReceiverError::SuccessfulCompletion,
+                    // do not change this string without updating pageserver.
                    return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
                        "ending streaming to {:?} at {}, receiver is caughtup and there is no computes",
                        self.appname, self.start_pos,
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -237,7 +237,10 @@ impl SharedState {
            commit_lsn: self.sk.inmem.commit_lsn.0,
            remote_consistent_lsn: remote_consistent_lsn.0,
            peer_horizon_lsn: self.sk.inmem.peer_horizon_lsn.0,
-            safekeeper_connstr: conf.listen_pg_addr.clone(),
+            safekeeper_connstr: conf
+                .advertise_pg_addr
+                .to_owned()
+                .unwrap_or(conf.listen_pg_addr.clone()),
            backup_lsn: self.sk.inmem.backup_lsn.0,
            local_start_lsn: self.sk.state.local_start_lsn.0,
            availability_zone: conf.availability_zone.clone(),
@@ -496,6 +499,19 @@ impl Timeline {
        false
    }

+    /// Ensure taht current term is t, erroring otherwise, and lock the state.
+    pub async fn acquire_term(&self, t: Term) -> Result<MutexGuard<SharedState>> {
+        let ss = self.write_shared_state().await;
+        if ss.sk.state.acceptor_state.term != t {
+            bail!(
+                "failed to acquire term {}, current term {}",
+                t,
+                ss.sk.state.acceptor_state.term
+            );
+        }
+        Ok(ss)
+    }
+
    /// Returns whether s3 offloading is required and sets current status as
    /// matching it.
    pub async fn wal_backup_attend(&self) -> bool {
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -16,10 +16,13 @@ use crate::SafeKeeperConf;
 use postgres_backend::{AuthType, PostgresBackend};

 /// Accept incoming TCP connections and spawn them into a background thread.
+/// allowed_auth_scope is either SafekeeperData (wide JWT tokens giving access
+/// to any tenant are allowed) or Tenant (only tokens giving access to specific
+/// tenant are allowed). Doesn't matter if auth is disabled in conf.
 pub async fn task_main(
    conf: SafeKeeperConf,
    pg_listener: std::net::TcpListener,
-    allowed_auth_scope: Option<Scope>,
+    allowed_auth_scope: Scope,
 ) -> anyhow::Result<()> {
    // Tokio's from_std won't do this for us, per its comment.
    pg_listener.set_nonblocking(true)?;
@@ -50,7 +53,7 @@ async fn handle_socket(
    socket: TcpStream,
    conf: SafeKeeperConf,
    conn_id: ConnectionId,
-    allowed_auth_scope: Option<Scope>,
+    allowed_auth_scope: Scope,
 ) -> Result<(), QueryError> {
    socket.set_nodelay(true)?;
    let peer_addr = socket.peer_addr()?;
@@ -82,16 +85,17 @@ async fn handle_socket(
        },
    );

-    let auth_type = match conf.auth {
+    let auth_key = match allowed_auth_scope {
+        Scope::Tenant => conf.pg_tenant_only_auth.clone(),
+        _ => conf.pg_auth.clone(),
+    };
+    let auth_type = match auth_key {
        None => AuthType::Trust,
        Some(_) => AuthType::NeonJWT,
    };
-    let mut conn_handler = SafekeeperPostgresHandler::new(
-        conf,
-        conn_id,
-        Some(traffic_metrics.clone()),
-        allowed_auth_scope,
-    );
+    let auth_pair = auth_key.map(|key| (allowed_auth_scope, key));
+    let mut conn_handler =
+        SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()), auth_pair);
    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
    // libpq protocol between safekeeper and walproposer / pageserver
    // We don't use shutdown.
--- a/scripts/combine_control_files.py
+++ b/scripts/combine_control_files.py
@@ -15,16 +15,12 @@ from pathlib import Path
    ],
    "library_index": {
        "anon": "anon",
-        "kq_imcx": "kq_imcx"
-        // would be more complicated for something like postgis where multiple library names all map to postgis
+        // for more complex extensions like postgis
+        // we might have something like:
+        // address_standardizer: postgis
+        // postgis_tiger: postgis
    },
    "extension_data": {
-        "kq_imcx": {
-            "control_data": {
-                "kq_imcx.control": "# This file is generated content from add_postgresql_extension.\n# No point in modifying it, it will be overwritten anyway.\n\n# Default version, always set\ndefault_version = '0.1'\n\n# Module pathname generated from target shared library name. Use\n# MODULE_PATHNAME in script file.\nmodule_pathname = '$libdir/kq_imcx.so'\n\n# Comment for extension. Set using COMMENT option. Can be set in\n# script file as well.\ncomment = 'ketteQ In-Memory Calendar Extension (IMCX)'\n\n# Encoding for script file. Set using ENCODING option.\n#encoding = ''\n\n# Required extensions. Set using REQUIRES option (multi-valued).\n#requires = ''\ntrusted = true\n"
-            },
-            "archive_path": "5648391853/v15/extensions/kq_imcx.tar.zst"
-        },
        "anon": {
            "control_data": {
                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
--- a/scripts/comment-test-report.js
+++ b/scripts/comment-test-report.js
@@ -223,6 +223,7 @@ module.exports = async ({ github, context, fetch, report }) => {
    } else {
        commentBody += `#### No tests were run or test report is not available\n`
    }
+    commentBody += autoupdateNotice

    let createCommentFn, listCommentsFn, updateCommentFn, issueNumberOrSha
    if (isPullRequest) {
--- a/scripts/ingest_regress_test_result-new-format.py
+++ b/scripts/ingest_regress_test_result-new-format.py
@@ -0,0 +1,198 @@
+#! /usr/bin/env python3
+
+import argparse
+import dataclasses
+import json
+import logging
+import os
+import re
+import sys
+from contextlib import contextmanager
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Tuple
+
+import backoff
+import psycopg2
+from psycopg2.extras import execute_values
+
+CREATE_TABLE = """
+CREATE TABLE IF NOT EXISTS results (
+    id           BIGSERIAL PRIMARY KEY,
+    parent_suite TEXT NOT NULL,
+    suite        TEXT NOT NULL,
+    name         TEXT NOT NULL,
+    status       TEXT NOT NULL,
+    started_at   TIMESTAMPTZ NOT NULL,
+    stopped_at   TIMESTAMPTZ NOT NULL,
+    duration     INT NOT NULL,
+    flaky        BOOLEAN NOT NULL,
+    build_type   TEXT NOT NULL,
+    pg_version   INT NOT NULL,
+    run_id       BIGINT NOT NULL,
+    run_attempt  INT NOT NULL,
+    reference    TEXT NOT NULL,
+    revision     CHAR(40) NOT NULL,
+    raw          JSONB COMPRESSION lz4 NOT NULL,
+    UNIQUE (parent_suite, suite, name, build_type, pg_version, started_at, stopped_at, run_id)
+);
+"""
+
+
+@dataclass
+class Row:
+    parent_suite: str
+    suite: str
+    name: str
+    status: str
+    started_at: datetime
+    stopped_at: datetime
+    duration: int
+    flaky: bool
+    build_type: str
+    pg_version: int
+    run_id: int
+    run_attempt: int
+    reference: str
+    revision: str
+    raw: str
+
+
+TEST_NAME_RE = re.compile(r"[\[-](?P<build_type>debug|release)-pg(?P<pg_version>\d+)[-\]]")
+
+
+def err(msg):
+    print(f"error: {msg}")
+    sys.exit(1)
+
+
+@contextmanager
+def get_connection_cursor(connstr: str):
+    @backoff.on_exception(backoff.expo, psycopg2.OperationalError, max_time=150)
+    def connect(connstr):
+        conn = psycopg2.connect(connstr, connect_timeout=30)
+        conn.autocommit = True
+        return conn
+
+    conn = connect(connstr)
+    try:
+        with conn.cursor() as cur:
+            yield cur
+    finally:
+        if conn is not None:
+            conn.close()
+
+
+def create_table(cur):
+    cur.execute(CREATE_TABLE)
+
+
+def parse_test_name(test_name: str) -> Tuple[str, int, str]:
+    build_type, pg_version = None, None
+    if match := TEST_NAME_RE.search(test_name):
+        found = match.groupdict()
+        build_type = found["build_type"]
+        pg_version = int(found["pg_version"])
+    else:
+        # It's ok, we embed BUILD_TYPE and Postgres Version into the test name only for regress suite and do not for other suites (like performance)
+        build_type = "release"
+        pg_version = 14
+
+    unparametrized_name = re.sub(rf"{build_type}-pg{pg_version}-?", "", test_name).replace("[]", "")
+
+    return build_type, pg_version, unparametrized_name
+
+
+def ingest_test_result(
+    cur,
+    reference: str,
+    revision: str,
+    run_id: int,
+    run_attempt: int,
+    test_cases_dir: Path,
+):
+    rows = []
+    for f in test_cases_dir.glob("*.json"):
+        test = json.loads(f.read_text())
+        # Drop unneded fields from raw data
+        raw = test.copy()
+        raw.pop("parameterValues")
+        raw.pop("labels")
+        raw.pop("extra")
+
+        build_type, pg_version, unparametrized_name = parse_test_name(test["name"])
+        labels = {label["name"]: label["value"] for label in test["labels"]}
+        row = Row(
+            parent_suite=labels["parentSuite"],
+            suite=labels["suite"],
+            name=unparametrized_name,
+            status=test["status"],
+            started_at=datetime.fromtimestamp(test["time"]["start"] / 1000, tz=timezone.utc),
+            stopped_at=datetime.fromtimestamp(test["time"]["stop"] / 1000, tz=timezone.utc),
+            duration=test["time"]["duration"],
+            flaky=test["flaky"] or test["retriesStatusChange"],
+            build_type=build_type,
+            pg_version=pg_version,
+            run_id=run_id,
+            run_attempt=run_attempt,
+            reference=reference,
+            revision=revision,
+            raw=json.dumps(raw),
+        )
+        rows.append(dataclasses.astuple(row))
+
+    columns = ",".join(f.name for f in dataclasses.fields(Row))
+    query = f"INSERT INTO results ({columns}) VALUES %s ON CONFLICT DO NOTHING"
+    execute_values(cur, query, rows)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Regress test result uploader. \
+            Database connection string should be provided via DATABASE_URL environment variable",
+    )
+    parser.add_argument("--initdb", action="store_true", help="Initialuze database")
+    parser.add_argument(
+        "--reference", type=str, required=True, help="git reference, for example refs/heads/main"
+    )
+    parser.add_argument("--revision", type=str, required=True, help="git revision")
+    parser.add_argument("--run-id", type=int, required=True, help="GitHub Workflow run id")
+    parser.add_argument(
+        "--run-attempt", type=int, required=True, help="GitHub Workflow run attempt"
+    )
+    parser.add_argument(
+        "--test-cases-dir",
+        type=Path,
+        required=True,
+        help="Path to a dir with extended test cases data",
+    )
+
+    connstr = os.getenv("DATABASE_URL", "")
+    if not connstr:
+        err("DATABASE_URL environment variable is not set")
+
+    args = parser.parse_args()
+    with get_connection_cursor(connstr) as cur:
+        if args.initdb:
+            create_table(cur)
+
+        if not args.test_cases_dir.exists():
+            err(f"test-cases dir {args.test_cases_dir} does not exist")
+
+        if not args.test_cases_dir.is_dir():
+            err(f"test-cases dir {args.test_cases_dir} it not a directory")
+
+        ingest_test_result(
+            cur,
+            reference=args.reference,
+            revision=args.revision,
+            run_id=args.run_id,
+            run_attempt=args.run_attempt,
+            test_cases_dir=args.test_cases_dir,
+        )
+
+
+if __name__ == "__main__":
+    logging.getLogger("backoff").addHandler(logging.StreamHandler())
+    main()
--- a/test_runner/fixtures/broker.py
+++ b/test_runner/fixtures/broker.py
@@ -0,0 +1,60 @@
+import subprocess
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Optional
+
+from fixtures.log_helper import log
+
+
+@dataclass
+class NeonBroker:
+    """An object managing storage_broker instance"""
+
+    logfile: Path
+    port: int
+    neon_binpath: Path
+    handle: Optional[subprocess.Popen[Any]] = None  # handle of running daemon
+
+    def listen_addr(self):
+        return f"127.0.0.1:{self.port}"
+
+    def client_url(self):
+        return f"http://{self.listen_addr()}"
+
+    def check_status(self):
+        return True  # TODO
+
+    def try_start(self):
+        if self.handle is not None:
+            log.debug(f"storage_broker is already running on port {self.port}")
+            return
+
+        listen_addr = self.listen_addr()
+        log.info(f'starting storage_broker to listen incoming connections at "{listen_addr}"')
+        with open(self.logfile, "wb") as logfile:
+            args = [
+                str(self.neon_binpath / "storage_broker"),
+                f"--listen-addr={listen_addr}",
+            ]
+            self.handle = subprocess.Popen(args, stdout=logfile, stderr=logfile)
+
+        # wait for start
+        started_at = time.time()
+        while True:
+            try:
+                self.check_status()
+            except Exception as e:
+                elapsed = time.time() - started_at
+                if elapsed > 5:
+                    raise RuntimeError(
+                        f"timed out waiting {elapsed:.0f}s for storage_broker start: {e}"
+                    ) from e
+                time.sleep(0.5)
+            else:
+                break  # success
+
+    def stop(self):
+        if self.handle is not None:
+            self.handle.terminate()
+            self.handle.wait()
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2,13 +2,11 @@ from __future__ import annotations

 import abc
 import asyncio
-import enum
 import filecmp
 import json
 import os
 import re
 import shutil
-import socket
 import subprocess
 import tempfile
 import textwrap
@@ -17,12 +15,11 @@ import uuid
 from contextlib import closing, contextmanager
 from dataclasses import dataclass, field
 from datetime import datetime
-from enum import Flag, auto
 from functools import cached_property
 from itertools import chain, product
 from pathlib import Path
 from types import TracebackType
-from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, cast
 from urllib.parse import urlparse

 import asyncpg
@@ -35,6 +32,7 @@ import requests
 from _pytest.config import Config
 from _pytest.config.argparsing import Parser
 from _pytest.fixtures import FixtureRequest
+from mypy_boto3_s3 import S3Client

 # Type-related stuff
 from psycopg2.extensions import connection as PgConnection
@@ -42,10 +40,21 @@ from psycopg2.extensions import cursor as PgCursor
 from psycopg2.extensions import make_dsn, parse_dsn
 from typing_extensions import Literal

+from fixtures.broker import NeonBroker
 from fixtures.log_helper import log
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
 from fixtures.pg_version import PgVersion
+from fixtures.port_distributor import PortDistributor
+from fixtures.remote_storage import (
+    LocalFsStorage,
+    MockS3Server,
+    RemoteStorage,
+    RemoteStorageKind,
+    RemoteStorageUsers,
+    S3Storage,
+    remote_storage_to_toml_inline_table,
+)
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import (
    ATTACHMENT_NAME_REGEX,
@@ -78,19 +87,6 @@ DEFAULT_OUTPUT_DIR: str = "test_output"
 DEFAULT_BRANCH_NAME: str = "main"

 BASE_PORT: int = 15000
-WORKER_PORT_NUM: int = 1000
-
-
-def pytest_configure(config: Config):
-    """
-    Check that we do not overflow available ports range.
-    """
-
-    numprocesses = config.getoption("numprocesses")
-    if (
-        numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768
-    ):  # do not use ephemeral ports
-        raise Exception("Too many workers configured. Cannot distribute ports for services.")


@pytest.fixture(scope="session")
@@ -192,6 +188,11 @@ def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "fu
    return scope


+@pytest.fixture(scope="session")
+def worker_port_num():
+    return (32768 - BASE_PORT) // int(os.environ.get("PYTEST_XDIST_WORKER_COUNT", "1"))
+
+
@pytest.fixture(scope="session")
 def worker_seq_no(worker_id: str) -> int:
    # worker_id is a pytest-xdist fixture
@@ -204,10 +205,10 @@ def worker_seq_no(worker_id: str) -> int:


@pytest.fixture(scope="session")
-def worker_base_port(worker_seq_no: int) -> int:
-    # so we divide ports in ranges of 100 ports
+def worker_base_port(worker_seq_no: int, worker_port_num: int) -> int:
+    # so we divide ports in ranges of ports
    # so workers have disjoint set of ports for services
-    return BASE_PORT + worker_seq_no * WORKER_PORT_NUM
+    return BASE_PORT + worker_seq_no * worker_port_num


 def get_dir_size(path: str) -> int:
@@ -220,80 +221,9 @@ def get_dir_size(path: str) -> int:
    return totalbytes


-def can_bind(host: str, port: int) -> bool:
-    """
-    Check whether a host:port is available to bind for listening
-
-    Inspired by the can_bind() perl function used in Postgres tests, in
-    vendor/postgres-v14/src/test/perl/PostgresNode.pm
-    """
-    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
-        # TODO: The pageserver and safekeepers don't use SO_REUSEADDR at the
-        # moment. If that changes, we should use start using SO_REUSEADDR here
-        # too, to allow reusing ports more quickly.
-        # See https://github.com/neondatabase/neon/issues/801
-        # sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-
-        try:
-            sock.bind((host, port))
-            sock.listen()
-            return True
-        except socket.error:
-            log.info(f"Port {port} is in use, skipping")
-            return False
-        finally:
-            sock.close()
-
-
-class PortDistributor:
-    def __init__(self, base_port: int, port_number: int):
-        self.iterator = iter(range(base_port, base_port + port_number))
-        self.port_map: Dict[int, int] = {}
-
-    def get_port(self) -> int:
-        for port in self.iterator:
-            if can_bind("localhost", port):
-                return port
-        raise RuntimeError(
-            "port range configured for test is exhausted, consider enlarging the range"
-        )
-
-    def replace_with_new_port(self, value: Union[int, str]) -> Union[int, str]:
-        """
-        Returns a new port for a port number in a string (like "localhost:1234") or int.
-        Replacements are memorised, so a substitution for the same port is always the same.
-        """
-
-        # TODO: replace with structural pattern matching for Python >= 3.10
-        if isinstance(value, int):
-            return self._replace_port_int(value)
-
-        if isinstance(value, str):
-            return self._replace_port_str(value)
-
-        raise TypeError(f"unsupported type {type(value)} of {value=}")
-
-    def _replace_port_int(self, value: int) -> int:
-        known_port = self.port_map.get(value)
-        if known_port is None:
-            known_port = self.port_map[value] = self.get_port()
-
-        return known_port
-
-    def _replace_port_str(self, value: str) -> str:
-        # Use regex to find port in a string
-        # urllib.parse.urlparse produces inconvenient results for cases without scheme like "localhost:5432"
-        # See https://bugs.python.org/issue27657
-        ports = re.findall(r":(\d+)(?:/|$)", value)
-        assert len(ports) == 1, f"can't find port in {value}"
-        port_int = int(ports[0])
-
-        return value.replace(f":{port_int}", f":{self._replace_port_int(port_int)}")
-
-
@pytest.fixture(scope="session")
-def port_distributor(worker_base_port: int) -> PortDistributor:
-    return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM)
+def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistributor:
+    return PortDistributor(base_port=worker_base_port, port_number=worker_port_num)


@pytest.fixture(scope="session")
@@ -464,120 +394,6 @@ class AuthKeys:
        return self.generate_token(scope="tenant", tenant_id=str(tenant_id))


-class MockS3Server:
-    """
-    Starts a mock S3 server for testing on a port given, errors if the server fails to start or exits prematurely.
-    Relies that `poetry` and `moto` server are installed, since it's the way the tests are run.
-
-    Also provides a set of methods to derive the connection properties from and the method to kill the underlying server.
-    """
-
-    def __init__(
-        self,
-        port: int,
-    ):
-        self.port = port
-
-        # XXX: do not use `shell=True` or add `exec ` to the command here otherwise.
-        # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux
-        # if a process is started from the shell process.
-        self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", "s3", f"-p{port}"])
-        error = None
-        try:
-            return_code = self.subprocess.poll()
-            if return_code is not None:
-                error = f"expected mock s3 server to run but it exited with code {return_code}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'"
-        except Exception as e:
-            error = f"expected mock s3 server to start but it failed with exception: {e}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'"
-        if error is not None:
-            log.error(error)
-            self.kill()
-            raise RuntimeError("failed to start s3 mock server")
-
-    def endpoint(self) -> str:
-        return f"http://127.0.0.1:{self.port}"
-
-    def region(self) -> str:
-        return "us-east-1"
-
-    def access_key(self) -> str:
-        return "test"
-
-    def secret_key(self) -> str:
-        return "test"
-
-    def kill(self):
-        self.subprocess.kill()
-
-
-@enum.unique
-class RemoteStorageKind(str, enum.Enum):
-    LOCAL_FS = "local_fs"
-    MOCK_S3 = "mock_s3"
-    REAL_S3 = "real_s3"
-    # Pass to tests that are generic to remote storage
-    # to ensure the test pass with or without the remote storage
-    NOOP = "noop"
-
-
-def available_remote_storages() -> List[RemoteStorageKind]:
-    remote_storages = [RemoteStorageKind.LOCAL_FS, RemoteStorageKind.MOCK_S3]
-    if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None:
-        remote_storages.append(RemoteStorageKind.REAL_S3)
-        log.info("Enabling real s3 storage for tests")
-    else:
-        log.info("Using mock implementations to test remote storage")
-    return remote_storages
-
-
-@dataclass
-class LocalFsStorage:
-    root: Path
-
-
-@dataclass
-class S3Storage:
-    bucket_name: str
-    bucket_region: str
-    access_key: str
-    secret_key: str
-    endpoint: Optional[str] = None
-    prefix_in_bucket: Optional[str] = ""
-
-    def access_env_vars(self) -> Dict[str, str]:
-        return {
-            "AWS_ACCESS_KEY_ID": self.access_key,
-            "AWS_SECRET_ACCESS_KEY": self.secret_key,
-        }
-
-
-RemoteStorage = Union[LocalFsStorage, S3Storage]
-
-
-# serialize as toml inline table
-def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str:
-    if isinstance(remote_storage, LocalFsStorage):
-        remote_storage_config = f"local_path='{remote_storage.root}'"
-    elif isinstance(remote_storage, S3Storage):
-        remote_storage_config = f"bucket_name='{remote_storage.bucket_name}',\
-            bucket_region='{remote_storage.bucket_region}'"
-
-        if remote_storage.prefix_in_bucket is not None:
-            remote_storage_config += f",prefix_in_bucket='{remote_storage.prefix_in_bucket}'"
-
-        if remote_storage.endpoint is not None:
-            remote_storage_config += f",endpoint='{remote_storage.endpoint}'"
-    else:
-        raise Exception("invalid remote storage type")
-
-    return f"{{{remote_storage_config}}}"
-
-
-class RemoteStorageUsers(Flag):
-    PAGESERVER = auto()
-    SAFEKEEPER = auto()
-
-
 class NeonEnvBuilder:
    """
    Builder object to create a Neon runtime environment
@@ -616,10 +432,12 @@ class NeonEnvBuilder:
        self.rust_log_override = rust_log_override
        self.port_distributor = port_distributor
        self.remote_storage = remote_storage
+        self.ext_remote_storage: Optional[S3Storage] = None
+        self.remote_storage_client: Optional[S3Client] = None
        self.remote_storage_users = remote_storage_users
        self.broker = broker
        self.run_id = run_id
-        self.mock_s3_server = mock_s3_server
+        self.mock_s3_server: MockS3Server = mock_s3_server
        self.pageserver_config_override = pageserver_config_override
        self.num_safekeepers = num_safekeepers
        self.safekeepers_id_start = safekeepers_id_start
@@ -651,7 +469,7 @@ class NeonEnvBuilder:

        # Prepare the default branch to start the postgres on later.
        # Pageserver itself does not create tenants and timelines, until started first and asked via HTTP API.
-        log.info(
+        log.debug(
            f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
        )
        initial_tenant, initial_timeline = env.neon_cli.create_tenant(
@@ -667,15 +485,24 @@ class NeonEnvBuilder:
        remote_storage_kind: RemoteStorageKind,
        test_name: str,
        force_enable: bool = True,
+        enable_remote_extensions: bool = False,
    ):
        if remote_storage_kind == RemoteStorageKind.NOOP:
            return
        elif remote_storage_kind == RemoteStorageKind.LOCAL_FS:
            self.enable_local_fs_remote_storage(force_enable=force_enable)
        elif remote_storage_kind == RemoteStorageKind.MOCK_S3:
-            self.enable_mock_s3_remote_storage(bucket_name=test_name, force_enable=force_enable)
+            self.enable_mock_s3_remote_storage(
+                bucket_name=test_name,
+                force_enable=force_enable,
+                enable_remote_extensions=enable_remote_extensions,
+            )
        elif remote_storage_kind == RemoteStorageKind.REAL_S3:
-            self.enable_real_s3_remote_storage(test_name=test_name, force_enable=force_enable)
+            self.enable_real_s3_remote_storage(
+                test_name=test_name,
+                force_enable=force_enable,
+                enable_remote_extensions=enable_remote_extensions,
+            )
        else:
            raise RuntimeError(f"Unknown storage type: {remote_storage_kind}")

@@ -689,11 +516,18 @@ class NeonEnvBuilder:
        assert force_enable or self.remote_storage is None, "remote storage is enabled already"
        self.remote_storage = LocalFsStorage(Path(self.repo_dir / "local_fs_remote_storage"))

-    def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable: bool = True):
+    def enable_mock_s3_remote_storage(
+        self,
+        bucket_name: str,
+        force_enable: bool = True,
+        enable_remote_extensions: bool = False,
+    ):
        """
        Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already.
        Starts up the mock server, if that does not run yet.
        Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`.
+
+        Also creates the bucket for extensions, self.ext_remote_storage bucket
        """
        assert force_enable or self.remote_storage is None, "remote storage is enabled already"
        mock_endpoint = self.mock_s3_server.endpoint()
@@ -714,9 +548,25 @@ class NeonEnvBuilder:
            bucket_region=mock_region,
            access_key=self.mock_s3_server.access_key(),
            secret_key=self.mock_s3_server.secret_key(),
+            prefix_in_bucket="pageserver",
        )

-    def enable_real_s3_remote_storage(self, test_name: str, force_enable: bool = True):
+        if enable_remote_extensions:
+            self.ext_remote_storage = S3Storage(
+                bucket_name=bucket_name,
+                endpoint=mock_endpoint,
+                bucket_region=mock_region,
+                access_key=self.mock_s3_server.access_key(),
+                secret_key=self.mock_s3_server.secret_key(),
+                prefix_in_bucket="ext",
+            )
+
+    def enable_real_s3_remote_storage(
+        self,
+        test_name: str,
+        force_enable: bool = True,
+        enable_remote_extensions: bool = False,
+    ):
        """
        Sets up configuration to use real s3 endpoint without mock server
        """
@@ -756,6 +606,15 @@ class NeonEnvBuilder:
            prefix_in_bucket=self.remote_storage_prefix,
        )

+        if enable_remote_extensions:
+            self.ext_remote_storage = S3Storage(
+                bucket_name="neon-dev-extensions-eu-central-1",
+                bucket_region="eu-central-1",
+                access_key=access_key,
+                secret_key=secret_key,
+                prefix_in_bucket=None,
+            )
+
    def cleanup_local_storage(self):
        if self.preserve_database_files:
            return
@@ -789,6 +648,7 @@ class NeonEnvBuilder:
        # `self.remote_storage_prefix` is coupled with `S3Storage` storage type,
        # so this line effectively a no-op
        assert isinstance(self.remote_storage, S3Storage)
+        assert self.remote_storage_client is not None

        if self.keep_remote_storage_contents:
            log.info("keep_remote_storage_contents skipping remote storage cleanup")
@@ -918,6 +778,8 @@ class NeonEnv:
        self.neon_binpath = config.neon_binpath
        self.pg_distrib_dir = config.pg_distrib_dir
        self.endpoint_counter = 0
+        self.remote_storage_client = config.remote_storage_client
+        self.ext_remote_storage = config.ext_remote_storage

        # generate initial tenant ID here instead of letting 'neon init' generate it,
        # so that we don't need to dig it out of the config file afterwards.
@@ -1014,7 +876,14 @@ class NeonEnv:

    def timeline_dir(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
        """Get a timeline directory's path based on the repo directory of the test environment"""
-        return self.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
+        return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id)
+
+    def tenant_dir(
+        self,
+        tenant_id: TenantId,
+    ) -> Path:
+        """Get a tenant directory's path based on the repo directory of the test environment"""
+        return self.repo_dir / "tenants" / str(tenant_id)

    def get_pageserver_version(self) -> str:
        bin_pageserver = str(self.neon_binpath / "pageserver")
@@ -1444,12 +1313,20 @@ class NeonCli(AbstractNeonCli):
        log.info(f"Stopping pageserver with {cmd}")
        return self.raw_cli(cmd)

-    def safekeeper_start(self, id: int) -> "subprocess.CompletedProcess[str]":
+    def safekeeper_start(
+        self, id: int, extra_opts: Optional[List[str]] = None
+    ) -> "subprocess.CompletedProcess[str]":
        s3_env_vars = None
        if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage):
            s3_env_vars = self.env.remote_storage.access_env_vars()

-        return self.raw_cli(["safekeeper", "start", str(id)], extra_env_vars=s3_env_vars)
+        if extra_opts is not None:
+            extra_opts = [f"-e={opt}" for opt in extra_opts]
+        else:
+            extra_opts = []
+        return self.raw_cli(
+            ["safekeeper", "start", str(id), *extra_opts], extra_env_vars=s3_env_vars
+        )

    def safekeeper_stop(
        self, id: Optional[int] = None, immediate=False
@@ -1505,6 +1382,7 @@ class NeonCli(AbstractNeonCli):
        tenant_id: Optional[TenantId] = None,
        lsn: Optional[Lsn] = None,
        branch_name: Optional[str] = None,
+        remote_ext_config: Optional[str] = None,
    ) -> "subprocess.CompletedProcess[str]":
        args = [
            "endpoint",
@@ -1514,6 +1392,8 @@ class NeonCli(AbstractNeonCli):
            "--pg-version",
            self.env.pg_version,
        ]
+        if remote_ext_config is not None:
+            args.extend(["--remote-ext-config", remote_ext_config])
        if lsn is not None:
            args.append(f"--lsn={lsn}")
        args.extend(["--pg-port", str(pg_port)])
@@ -1526,7 +1406,11 @@ class NeonCli(AbstractNeonCli):
        if endpoint_id is not None:
            args.append(endpoint_id)

-        res = self.raw_cli(args)
+        s3_env_vars = None
+        if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage):
+            s3_env_vars = self.env.remote_storage.access_env_vars()
+
+        res = self.raw_cli(args, extra_env_vars=s3_env_vars)
        res.check_returncode()
        return res

@@ -1618,7 +1502,6 @@ class NeonPageserver(PgProtocol):
            # FIXME: replication patch for tokio_postgres regards  any but CopyDone/CopyData message in CopyBoth stream as unexpected
            ".*Connection aborted: unexpected message from server*",
            ".*kill_and_wait_impl.*: wait successful.*",
-            ".*: db error:.*ending streaming to Some.*",
            ".*query handler for 'pagestream.*failed: Broken pipe.*",  # pageserver notices compute shut down
            ".*query handler for 'pagestream.*failed: Connection reset by peer.*",  # pageserver notices compute shut down
            # safekeeper connection can fail with this, in the window between timeline creation
@@ -1628,9 +1511,6 @@ class NeonPageserver(PgProtocol):
            ".*Error processing HTTP request: Forbidden",
            # intentional failpoints
            ".*failpoint ",
-            # FIXME: there is a race condition between GC and detach, see
-            # https://github.com/neondatabase/neon/issues/2442
-            ".*could not remove ephemeral file.*No such file or directory.*",
            # FIXME: These need investigation
            ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*",
            ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*",
@@ -1653,6 +1533,9 @@ class NeonPageserver(PgProtocol):
            ".*Compaction failed, retrying in .*: queue is in state Stopped.*",
            # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
            ".*Error processing HTTP request: NotFound: Timeline .* was not found",
+            ".*took more than expected to complete.*",
+            # these can happen during shutdown, but it should not be a reason to fail a test
+            ".*completed, took longer than expected.*",
        ]

    def start(
@@ -2375,7 +2258,7 @@ class Endpoint(PgProtocol):

        return self

-    def start(self) -> "Endpoint":
+    def start(self, remote_ext_config: Optional[str] = None) -> "Endpoint":
        """
        Start the Postgres instance.
        Returns self.
@@ -2391,6 +2274,7 @@ class Endpoint(PgProtocol):
            http_port=self.http_port,
            tenant_id=self.tenant_id,
            safekeepers=self.active_safekeepers,
+            remote_ext_config=remote_ext_config,
        )
        self.running = True

@@ -2480,6 +2364,7 @@ class Endpoint(PgProtocol):
        hot_standby: bool = False,
        lsn: Optional[Lsn] = None,
        config_lines: Optional[List[str]] = None,
+        remote_ext_config: Optional[str] = None,
    ) -> "Endpoint":
        """
        Create an endpoint, apply config, and start Postgres.
@@ -2494,7 +2379,7 @@ class Endpoint(PgProtocol):
            config_lines=config_lines,
            hot_standby=hot_standby,
            lsn=lsn,
-        ).start()
+        ).start(remote_ext_config=remote_ext_config)

        log.info(f"Postgres startup took {time.time() - started_at} seconds")

@@ -2528,6 +2413,7 @@ class EndpointFactory:
        lsn: Optional[Lsn] = None,
        hot_standby: bool = False,
        config_lines: Optional[List[str]] = None,
+        remote_ext_config: Optional[str] = None,
    ) -> Endpoint:
        ep = Endpoint(
            self.env,
@@ -2544,6 +2430,7 @@ class EndpointFactory:
            hot_standby=hot_standby,
            config_lines=config_lines,
            lsn=lsn,
+            remote_ext_config=remote_ext_config,
        )

    def create(
@@ -2628,9 +2515,9 @@ class Safekeeper:
    id: int
    running: bool = False

-    def start(self) -> "Safekeeper":
+    def start(self, extra_opts: Optional[List[str]] = None) -> "Safekeeper":
        assert self.running is False
-        self.env.neon_cli.safekeeper_start(self.id)
+        self.env.neon_cli.safekeeper_start(self.id, extra_opts=extra_opts)
        self.running = True
        # wait for wal acceptor start by checking its status
        started_at = time.time()
@@ -2822,59 +2709,6 @@ class SafekeeperHttpClient(requests.Session):
        return metrics


-@dataclass
-class NeonBroker:
-    """An object managing storage_broker instance"""
-
-    logfile: Path
-    port: int
-    neon_binpath: Path
-    handle: Optional[subprocess.Popen[Any]] = None  # handle of running daemon
-
-    def listen_addr(self):
-        return f"127.0.0.1:{self.port}"
-
-    def client_url(self):
-        return f"http://{self.listen_addr()}"
-
-    def check_status(self):
-        return True  # TODO
-
-    def try_start(self):
-        if self.handle is not None:
-            log.debug(f"storage_broker is already running on port {self.port}")
-            return
-
-        listen_addr = self.listen_addr()
-        log.info(f'starting storage_broker to listen incoming connections at "{listen_addr}"')
-        with open(self.logfile, "wb") as logfile:
-            args = [
-                str(self.neon_binpath / "storage_broker"),
-                f"--listen-addr={listen_addr}",
-            ]
-            self.handle = subprocess.Popen(args, stdout=logfile, stderr=logfile)
-
-        # wait for start
-        started_at = time.time()
-        while True:
-            try:
-                self.check_status()
-            except Exception as e:
-                elapsed = time.time() - started_at
-                if elapsed > 5:
-                    raise RuntimeError(
-                        f"timed out waiting {elapsed:.0f}s for storage_broker start: {e}"
-                    ) from e
-                time.sleep(0.5)
-            else:
-                break  # success
-
-    def stop(self):
-        if self.handle is not None:
-            self.handle.terminate()
-            self.handle.wait()
-
-
 def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
    """Compute the working directory for an individual test."""
    test_name = request.node.name
@@ -3001,8 +2835,15 @@ def check_restored_datadir_content(
    endpoint: Endpoint,
 ):
    # Get the timeline ID. We need it for the 'basebackup' command
-    timeline = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])

+    # many tests already checkpoint, but do it just in case
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("CHECKPOINT")
+
+    # wait for pageserver to catch up
+    wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id)
    # stop postgres to ensure that files won't change
    endpoint.stop()

@@ -3017,7 +2858,7 @@ def check_restored_datadir_content(
        {psql_path}                                    \
            --no-psqlrc                                \
            postgres://localhost:{env.pageserver.service_port.pg}  \
-            -c 'basebackup {endpoint.tenant_id} {timeline}'  \
+            -c 'basebackup {endpoint.tenant_id} {timeline_id}'  \
         | tar -x -C {restored_dir_path}
    """

--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -210,6 +210,10 @@ class PageserverHttpClient(requests.Session):
        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
        self.verbose_error(res)

+    def tenant_delete(self, tenant_id: TenantId):
+        res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
+        self.verbose_error(res)
+
    def tenant_load(self, tenant_id: TenantId):
        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load")
        self.verbose_error(res)
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -1,9 +1,11 @@
 import time
-from typing import Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional

 from fixtures.log_helper import log
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
+from fixtures.remote_storage import RemoteStorageKind, S3Storage
 from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.utils import wait_until


 def assert_tenant_state(
@@ -17,15 +19,6 @@ def assert_tenant_state(
    assert tenant_status["state"]["slug"] == expected_state, message or tenant_status


-def tenant_exists(pageserver_http: PageserverHttpClient, tenant_id: TenantId):
-    tenants = pageserver_http.tenant_list()
-    matching = [t for t in tenants if TenantId(t["id"]) == tenant_id]
-    assert len(matching) < 2
-    if len(matching) == 0:
-        return None
-    return matching[0]
-
-
 def remote_consistent_lsn(
    pageserver_http: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
 ) -> Lsn:
@@ -198,21 +191,24 @@ def wait_timeline_detail_404(
    tenant_id: TenantId,
    timeline_id: TimelineId,
    iterations: int,
+    interval: Optional[float] = None,
 ):
-    last_exc = None
-    for _ in range(iterations):
-        time.sleep(0.250)
+    if interval is None:
+        interval = 0.25
+
+    def timeline_is_missing():
+        data = {}
        try:
            data = pageserver_http.timeline_detail(tenant_id, timeline_id)
-            log.info(f"detail {data}")
+            log.info(f"timeline detail {data}")
        except PageserverApiException as e:
            log.debug(e)
            if e.status_code == 404:
                return

-            last_exc = e
+        raise RuntimeError(f"Timeline exists state {data.get('state')}")

-    raise last_exc or RuntimeError(f"Timeline wasnt deleted in time, state: {data['state']}")
+    wait_until(iterations, interval, func=timeline_is_missing)


 def timeline_delete_wait_completed(
@@ -220,7 +216,77 @@ def timeline_delete_wait_completed(
    tenant_id: TenantId,
    timeline_id: TimelineId,
    iterations: int = 20,
+    interval: Optional[float] = None,
    **delete_args,
 ):
    pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
-    wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations)
+    wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval)
+
+
+if TYPE_CHECKING:
+    # TODO avoid by combining remote storage related stuff in single type
+    # and just passing in this type instead of whole builder
+    from fixtures.neon_fixtures import NeonEnvBuilder
+
+
+def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
+    # For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api.
+    assert neon_env_builder.remote_storage_kind in (
+        RemoteStorageKind.MOCK_S3,
+        RemoteStorageKind.REAL_S3,
+    )
+    # For mypy
+    assert isinstance(neon_env_builder.remote_storage, S3Storage)
+    assert neon_env_builder.remote_storage_client is not None
+
+    # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
+    response = neon_env_builder.remote_storage_client.list_objects_v2(
+        Bucket=neon_env_builder.remote_storage.bucket_name,
+        Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "",
+    )
+    objects = response.get("Contents")
+    assert (
+        response["KeyCount"] == 0
+    ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
+
+
+def wait_tenant_status_404(
+    pageserver_http: PageserverHttpClient,
+    tenant_id: TenantId,
+    iterations: int,
+    interval: float = 0.250,
+):
+    def tenant_is_missing():
+        data = {}
+        try:
+            data = pageserver_http.tenant_status(tenant_id)
+            log.info(f"tenant status {data}")
+        except PageserverApiException as e:
+            log.debug(e)
+            if e.status_code == 404:
+                return
+
+        raise RuntimeError(f"Timeline exists state {data.get('state')}")
+
+    wait_until(iterations, interval=interval, func=tenant_is_missing)
+
+
+def tenant_delete_wait_completed(
+    pageserver_http: PageserverHttpClient,
+    tenant_id: TenantId,
+    iterations: int,
+):
+    pageserver_http.tenant_delete(tenant_id=tenant_id)
+    wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations)
+
+
+MANY_SMALL_LAYERS_TENANT_CONFIG = {
+    "gc_period": "0s",
+    "compaction_period": "0s",
+    "checkpoint_distance": f"{1024**2}",
+    "image_creation_threshold": "100",
+}
+
+
+def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int:
+    return 30 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 10
--- a/test_runner/fixtures/port_distributor.py
+++ b/test_runner/fixtures/port_distributor.py
@@ -0,0 +1,77 @@
+import re
+import socket
+from contextlib import closing
+from typing import Dict, Union
+
+from fixtures.log_helper import log
+
+
+def can_bind(host: str, port: int) -> bool:
+    """
+    Check whether a host:port is available to bind for listening
+
+    Inspired by the can_bind() perl function used in Postgres tests, in
+    vendor/postgres-v14/src/test/perl/PostgresNode.pm
+    """
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+        # TODO: The pageserver and safekeepers don't use SO_REUSEADDR at the
+        # moment. If that changes, we should use start using SO_REUSEADDR here
+        # too, to allow reusing ports more quickly.
+        # See https://github.com/neondatabase/neon/issues/801
+        # sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+
+        try:
+            sock.bind((host, port))
+            sock.listen()
+            return True
+        except socket.error:
+            log.info(f"Port {port} is in use, skipping")
+            return False
+        finally:
+            sock.close()
+
+
+class PortDistributor:
+    def __init__(self, base_port: int, port_number: int):
+        self.iterator = iter(range(base_port, base_port + port_number))
+        self.port_map: Dict[int, int] = {}
+
+    def get_port(self) -> int:
+        for port in self.iterator:
+            if can_bind("localhost", port):
+                return port
+        raise RuntimeError(
+            "port range configured for test is exhausted, consider enlarging the range"
+        )
+
+    def replace_with_new_port(self, value: Union[int, str]) -> Union[int, str]:
+        """
+        Returns a new port for a port number in a string (like "localhost:1234") or int.
+        Replacements are memorised, so a substitution for the same port is always the same.
+        """
+
+        # TODO: replace with structural pattern matching for Python >= 3.10
+        if isinstance(value, int):
+            return self._replace_port_int(value)
+
+        if isinstance(value, str):
+            return self._replace_port_str(value)
+
+        raise TypeError(f"unsupported type {type(value)} of {value=}")
+
+    def _replace_port_int(self, value: int) -> int:
+        known_port = self.port_map.get(value)
+        if known_port is None:
+            known_port = self.port_map[value] = self.get_port()
+
+        return known_port
+
+    def _replace_port_str(self, value: str) -> str:
+        # Use regex to find port in a string
+        # urllib.parse.urlparse produces inconvenient results for cases without scheme like "localhost:5432"
+        # See https://bugs.python.org/issue27657
+        ports = re.findall(r":(\d+)(?:/|$)", value)
+        assert len(ports) == 1, f"can't find port in {value}"
+        port_int = int(ports[0])
+
+        return value.replace(f":{port_int}", f":{self._replace_port_int(port_int)}")
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -0,0 +1,143 @@
+import enum
+import json
+import os
+import subprocess
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+
+from fixtures.log_helper import log
+
+
+class MockS3Server:
+    """
+    Starts a mock S3 server for testing on a port given, errors if the server fails to start or exits prematurely.
+    Relies that `poetry` and `moto` server are installed, since it's the way the tests are run.
+
+    Also provides a set of methods to derive the connection properties from and the method to kill the underlying server.
+    """
+
+    def __init__(
+        self,
+        port: int,
+    ):
+        self.port = port
+
+        # XXX: do not use `shell=True` or add `exec ` to the command here otherwise.
+        # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux
+        # if a process is started from the shell process.
+        self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", "s3", f"-p{port}"])
+        error = None
+        try:
+            return_code = self.subprocess.poll()
+            if return_code is not None:
+                error = f"expected mock s3 server to run but it exited with code {return_code}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'"
+        except Exception as e:
+            error = f"expected mock s3 server to start but it failed with exception: {e}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'"
+        if error is not None:
+            log.error(error)
+            self.kill()
+            raise RuntimeError("failed to start s3 mock server")
+
+    def endpoint(self) -> str:
+        return f"http://127.0.0.1:{self.port}"
+
+    def region(self) -> str:
+        return "us-east-1"
+
+    def access_key(self) -> str:
+        return "test"
+
+    def secret_key(self) -> str:
+        return "test"
+
+    def kill(self):
+        self.subprocess.kill()
+
+
+@enum.unique
+class RemoteStorageKind(str, enum.Enum):
+    LOCAL_FS = "local_fs"
+    MOCK_S3 = "mock_s3"
+    REAL_S3 = "real_s3"
+    # Pass to tests that are generic to remote storage
+    # to ensure the test pass with or without the remote storage
+    NOOP = "noop"
+
+
+def available_remote_storages() -> List[RemoteStorageKind]:
+    remote_storages = [RemoteStorageKind.LOCAL_FS, RemoteStorageKind.MOCK_S3]
+    if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None:
+        remote_storages.append(RemoteStorageKind.REAL_S3)
+        log.info("Enabling real s3 storage for tests")
+    else:
+        log.info("Using mock implementations to test remote storage")
+    return remote_storages
+
+
+def available_s3_storages() -> List[RemoteStorageKind]:
+    remote_storages = [RemoteStorageKind.MOCK_S3]
+    if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None:
+        remote_storages.append(RemoteStorageKind.REAL_S3)
+        log.info("Enabling real s3 storage for tests")
+    else:
+        log.info("Using mock implementations to test remote storage")
+    return remote_storages
+
+
+@dataclass
+class LocalFsStorage:
+    root: Path
+
+
+@dataclass
+class S3Storage:
+    bucket_name: str
+    bucket_region: str
+    access_key: str
+    secret_key: str
+    endpoint: Optional[str] = None
+    prefix_in_bucket: Optional[str] = ""
+
+    def access_env_vars(self) -> Dict[str, str]:
+        return {
+            "AWS_ACCESS_KEY_ID": self.access_key,
+            "AWS_SECRET_ACCESS_KEY": self.secret_key,
+        }
+
+    def to_string(self) -> str:
+        return json.dumps(
+            {
+                "bucket": self.bucket_name,
+                "region": self.bucket_region,
+                "endpoint": self.endpoint,
+                "prefix": self.prefix_in_bucket,
+            }
+        )
+
+
+RemoteStorage = Union[LocalFsStorage, S3Storage]
+
+
+# serialize as toml inline table
+def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str:
+    if isinstance(remote_storage, LocalFsStorage):
+        remote_storage_config = f"local_path='{remote_storage.root}'"
+    elif isinstance(remote_storage, S3Storage):
+        remote_storage_config = f"bucket_name='{remote_storage.bucket_name}',\
+            bucket_region='{remote_storage.bucket_region}'"
+
+        if remote_storage.prefix_in_bucket is not None:
+            remote_storage_config += f",prefix_in_bucket='{remote_storage.prefix_in_bucket}'"
+
+        if remote_storage.endpoint is not None:
+            remote_storage_config += f",endpoint='{remote_storage.endpoint}'"
+    else:
+        raise Exception("invalid remote storage type")
+
+    return f"{{{remote_storage_config}}}"
+
+
+class RemoteStorageUsers(enum.Flag):
+    PAGESERVER = enum.auto()
+    SAFEKEEPER = enum.auto()
--- a/test_runner/fixtures/types.py
+++ b/test_runner/fixtures/types.py
@@ -89,6 +89,9 @@ class TenantId(Id):
    def __repr__(self) -> str:
        return f'`TenantId("{self.id.hex()}")'

+    def __str__(self) -> str:
+        return self.id.hex()
+

 class TimelineId(Id):
    def __repr__(self) -> str:
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -6,13 +6,16 @@ import subprocess
 import tarfile
 import time
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Tuple, TypeVar
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, TypeVar
 from urllib.parse import urlencode

 import allure
 from psycopg2.extensions import cursor

 from fixtures.log_helper import log
+
+if TYPE_CHECKING:
+    from fixtures.neon_fixtures import PgBin
 from fixtures.types import TimelineId

 Fn = TypeVar("Fn", bound=Callable[..., Any])
@@ -300,17 +303,13 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn):
    raise Exception("timed out while waiting for %s" % func) from last_exception


-def wait_while(number_of_iterations: int, interval: float, func):
+def run_pg_bench_small(pg_bin: "PgBin", connstr: str):
    """
-    Wait until 'func' returns false, or throws an exception.
+    Fast way to populate data.
+    For more layers consider combining with these tenant settings:
+    {
+        "checkpoint_distance": 1024 ** 2,
+        "image_creation_threshold": 100,
+    }
    """
-    for i in range(number_of_iterations):
-        try:
-            if not func():
-                return
-            log.info("waiting for %s iteration %s failed", func, i + 1)
-            time.sleep(interval)
-            continue
-        except Exception:
-            return
-    raise Exception("timed out while waiting for %s" % func)
+    pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", connstr])
--- a/test_runner/regress/data/extension_test/5670669815/v14/ext_index.json
+++ b/test_runner/regress/data/extension_test/5670669815/v14/ext_index.json
@@ -0,0 +1,24 @@
+{
+    "public_extensions": [
+        "anon",
+        "pg_buffercache"
+    ],
+    "library_index": {
+        "anon": "anon",
+        "pg_buffercache": "pg_buffercache"
+    },
+    "extension_data": {
+        "pg_buffercache": {
+            "control_data": {
+                "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
+            },
+            "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
+        },
+        "anon": {
+            "control_data": {
+                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
+            },
+            "archive_path": "5670669815/v14/extensions/anon.tar.zst"
+        }
+    }
+}
--- a/Show More
+++ b/Show More