try: catch all bad tests by removing the implicit endpoint creation

test: do not start two primary endpoints on same branch
test: allow passing branch-name to endpoint_start
2026-07-17 19:10:38 +00:00 · 2023-07-27 16:47:46 +03:00 · 2023-07-27 14:32:13 +03:00 · 2023-07-27 14:31:53 +03:00 · 2023-07-27 14:31:04 +03:00
158 changed files with 3891 additions and 9179 deletions
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -1,20 +1,7 @@
 name: 'Create Allure report'
 description: 'Generate Allure report from uploaded by actions/allure-report-store tests results'

-inputs:
-  store-test-results-into-db:
-    description: 'Whether to store test results into the database. TEST_RESULT_CONNSTR/TEST_RESULT_CONNSTR_NEW should be set'
-    type: boolean
-    required: false
-    default: false
-
 outputs:
-  base-url:
-    description: 'Base URL for Allure report'
-    value: ${{ steps.generate-report.outputs.base-url }}
-  base-s3-url:
-    description: 'Base S3 URL for Allure report'
-    value: ${{ steps.generate-report.outputs.base-s3-url }}
  report-url:
    description: 'Allure report URL'
    value: ${{ steps.generate-report.outputs.report-url }}
@@ -76,8 +63,8 @@ runs:
          rm -f ${ALLURE_ZIP}
        fi
      env:
-        ALLURE_VERSION: 2.23.1
-        ALLURE_ZIP_SHA256: 11141bfe727504b3fd80c0f9801eb317407fd0ac983ebb57e671f14bac4bcd86
+        ALLURE_VERSION: 2.22.1
+        ALLURE_ZIP_SHA256: fdc7a62d94b14c5e0bf25198ae1feded6b005fdbed864b4d3cb4e5e901720b0b

    # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
    - name: Acquire lock
@@ -115,11 +102,6 @@ runs:
        REPORT_PREFIX=reports/${BRANCH_OR_PR}
        RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID}

-        BASE_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}
-        BASE_S3_URL=s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}
-        REPORT_URL=${BASE_URL}/index.html
-        REPORT_JSON_URL=${BASE_URL}/data/suites.json
-
        # Get previously uploaded data for this run
        ZSTD_NBTHREADS=0

@@ -128,9 +110,10 @@ runs:
          # There's no previously uploaded data for this $GITHUB_RUN_ID
          exit 0
        fi
+        for S3_FILEPATH in ${S3_FILEPATHS}; do
+          time aws s3 cp --only-show-errors "s3://${BUCKET}/${S3_FILEPATH}" "${WORKDIR}"

-        time aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${RAW_PREFIX}/" "${WORKDIR}/"
-        for archive in $(find ${WORKDIR} -name "*.tar.zst"); do
+          archive=${WORKDIR}/$(basename $S3_FILEPATH)
          mkdir -p ${archive%.tar.zst}
          time tar -xf ${archive} -C ${archive%.tar.zst}
          rm -f ${archive}
@@ -147,10 +130,9 @@ runs:

        # Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
+        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"

-        # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
-        # and to keep files on the host to upload them to the database
-        time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
+        REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html

        # Generate redirect
        cat <<EOF > ${WORKDIR}/index.html
@@ -162,10 +144,8 @@ runs:
        EOF
        time aws s3 cp --only-show-errors ${WORKDIR}/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"

-        echo "base-url=${BASE_URL}"               >> $GITHUB_OUTPUT
-        echo "base-s3-url=${BASE_S3_URL}"         >> $GITHUB_OUTPUT
-        echo "report-url=${REPORT_URL}"           >> $GITHUB_OUTPUT
-        echo "report-json-url=${REPORT_JSON_URL}" >> $GITHUB_OUTPUT
+        echo "report-url=${REPORT_URL}"                                   >> $GITHUB_OUTPUT
+        echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT

        echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}

@@ -179,41 +159,6 @@ runs:
          aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
        fi

-    - name: Store Allure test stat in the DB
-      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
-      shell: bash -euxo pipefail {0}
-      env:
-        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-        REPORT_JSON_URL: ${{ steps.generate-report.outputs.report-json-url }}
-      run: |
-        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR}
-
-        ./scripts/pysync
-
-        poetry run python3 scripts/ingest_regress_test_result.py \
-          --revision ${COMMIT_SHA} \
-          --reference ${GITHUB_REF} \
-          --build-type unified \
-          --ingest ${WORKDIR}/report/data/suites.json
-
-    - name: Store Allure test stat in the DB (new)
-      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
-      shell: bash -euxo pipefail {0}
-      env:
-        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-        BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
-      run: |
-        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}
-
-        ./scripts/pysync
-
-        poetry run python3 scripts/ingest_regress_test_result-new-format.py \
-          --reference ${GITHUB_REF} \
-          --revision ${COMMIT_SHA} \
-          --run-id ${GITHUB_RUN_ID} \
-          --run-attempt ${GITHUB_RUN_ATTEMPT} \
-          --test-cases-dir ${WORKDIR}/report/data/test-cases
-
    - name: Cleanup
      if: always()
      shell: bash -euxo pipefail {0}
--- a/.github/actions/download/action.yml
+++ b/.github/actions/download/action.yml
@@ -31,7 +31,7 @@ runs:
        BUCKET=neon-github-public-dev
        FILENAME=$(basename $ARCHIVE)

-        S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
+        S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
        if [ -z "${S3_KEY}" ]; then
          if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then
            echo 'SKIPPED=true' >> $GITHUB_OUTPUT
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -432,11 +432,6 @@ jobs:
        if: ${{ !cancelled() }}
        id: create-allure-report
        uses: ./.github/actions/allure-report-generate
-        with:
-          store-test-results-into-db: true
-        env:
-          REGRESS_TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
-          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

      - uses: actions/github-script@v6
        if: ${{ !cancelled() }}
@@ -457,6 +452,25 @@ jobs:
              report,
            })

+      - name: Store Allure test stat in the DB
+        if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }}
+        env:
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }}
+          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
+        run: |
+          ./scripts/pysync
+
+          curl --fail --output suites.json "${REPORT_JSON_URL}"
+          export BUILD_TYPE=unified
+          export DATABASE_URL="$TEST_RESULT_CONNSTR"
+
+          poetry run python3 scripts/ingest_regress_test_result.py \
+            --revision ${COMMIT_SHA} \
+            --reference ${GITHUB_REF} \
+            --build-type ${BUILD_TYPE} \
+            --ingest suites.json
+
  coverage-report:
    runs-on: [ self-hosted, gen3, small ]
    container:
@@ -780,7 +794,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.15.4
+      VM_BUILDER_VERSION: v0.13.1

    steps:
      - name: Checkout
@@ -1053,7 +1067,7 @@ jobs:
            OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
            FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst

-            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
+            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
            if [ -z "${S3_KEY}" ]; then
              echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
              exit 1
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -639,12 +639,6 @@ dependencies = [
 "vsimd",
 ]

-[[package]]
-name = "base64ct"
-version = "1.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
-
 [[package]]
 name = "bincode"
 version = "1.3.3"
@@ -746,9 +740,6 @@ name = "cc"
 version = "1.0.79"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
-dependencies = [
- "jobserver",
-]

 [[package]]
 name = "cexpr"
@@ -892,8 +883,6 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "chrono",
- "regex",
- "remote_storage",
 "serde",
 "serde_json",
 "serde_with",
@@ -918,14 +907,12 @@ dependencies = [
 "opentelemetry",
 "postgres",
 "regex",
- "remote_storage",
 "reqwest",
 "serde",
 "serde_json",
 "tar",
 "tokio",
 "tokio-postgres",
- "toml_edit",
 "tracing",
 "tracing-opentelemetry",
 "tracing-subscriber",
@@ -933,7 +920,6 @@ dependencies = [
 "url",
 "utils",
 "workspace_hack",
- "zstd",
 ]

 [[package]]
@@ -994,7 +980,6 @@ dependencies = [
 "tar",
 "thiserror",
 "toml",
- "tracing",
 "url",
 "utils",
 "workspace_hack",
@@ -1018,9 +1003,9 @@ checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"

 [[package]]
 name = "cpufeatures"
-version = "0.2.9"
+version = "0.2.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1"
+checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58"
 dependencies = [
 "libc",
 ]
@@ -1200,15 +1185,15 @@ dependencies = [

 [[package]]
 name = "dashmap"
-version = "5.5.0"
+version = "5.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
+checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc"
 dependencies = [
 "cfg-if",
- "hashbrown 0.14.0",
+ "hashbrown 0.12.3",
 "lock_api",
 "once_cell",
- "parking_lot_core 0.9.8",
+ "parking_lot_core 0.9.7",
 ]

 [[package]]
@@ -1657,12 +1642,6 @@ dependencies = [
 "ahash",
 ]

-[[package]]
-name = "hashbrown"
-version = "0.14.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
-
 [[package]]
 name = "hashlink"
 version = "0.8.2"
@@ -1993,15 +1972,6 @@ version = "1.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"

-[[package]]
-name = "jobserver"
-version = "0.1.26"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2"
-dependencies = [
- "libc",
-]
-
 [[package]]
 name = "js-sys"
 version = "0.3.63"
@@ -2087,9 +2057,9 @@ checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"

 [[package]]
 name = "lock_api"
-version = "0.4.10"
+version = "0.4.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16"
+checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df"
 dependencies = [
 "autocfg",
 "scopeguard",
@@ -2353,9 +2323,9 @@ dependencies = [

 [[package]]
 name = "once_cell"
-version = "1.18.0"
+version = "1.17.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
+checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"

 [[package]]
 name = "oorandom"
@@ -2654,7 +2624,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
 dependencies = [
 "lock_api",
- "parking_lot_core 0.9.8",
+ "parking_lot_core 0.9.7",
 ]

 [[package]]
@@ -2673,26 +2643,15 @@ dependencies = [

 [[package]]
 name = "parking_lot_core"
-version = "0.9.8"
+version = "0.9.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447"
+checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521"
 dependencies = [
 "cfg-if",
 "libc",
- "redox_syscall 0.3.5",
+ "redox_syscall 0.2.16",
 "smallvec",
- "windows-targets 0.48.0",
-]
-
-[[package]]
-name = "password-hash"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "346f04948ba92c43e8469c1ee6736c7563d71012b17d40745260fe106aac2166"
-dependencies = [
- "base64ct",
- "rand_core",
- "subtle",
+ "windows-sys 0.45.0",
 ]

 [[package]]
@@ -2703,8 +2662,6 @@ checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
 dependencies = [
 "digest",
 "hmac",
- "password-hash",
- "sha2",
 ]

 [[package]]
@@ -3083,7 +3040,6 @@ dependencies = [
 "chrono",
 "clap",
 "consumption_metrics",
- "dashmap",
 "futures",
 "git-version",
 "hashbrown 0.13.2",
@@ -3281,7 +3237,6 @@ dependencies = [
 "metrics",
 "once_cell",
 "pin-project-lite",
- "scopeguard",
 "serde",
 "serde_json",
 "tempfile",
@@ -5341,7 +5296,6 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "bytes",
- "cc",
 "chrono",
 "clap",
 "clap_builder",
@@ -5442,33 +5396,3 @@ name = "zeroize"
 version = "1.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9"
-
-[[package]]
-name = "zstd"
-version = "0.12.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
-dependencies = [
- "zstd-safe",
-]
-
-[[package]]
-name = "zstd-safe"
-version = "6.0.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
-dependencies = [
- "libc",
- "zstd-sys",
-]
-
-[[package]]
-name = "zstd-sys"
-version = "2.0.8+zstd.1.5.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c"
-dependencies = [
- "cc",
- "libc",
- "pkg-config",
-]
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -54,7 +54,6 @@ comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-utils = "0.8.5"
-dashmap = "5.5.0"
 either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
@@ -89,7 +88,7 @@ opentelemetry = "0.19.0"
 opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.11.0"
 parking_lot = "0.12"
-pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
+pbkdf2 = "0.12.1"
 pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
--- a/Dockerfile
+++ b/Dockerfile
@@ -51,7 +51,6 @@ RUN set -e \
      --bin safekeeper  \
      --bin storage_broker  \
      --bin proxy  \
-      --bin neon_local \
      --locked --release \
    && cachepot -s

@@ -77,7 +76,6 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker         /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local               /usr/local/bin

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -551,8 +551,10 @@ FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

 ENV PATH "/usr/local/pgsql/bin/:$PATH"
-RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \
-    echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \
+# eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703 made on 15/07/2023
+# There is no release tag yet
+RUN wget https://github.com/neondatabase/pg_embedding/archive/eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703.tar.gz -O pg_embedding.tar.gz && \
+    echo "030846df723652f99a8689ce63b66fa0c23477a7fd723533ab8a6b28ab70730f pg_embedding.tar.gz" | sha256sum --check && \
    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -816,7 +818,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libxml2, libxslt1.1 for xml2
 # libzstd1 for zstd
 # libboost*, libfreetype6, and zlib1g for rdkit
-# ca-certificates for communicating with s3 by compute_ctl
 RUN apt update &&  \
    apt install --no-install-recommends -y \
        gdb \
@@ -840,8 +841,7 @@ RUN apt update &&  \
        libcurl4-openssl-dev \
        locales \
        procps \
-        zlib1g \
-        ca-certificates && \
+        zlib1g && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

--- a/Makefile
+++ b/Makefile
@@ -108,8 +108,6 @@ postgres-%: postgres-configure-% \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
 	+@echo "Compiling pageinspect $*"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
-	+@echo "Compiling amcheck $*"
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install

 .PHONY: postgres-clean-%
 postgres-clean-%:
--- a/README.md
+++ b/README.md
@@ -29,13 +29,13 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
 ```bash
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
 libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
-libcurl4-openssl-dev openssl python-poetry
+libcurl4-openssl-dev
 ```
 * On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
  libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
-  protobuf-devel libcurl-devel openssl poetry
+  protobuf-devel libcurl-devel
 ```
 * On Arch based systems, these packages are needed:
 ```bash
@@ -235,13 +235,6 @@ CARGO_BUILD_FLAGS="--features=testing" make
 ./scripts/pytest
 ```

-By default, this runs both debug and release modes, and all supported postgres versions. When
-testing locally, it is convenient to run just run one set of permutations, like this:
-
-```sh
-DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
-```
-
 ## Documentation

 [docs](/docs) Contains a top-level overview of all available markdown documentation.
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -32,6 +32,3 @@ url.workspace = true
 compute_api.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
-toml_edit.workspace = true
-remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
-zstd = "0.12.4"
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -5,8 +5,6 @@
 //! - `compute_ctl` accepts cluster (compute node) specification as a JSON file.
 //! - Every start is a fresh start, so the data directory is removed and
 //!   initialized again on each run.
-//! - If remote_extension_config is provided, it will be used to fetch extensions list
-//!  and download `shared_preload_libraries` from the remote storage.
 //! - Next it will put configuration files into the `PGDATA` directory.
 //! - Sync safekeepers and get commit LSN.
 //! - Get `basebackup` from pageserver using the returned on the previous step LSN.
@@ -29,8 +27,7 @@
 //! compute_ctl -D /var/db/postgres/compute \
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
 //!             -S /var/db/postgres/specs/current.json \
-//!             -b /usr/local/bin/postgres \
-//!             -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}
+//!             -b /usr/local/bin/postgres
 //! ```
 //!
 use std::collections::HashMap;
@@ -38,7 +35,7 @@ use std::fs::File;
 use std::panic;
 use std::path::Path;
 use std::process::exit;
-use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
+use std::sync::{mpsc, Arc, Condvar, Mutex};
 use std::{thread, time::Duration};

 use anyhow::{Context, Result};
@@ -51,33 +48,22 @@ use compute_api::responses::ComputeStatus;

 use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
-use compute_tools::extension_server::{get_pg_version, init_remote_storage};
 use compute_tools::http::api::launch_http_server;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;

-// this is an arbitrary build tag. Fine as a default / for testing purposes
-// in-case of not-set environment var
-const BUILD_TAG_DEFAULT: &str = "5670669815";
+const BUILD_TAG_DEFAULT: &str = "local";

 fn main() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;

-    let build_tag = option_env!("BUILD_TAG")
-        .unwrap_or(BUILD_TAG_DEFAULT)
-        .to_string();
+    let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
+
    info!("build_tag: {build_tag}");

    let matches = cli().get_matches();
-    let pgbin_default = String::from("postgres");
-    let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
-
-    let remote_ext_config = matches.get_one::<String>("remote-ext-config");
-    let ext_remote_storage = remote_ext_config.map(|x| {
-        init_remote_storage(x).expect("cannot initialize remote extension storage from config")
-    });

    let http_port = *matches
        .get_one::<u16>("http-port")
@@ -142,12 +128,14 @@ fn main() -> Result<()> {
    let compute_id = matches.get_one::<String>("compute-id");
    let control_plane_uri = matches.get_one::<String>("control-plane-uri");

+    // Try to use just 'postgres' if no path is provided
+    let pgbin = matches.get_one::<String>("pgbin").unwrap();
+
    let spec;
    let mut live_config_allowed = false;
    match spec_json {
        // First, try to get cluster spec from the cli argument
        Some(json) => {
-            info!("got spec from cli argument {}", json);
            spec = Some(serde_json::from_str(json)?);
        }
        None => {
@@ -180,10 +168,8 @@ fn main() -> Result<()> {

    let mut new_state = ComputeState::new();
    let spec_set;
-
    if let Some(spec) = spec {
        let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
-        info!("new pspec.spec: {:?}", pspec.spec);
        new_state.pspec = Some(pspec);
        spec_set = true;
    } else {
@@ -193,35 +179,20 @@ fn main() -> Result<()> {
        connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
        pgdata: pgdata.to_string(),
        pgbin: pgbin.to_string(),
-        pgversion: get_pg_version(pgbin),
        live_config_allowed,
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
-        ext_remote_storage,
-        ext_download_progress: RwLock::new(HashMap::new()),
-        build_tag,
    };
    let compute = Arc::new(compute_node);

-    // If this is a pooled VM, prewarm before starting HTTP server and becoming
-    // available for binding. Prewarming helps postgres start quicker later,
-    // because QEMU will already have it's memory allocated from the host, and
-    // the necessary binaries will alreaady be cached.
-    if !spec_set {
-        compute.prewarm_postgres()?;
-    }
-
    // Launch http service first, so we were able to serve control-plane
    // requests, while configuration is still in progress.
    let _http_handle =
        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");

-    let extension_server_port: u16 = http_port;
-
    if !spec_set {
        // No spec provided, hang waiting for it.
        info!("no compute spec provided, waiting");
-
        let mut state = compute.state.lock().unwrap();
        while state.status != ComputeStatus::ConfigurationPending {
            state = compute.state_changed.wait(state).unwrap();
@@ -258,7 +229,7 @@ fn main() -> Result<()> {
    // Start Postgres
    let mut delay_exit = false;
    let mut exit_code = None;
-    let pg = match compute.start_compute(extension_server_port) {
+    let pg = match compute.start_compute() {
        Ok(pg) => Some(pg),
        Err(err) => {
            error!("could not start the compute node: {:?}", err);
@@ -387,12 +358,6 @@ fn cli() -> clap::Command {
                .long("control-plane-uri")
                .value_name("CONTROL_PLANE_API_BASE_URI"),
        )
-        .arg(
-            Arg::new("remote-ext-config")
-                .short('r')
-                .long("remote-ext-config")
-                .value_name("REMOTE_EXT_CONFIG"),
-        )
 }

 #[test]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,20 +1,16 @@
-use std::collections::HashMap;
 use std::fs;
 use std::io::BufRead;
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
-use std::sync::{Condvar, Mutex, RwLock};
-use std::time::Instant;
+use std::sync::{Condvar, Mutex};

 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
-use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use postgres::{Client, NoTls};
-use tokio;
 use tokio_postgres;
 use tracing::{error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
@@ -24,12 +20,10 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
 use compute_api::spec::{ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
-
+use crate::config;
 use crate::pg_helpers::*;
 use crate::spec::*;
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
-use crate::{config, extension_server};

 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
@@ -37,7 +31,6 @@ pub struct ComputeNode {
    pub connstr: url::Url,
    pub pgdata: String,
    pub pgbin: String,
-    pub pgversion: String,
    /// We should only allow live re- / configuration of the compute node if
    /// it uses 'pull model', i.e. it can go to control-plane and fetch
    /// the latest configuration. Otherwise, there could be a case:
@@ -57,19 +50,6 @@ pub struct ComputeNode {
    pub state: Mutex<ComputeState>,
    /// `Condvar` to allow notifying waiters about state changes.
    pub state_changed: Condvar,
-    ///  the S3 bucket that we search for extensions in
-    pub ext_remote_storage: Option<GenericRemoteStorage>,
-    // key: ext_archive_name, value: started download time, download_completed?
-    pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
-    pub build_tag: String,
-}
-
-// store some metrics about download size that might impact startup time
-#[derive(Clone, Debug)]
-pub struct RemoteExtensionMetrics {
-    num_ext_downloaded: u64,
-    largest_ext_size: u64,
-    total_ext_download_size: u64,
 }

 #[derive(Clone, Debug)]
@@ -280,7 +260,7 @@ impl ComputeNode {
    #[instrument(skip_all, fields(%lsn))]
    fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
        let spec = compute_state.pspec.as_ref().expect("spec must be set");
-        let start_time = Instant::now();
+        let start_time = Utc::now();

        let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;

@@ -293,10 +273,7 @@ impl ComputeNode {
            info!("Storage auth token not set");
        }

-        // Connect to pageserver
        let mut client = config.connect(NoTls)?;
-        let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
-
        let basebackup_cmd = match lsn {
            // HACK We don't use compression on first start (Lsn(0)) because there's no API for it
            Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id),
@@ -342,10 +319,13 @@ impl ComputeNode {
        };

        // Report metrics
-        let mut state = self.state.lock().unwrap();
-        state.metrics.pageserver_connect_micros = pageserver_connect_micros;
-        state.metrics.basebackup_bytes = measured_reader.get_byte_count() as u64;
-        state.metrics.basebackup_ms = start_time.elapsed().as_millis() as u64;
+        self.state.lock().unwrap().metrics.basebackup_bytes =
+            measured_reader.get_byte_count() as u64;
+        self.state.lock().unwrap().metrics.basebackup_ms = Utc::now()
+            .signed_duration_since(start_time)
+            .to_std()
+            .unwrap()
+            .as_millis() as u64;
        Ok(())
    }

@@ -493,22 +473,14 @@ impl ComputeNode {
    /// Do all the preparations like PGDATA directory creation, configuration,
    /// safekeepers sync, basebackup, etc.
    #[instrument(skip_all)]
-    pub fn prepare_pgdata(
-        &self,
-        compute_state: &ComputeState,
-        extension_server_port: u16,
-    ) -> Result<()> {
+    pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        let spec = &pspec.spec;
        let pgdata_path = Path::new(&self.pgdata);

        // Remove/create an empty pgdata directory and put configuration there.
        self.create_pgdata()?;
-        config::write_postgres_conf(
-            &pgdata_path.join("postgresql.conf"),
-            &pspec.spec,
-            Some(extension_server_port),
-        )?;
+        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;

        // Syncing safekeepers is only safe with primary nodes: if a primary
        // is already connected it will be kicked out, so a secondary (standby)
@@ -560,50 +532,6 @@ impl ComputeNode {
        Ok(())
    }

-    /// Start and stop a postgres process to warm up the VM for startup.
-    pub fn prewarm_postgres(&self) -> Result<()> {
-        info!("prewarming");
-
-        // Create pgdata
-        let pgdata = &format!("{}.warmup", self.pgdata);
-        create_pgdata(pgdata)?;
-
-        // Run initdb to completion
-        info!("running initdb");
-        let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
-        Command::new(initdb_bin)
-            .args(["-D", pgdata])
-            .output()
-            .expect("cannot start initdb process");
-
-        // Write conf
-        use std::io::Write;
-        let conf_path = Path::new(pgdata).join("postgresql.conf");
-        let mut file = std::fs::File::create(conf_path)?;
-        writeln!(file, "shared_buffers=65536")?;
-        writeln!(file, "port=51055")?; // Nobody should be connecting
-        writeln!(file, "shared_preload_libraries = 'neon'")?;
-
-        // Start postgres
-        info!("starting postgres");
-        let mut pg = Command::new(&self.pgbin)
-            .args(["-D", pgdata])
-            .spawn()
-            .expect("cannot start postgres process");
-
-        // Stop it when it's ready
-        info!("waiting for postgres");
-        wait_for_postgres(&mut pg, Path::new(pgdata))?;
-        pg.kill()?;
-        info!("sent kill signal");
-        pg.wait()?;
-        info!("done prewarming");
-
-        // clean up
-        let _ok = fs::remove_dir_all(pgdata);
-        Ok(())
-    }
-
    /// Start Postgres as a child process and manage DBs/roles.
    /// After that this will hang waiting on the postmaster process to exit.
    #[instrument(skip_all)]
@@ -698,7 +626,7 @@ impl ComputeNode {

        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
+        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
        self.pg_reload_conf(&mut client)?;
@@ -728,7 +656,7 @@ impl ComputeNode {
    }

    #[instrument(skip_all)]
-    pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
+    pub fn start_compute(&self) -> Result<std::process::Child> {
        let compute_state = self.state.lock().unwrap().clone();
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
@@ -739,38 +667,7 @@ impl ComputeNode {
            pspec.timeline_id,
        );

-        info!(
-            "start_compute spec.remote_extensions {:?}",
-            pspec.spec.remote_extensions
-        );
-
-        // This part is sync, because we need to download
-        // remote shared_preload_libraries before postgres start (if any)
-        if let Some(remote_extensions) = &pspec.spec.remote_extensions {
-            // First, create control files for all availale extensions
-            extension_server::create_control_files(remote_extensions, &self.pgbin);
-
-            let library_load_start_time = Utc::now();
-            let remote_ext_metrics = self.prepare_preload_libraries(&pspec.spec)?;
-
-            let library_load_time = Utc::now()
-                .signed_duration_since(library_load_start_time)
-                .to_std()
-                .unwrap()
-                .as_millis() as u64;
-            let mut state = self.state.lock().unwrap();
-            state.metrics.load_ext_ms = library_load_time;
-            state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded;
-            state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size;
-            state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size;
-            info!(
-                "Loading shared_preload_libraries took {:?}ms",
-                library_load_time
-            );
-            info!("{:?}", remote_ext_metrics);
-        }
-
-        self.prepare_pgdata(&compute_state, extension_server_port)?;
+        self.prepare_pgdata(&compute_state)?;

        let start_time = Utc::now();
        let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
@@ -918,172 +815,4 @@ LIMIT 100",
            "{{\"pg_stat_statements\": []}}".to_string()
        }
    }
-
-    // download an archive, unzip and place files in correct locations
-    pub async fn download_extension(
-        &self,
-        real_ext_name: String,
-        ext_path: RemotePath,
-    ) -> Result<u64, DownloadError> {
-        let remote_storage = self
-            .ext_remote_storage
-            .as_ref()
-            .ok_or(DownloadError::BadInput(anyhow::anyhow!(
-                "Remote extensions storage is not configured",
-            )))?;
-
-        let ext_archive_name = ext_path.object_name().expect("bad path");
-
-        let mut first_try = false;
-        if !self
-            .ext_download_progress
-            .read()
-            .expect("lock err")
-            .contains_key(ext_archive_name)
-        {
-            self.ext_download_progress
-                .write()
-                .expect("lock err")
-                .insert(ext_archive_name.to_string(), (Utc::now(), false));
-            first_try = true;
-        }
-        let (download_start, download_completed) =
-            self.ext_download_progress.read().expect("lock err")[ext_archive_name];
-        let start_time_delta = Utc::now()
-            .signed_duration_since(download_start)
-            .to_std()
-            .unwrap()
-            .as_millis() as u64;
-
-        // how long to wait for extension download if it was started by another process
-        const HANG_TIMEOUT: u64 = 3000; // milliseconds
-
-        if download_completed {
-            info!("extension already downloaded, skipping re-download");
-            return Ok(0);
-        } else if start_time_delta < HANG_TIMEOUT && !first_try {
-            info!("download {ext_archive_name} already started by another process, hanging untill completion or timeout");
-            let mut interval = tokio::time::interval(tokio::time::Duration::from_millis(500));
-            loop {
-                info!("waiting for download");
-                interval.tick().await;
-                let (_, download_completed_now) =
-                    self.ext_download_progress.read().expect("lock")[ext_archive_name];
-                if download_completed_now {
-                    info!("download finished by whoever else downloaded it");
-                    return Ok(0);
-                }
-            }
-            // NOTE: the above loop will get terminated
-            // based on the timeout of the download function
-        }
-
-        // if extension hasn't been downloaded before or the previous
-        // attempt to download was at least HANG_TIMEOUT ms ago
-        // then we try to download it here
-        info!("downloading new extension {ext_archive_name}");
-
-        let download_size = extension_server::download_extension(
-            &real_ext_name,
-            &ext_path,
-            remote_storage,
-            &self.pgbin,
-        )
-        .await
-        .map_err(DownloadError::Other);
-
-        self.ext_download_progress
-            .write()
-            .expect("bad lock")
-            .insert(ext_archive_name.to_string(), (download_start, true));
-
-        download_size
-    }
-
-    #[tokio::main]
-    pub async fn prepare_preload_libraries(
-        &self,
-        spec: &ComputeSpec,
-    ) -> Result<RemoteExtensionMetrics> {
-        if self.ext_remote_storage.is_none() {
-            return Ok(RemoteExtensionMetrics {
-                num_ext_downloaded: 0,
-                largest_ext_size: 0,
-                total_ext_download_size: 0,
-            });
-        }
-        let remote_extensions = spec
-            .remote_extensions
-            .as_ref()
-            .ok_or(anyhow::anyhow!("Remote extensions are not configured",))?;
-
-        info!("parse shared_preload_libraries from spec.cluster.settings");
-        let mut libs_vec = Vec::new();
-        if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
-            libs_vec = libs
-                .split(&[',', '\'', ' '])
-                .filter(|s| *s != "neon" && !s.is_empty())
-                .map(str::to_string)
-                .collect();
-        }
-        info!("parse shared_preload_libraries from provided postgresql.conf");
-
-        // that is used in neon_local and python tests
-        if let Some(conf) = &spec.cluster.postgresql_conf {
-            let conf_lines = conf.split('\n').collect::<Vec<&str>>();
-            let mut shared_preload_libraries_line = "";
-            for line in conf_lines {
-                if line.starts_with("shared_preload_libraries") {
-                    shared_preload_libraries_line = line;
-                }
-            }
-            let mut preload_libs_vec = Vec::new();
-            if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) {
-                preload_libs_vec = libs
-                    .split(&[',', '\'', ' '])
-                    .filter(|s| *s != "neon" && !s.is_empty())
-                    .map(str::to_string)
-                    .collect();
-            }
-            libs_vec.extend(preload_libs_vec);
-        }
-
-        // Don't try to download libraries that are not in the index.
-        // Assume that they are already present locally.
-        libs_vec.retain(|lib| remote_extensions.library_index.contains_key(lib));
-
-        info!("Downloading to shared preload libraries: {:?}", &libs_vec);
-
-        let mut download_tasks = Vec::new();
-        for library in &libs_vec {
-            let (ext_name, ext_path) = remote_extensions.get_ext(library, true)?;
-            download_tasks.push(self.download_extension(ext_name, ext_path));
-        }
-        let results = join_all(download_tasks).await;
-
-        let mut remote_ext_metrics = RemoteExtensionMetrics {
-            num_ext_downloaded: 0,
-            largest_ext_size: 0,
-            total_ext_download_size: 0,
-        };
-        for result in results {
-            let download_size = match result {
-                Ok(res) => {
-                    remote_ext_metrics.num_ext_downloaded += 1;
-                    res
-                }
-                Err(err) => {
-                    // if we failed to download an extension, we don't want to fail the whole
-                    // process, but we do want to log the error
-                    error!("Failed to download extension: {}", err);
-                    0
-                }
-            };
-
-            remote_ext_metrics.largest_ext_size =
-                std::cmp::max(remote_ext_metrics.largest_ext_size, download_size);
-            remote_ext_metrics.total_ext_download_size += download_size;
-        }
-        Ok(remote_ext_metrics)
-    }
 }
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -33,11 +33,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
 }

 /// Create or completely rewrite configuration file specified by `path`
-pub fn write_postgres_conf(
-    path: &Path,
-    spec: &ComputeSpec,
-    extension_server_port: Option<u16>,
-) -> Result<()> {
+pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
    // File::create() destroys the file content if it exists.
    let mut file = File::create(path)?;

@@ -91,9 +87,5 @@ pub fn write_postgres_conf(
        writeln!(file, "# Managed by compute_ctl: end")?;
    }

-    if let Some(port) = extension_server_port {
-        writeln!(file, "neon.extension_server_port={}", port)?;
-    }
-
    Ok(())
 }
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -1,221 +0,0 @@
-// Download extension files from the extension store
-// and put them in the right place in the postgres directory (share / lib)
-/*
-The layout of the S3 bucket is as follows:
-5615610098 // this is an extension build number
-├── v14
-│   ├── extensions
-│   │   ├── anon.tar.zst
-│   │   └── embedding.tar.zst
-│   └── ext_index.json
-└── v15
-    ├── extensions
-    │   ├── anon.tar.zst
-    │   └── embedding.tar.zst
-    └── ext_index.json
-5615261079
-├── v14
-│   ├── extensions
-│   │   └── anon.tar.zst
-│   └── ext_index.json
-└── v15
-    ├── extensions
-    │   └── anon.tar.zst
-    └── ext_index.json
-5623261088
-├── v14
-│   ├── extensions
-│   │   └── embedding.tar.zst
-│   └── ext_index.json
-└── v15
-    ├── extensions
-    │   └── embedding.tar.zst
-    └── ext_index.json
-
-Note that build number cannot be part of prefix because we might need extensions
-from other build numbers.
-
-ext_index.json stores the control files and location of extension archives
-It also stores a list of public extensions and a library_index
-
-We don't need to duplicate extension.tar.zst files.
-We only need to upload a new one if it is updated.
-(Although currently we just upload every time anyways, hopefully will change
-this sometime)
-
-*access* is controlled by spec
-
-More specifically, here is an example ext_index.json
-{
-    "public_extensions": [
-        "anon",
-        "pg_buffercache"
-    ],
-    "library_index": {
-        "anon": "anon",
-        "pg_buffercache": "pg_buffercache"
-    },
-    "extension_data": {
-        "pg_buffercache": {
-            "control_data": {
-                "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
-            },
-            "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
-        },
-        "anon": {
-            "control_data": {
-                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
-            },
-            "archive_path": "5670669815/v14/extensions/anon.tar.zst"
-        }
-    }
-}
-*/
-use anyhow::Context;
-use anyhow::{self, Result};
-use compute_api::spec::RemoteExtSpec;
-use remote_storage::*;
-use serde_json;
-use std::io::Read;
-use std::num::{NonZeroU32, NonZeroUsize};
-use std::path::Path;
-use std::str;
-use tar::Archive;
-use tokio::io::AsyncReadExt;
-use tracing::info;
-use tracing::log::warn;
-use zstd::stream::read::Decoder;
-
-fn get_pg_config(argument: &str, pgbin: &str) -> String {
-    // gives the result of `pg_config [argument]`
-    // where argument is a flag like `--version` or `--sharedir`
-    let pgconfig = pgbin
-        .strip_suffix("postgres")
-        .expect("bad pgbin")
-        .to_owned()
-        + "/pg_config";
-    let config_output = std::process::Command::new(pgconfig)
-        .arg(argument)
-        .output()
-        .expect("pg_config error");
-    std::str::from_utf8(&config_output.stdout)
-        .expect("pg_config error")
-        .trim()
-        .to_string()
-}
-
-pub fn get_pg_version(pgbin: &str) -> String {
-    // pg_config --version returns a (platform specific) human readable string
-    // such as "PostgreSQL 15.4". We parse this to v14/v15
-    let human_version = get_pg_config("--version", pgbin);
-    if human_version.contains("15") {
-        return "v15".to_string();
-    } else if human_version.contains("14") {
-        return "v14".to_string();
-    }
-    panic!("Unsuported postgres version {human_version}");
-}
-
-// download the archive for a given extension,
-// unzip it, and place files in the appropriate locations (share/lib)
-pub async fn download_extension(
-    ext_name: &str,
-    ext_path: &RemotePath,
-    remote_storage: &GenericRemoteStorage,
-    pgbin: &str,
-) -> Result<u64> {
-    info!("Download extension {:?} from {:?}", ext_name, ext_path);
-    let mut download = remote_storage.download(ext_path).await?;
-    let mut download_buffer = Vec::new();
-    download
-        .download_stream
-        .read_to_end(&mut download_buffer)
-        .await?;
-    let download_size = download_buffer.len() as u64;
-    // it's unclear whether it is more performant to decompress into memory or not
-    // TODO: decompressing into memory can be avoided
-    let mut decoder = Decoder::new(download_buffer.as_slice())?;
-    let mut decompress_buffer = Vec::new();
-    decoder.read_to_end(&mut decompress_buffer)?;
-    let mut archive = Archive::new(decompress_buffer.as_slice());
-    let unzip_dest = pgbin
-        .strip_suffix("/bin/postgres")
-        .expect("bad pgbin")
-        .to_string()
-        + "/download_extensions";
-    archive.unpack(&unzip_dest)?;
-    info!("Download + unzip {:?} completed successfully", &ext_path);
-
-    let sharedir_paths = (
-        unzip_dest.to_string() + "/share/extension",
-        Path::new(&get_pg_config("--sharedir", pgbin)).join("extension"),
-    );
-    let libdir_paths = (
-        unzip_dest.to_string() + "/lib",
-        Path::new(&get_pg_config("--pkglibdir", pgbin)).to_path_buf(),
-    );
-    // move contents of the libdir / sharedir in unzipped archive to the correct local paths
-    for paths in [sharedir_paths, libdir_paths] {
-        let (zip_dir, real_dir) = paths;
-        info!("mv {zip_dir:?}/*  {real_dir:?}");
-        for file in std::fs::read_dir(zip_dir)? {
-            let old_file = file?.path();
-            let new_file =
-                Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?);
-            info!("moving {old_file:?} to {new_file:?}");
-
-            // extension download failed: Directory not empty (os error 39)
-            match std::fs::rename(old_file, new_file) {
-                Ok(()) => info!("move succeeded"),
-                Err(e) => {
-                    warn!("move failed, probably because the extension already exists: {e}")
-                }
-            }
-        }
-    }
-    info!("done moving extension {ext_name}");
-    Ok(download_size)
-}
-
-// Create extension control files from spec
-pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
-    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
-    for ext_data in remote_extensions.extension_data.values() {
-        for (control_name, control_content) in &ext_data.control_data {
-            let control_path = local_sharedir.join(control_name);
-            if !control_path.exists() {
-                info!("writing file {:?}{:?}", control_path, control_content);
-                std::fs::write(control_path, control_content).unwrap();
-            } else {
-                warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_path);
-            }
-        }
-    }
-}
-
-// This function initializes the necessary structs to use remote storage
-pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
-    #[derive(Debug, serde::Deserialize)]
-    struct RemoteExtJson {
-        bucket: String,
-        region: String,
-        endpoint: Option<String>,
-        prefix: Option<String>,
-    }
-    let remote_ext_json = serde_json::from_str::<RemoteExtJson>(remote_ext_config)?;
-
-    let config = S3Config {
-        bucket_name: remote_ext_json.bucket,
-        bucket_region: remote_ext_json.region,
-        prefix_in_bucket: remote_ext_json.prefix,
-        endpoint: remote_ext_json.endpoint,
-        concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
-        max_keys_per_list_response: None,
-    };
-    let config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
-        max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
-        storage: RemoteStorageKind::AwsS3(config),
-    };
-    GenericRemoteStorage::from_config(&config)
-}
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -13,7 +13,7 @@ use hyper::{Body, Method, Request, Response, Server, StatusCode};
 use num_cpus;
 use serde_json;
 use tokio::task;
-use tracing::{error, info, warn};
+use tracing::{error, info};
 use tracing_utils::http::OtelName;

 fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
@@ -121,78 +121,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

-        // download extension files from S3 on demand
-        (&Method::POST, route) if route.starts_with("/extension_server/") => {
-            info!("serving {:?} POST request", route);
-            info!("req.uri {:?}", req.uri());
-
-            // don't even try to download extensions
-            // if no remote storage is configured
-            if compute.ext_remote_storage.is_none() {
-                info!("no extensions remote storage configured");
-                let mut resp = Response::new(Body::from("no remote storage configured"));
-                *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-                return resp;
-            }
-
-            let mut is_library = false;
-            if let Some(params) = req.uri().query() {
-                info!("serving {:?} POST request with params: {}", route, params);
-                if params == "is_library=true" {
-                    is_library = true;
-                } else {
-                    let mut resp = Response::new(Body::from("Wrong request parameters"));
-                    *resp.status_mut() = StatusCode::BAD_REQUEST;
-                    return resp;
-                }
-            }
-            let filename = route.split('/').last().unwrap().to_string();
-            info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");
-
-            // get ext_name and path from spec
-            // don't lock compute_state for too long
-            let ext = {
-                let compute_state = compute.state.lock().unwrap();
-                let pspec = compute_state.pspec.as_ref().expect("spec must be set");
-                let spec = &pspec.spec;
-
-                // debug only
-                info!("spec: {:?}", spec);
-
-                let remote_extensions = match spec.remote_extensions.as_ref() {
-                    Some(r) => r,
-                    None => {
-                        info!("no remote extensions spec was provided");
-                        let mut resp = Response::new(Body::from("no remote storage configured"));
-                        *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-                        return resp;
-                    }
-                };
-
-                remote_extensions.get_ext(&filename, is_library)
-            };
-
-            match ext {
-                Ok((ext_name, ext_path)) => {
-                    match compute.download_extension(ext_name, ext_path).await {
-                        Ok(_) => Response::new(Body::from("OK")),
-                        Err(e) => {
-                            error!("extension download failed: {}", e);
-                            let mut resp = Response::new(Body::from(e.to_string()));
-                            *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-                            resp
-                        }
-                    }
-                }
-                Err(e) => {
-                    warn!("extension download failed to find extension: {}", e);
-                    let mut resp = Response::new(Body::from("failed to find file"));
-                    *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-                    resp
-                }
-            }
-        }
-
        // Return the `404 Not Found` for any other routes.
        _ => {
            let mut not_found = Response::new(Body::from("404 Not Found"));
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -139,34 +139,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/GenericError"
-  /extension_server:
-    post:
-      tags:
-      - Extension
-      summary: Download extension from S3 to local folder.
-      description: ""
-      operationId: downloadExtension
-      responses:
-        200:
-          description: Extension downloaded
-          content:
-            text/plain:
-              schema:
-                type: string
-                description: Error text or 'OK' if download succeeded.
-                example: "OK"
-        400:
-        description: Request is invalid.
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/GenericError"
-        500:
-        description: Extension download request failed.
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/GenericError"

 components:
  securitySchemes:
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -9,7 +9,6 @@ pub mod http;
 #[macro_use]
 pub mod logger;
 pub mod compute;
-pub mod extension_server;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -124,7 +124,7 @@ pub fn get_spec_from_control_plane(
 pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
    // File `postgresql.conf` is no longer included into `basebackup`, so just
    // always write all config into it creating new file.
-    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
+    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;

    update_pg_hba(pgdata_path)?;

@@ -270,7 +270,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
            }
            RoleAction::Create => {
                let mut query: String = format!(
-                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
+                    "CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("role create query: '{}'", &query);
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -32,4 +32,3 @@ utils.workspace = true

 compute_api.workspace = true
 workspace_hack.workspace = true
-tracing.workspace = true
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -652,14 +652,12 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
            )?;
        }
        "start" => {
-            let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
-            let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
+            // let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
+            // let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
            let endpoint_id = sub_args
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;

-            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
-
            // If --safekeepers argument is given, use only the listed safekeeper nodes.
            let safekeepers =
                if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
@@ -675,7 +673,10 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    env.safekeepers.iter().map(|sk| sk.id).collect()
                };

-            let endpoint = cplane.endpoints.get(endpoint_id.as_str());
+            let endpoint = cplane
+                .endpoints
+                .get(endpoint_id.as_str())
+                .ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;

            let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
                let claims = Claims::new(Some(tenant_id), Scope::Tenant);
@@ -690,63 +691,17 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .copied()
                .unwrap_or(false);

-            if let Some(endpoint) = endpoint {
-                match (&endpoint.mode, hot_standby) {
-                    (ComputeMode::Static(_), true) => {
-                        bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
-                    }
-                    (ComputeMode::Primary, true) => {
-                        bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
-                    }
-                    _ => {}
+            match (&endpoint.mode, hot_standby) {
+                (ComputeMode::Static(_), true) => {
+                    bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
                }
-                println!("Starting existing endpoint {endpoint_id}...");
-                endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
-            } else {
-                let branch_name = sub_args
-                    .get_one::<String>("branch-name")
-                    .map(|s| s.as_str())
-                    .unwrap_or(DEFAULT_BRANCH_NAME);
-                let timeline_id = env
-                    .get_branch_timeline_id(branch_name, tenant_id)
-                    .ok_or_else(|| {
-                        anyhow!("Found no timeline id for branch name '{branch_name}'")
-                    })?;
-                let lsn = sub_args
-                    .get_one::<String>("lsn")
-                    .map(|lsn_str| Lsn::from_str(lsn_str))
-                    .transpose()
-                    .context("Failed to parse Lsn from the request")?;
-                let pg_version = sub_args
-                    .get_one::<u32>("pg-version")
-                    .copied()
-                    .context("Failed to `pg-version` from the argument string")?;
-
-                let mode = match (lsn, hot_standby) {
-                    (Some(lsn), false) => ComputeMode::Static(lsn),
-                    (None, true) => ComputeMode::Replica,
-                    (None, false) => ComputeMode::Primary,
-                    (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
-                };
-
-                // when used with custom port this results in non obvious behaviour
-                // port is remembered from first start command, i e
-                // start --port X
-                // stop
-                // start <-- will also use port X even without explicit port argument
-                println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");
-
-                let ep = cplane.new_endpoint(
-                    endpoint_id,
-                    tenant_id,
-                    timeline_id,
-                    pg_port,
-                    http_port,
-                    pg_version,
-                    mode,
-                )?;
-                ep.start(&auth_token, safekeepers, remote_ext_config)?;
+                (ComputeMode::Primary, true) => {
+                    bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
+                }
+                _ => {}
            }
+            println!("Starting existing endpoint {endpoint_id}...");
+            endpoint.start(&auth_token, safekeepers)?;
        }
        "stop" => {
            let endpoint_id = sub_args
@@ -825,16 +780,6 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNod
    }
 }

-// Get list of options to append to safekeeper command invocation.
-fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec<String> {
-    init_match
-        .get_many::<String>("safekeeper-extra-opt")
-        .into_iter()
-        .flatten()
-        .map(|s| s.to_owned())
-        .collect()
-}
-
 fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    let (sub_name, sub_args) = match sub_match.subcommand() {
        Some(safekeeper_command_data) => safekeeper_command_data,
@@ -851,9 +796,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul

    match sub_name {
        "start" => {
-            let extra_opts = safekeeper_extra_opts(sub_args);
-
-            if let Err(e) = safekeeper.start(extra_opts) {
+            if let Err(e) = safekeeper.start() {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -878,8 +821,7 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
                exit(1);
            }

-            let extra_opts = safekeeper_extra_opts(sub_args);
-            if let Err(e) = safekeeper.start(extra_opts) {
+            if let Err(e) = safekeeper.start() {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -906,7 +848,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow

    for node in env.safekeepers.iter() {
        let safekeeper = SafekeeperNode::from_env(env, node);
-        if let Err(e) = safekeeper.start(vec![]) {
+        if let Err(e) = safekeeper.start() {
            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
            try_stop_all(env, false);
            exit(1);
@@ -969,14 +911,6 @@ fn cli() -> Command {

    let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);

-    let safekeeper_extra_opt_arg = Arg::new("safekeeper-extra-opt")
-        .short('e')
-        .long("safekeeper-extra-opt")
-        .num_args(1)
-        .action(ArgAction::Append)
-        .help("Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo")
-        .required(false);
-
    let tenant_id_arg = Arg::new("tenant-id")
        .long("tenant-id")
        .help("Tenant id. Represented as a hexadecimal string 32 symbols length")
@@ -1026,12 +960,6 @@ fn cli() -> Command {
        .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
        .required(false);

-    let remote_ext_config_args = Arg::new("remote-ext-config")
-        .long("remote-ext-config")
-        .num_args(1)
-        .help("Configure the S3 bucket that we search for extensions in.")
-        .required(false);
-
    let lsn_arg = Arg::new("lsn")
        .long("lsn")
        .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
@@ -1145,7 +1073,6 @@ fn cli() -> Command {
                .subcommand(Command::new("start")
                            .about("Start local safekeeper")
                            .arg(safekeeper_id_arg.clone())
-                            .arg(safekeeper_extra_opt_arg.clone())
                )
                .subcommand(Command::new("stop")
                            .about("Stop local safekeeper")
@@ -1156,7 +1083,6 @@ fn cli() -> Command {
                            .about("Restart local safekeeper")
                            .arg(safekeeper_id_arg)
                            .arg(stop_mode_arg.clone())
-                            .arg(safekeeper_extra_opt_arg)
                )
        )
        .subcommand(
@@ -1192,7 +1118,6 @@ fn cli() -> Command {
                    .arg(pg_version_arg)
                    .arg(hot_standby_arg)
                    .arg(safekeepers_arg)
-                    .arg(remote_ext_config_args)
                )
                .subcommand(
                    Command::new("stop")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -128,6 +128,20 @@ impl ComputeControlPlane {
    ) -> Result<Arc<Endpoint>> {
        let pg_port = pg_port.unwrap_or_else(|| self.get_port());
        let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
+
+        if matches!(mode, ComputeMode::Primary) {
+            // this check is not complete, as you could have a concurrent attempt at
+            // creating another primary, both reading the state before checking it here,
+            // but it's better than nothing.
+            let mut duplicates = self.endpoints.iter().filter(|(_k, v)| {
+                v.tenant_id == tenant_id && v.timeline_id == timeline_id && v.mode == mode
+            });
+
+            if let Some((key, _)) = duplicates.next() {
+                bail!("attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported.");
+            }
+        }
+
        let ep = Arc::new(Endpoint {
            endpoint_id: endpoint_id.to_owned(),
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
@@ -313,7 +327,7 @@ impl Endpoint {

                // TODO: use future host field from safekeeper spec
                // Pass the list of safekeepers to the replica so that it can connect to any of them,
-                // whichever is available.
+                // whichever is available.
                let sk_ports = self
                    .env
                    .safekeepers
@@ -420,12 +434,7 @@ impl Endpoint {
        Ok(())
    }

-    pub fn start(
-        &self,
-        auth_token: &Option<String>,
-        safekeepers: Vec<NodeId>,
-        remote_ext_config: Option<&String>,
-    ) -> Result<()> {
+    pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
        if self.status() == "running" {
            anyhow::bail!("The endpoint is already running");
        }
@@ -493,7 +502,6 @@ impl Endpoint {
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
-            remote_extensions: None,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -525,11 +533,6 @@ impl Endpoint {
            .stdin(std::process::Stdio::null())
            .stderr(logfile.try_clone()?)
            .stdout(logfile);
-
-        if let Some(remote_ext_config) = remote_ext_config {
-            cmd.args(["--remote-ext-config", remote_ext_config]);
-        }
-
        let child = cmd.spawn()?;

        // Write down the pid so we can wait for it when we want to stop
@@ -575,7 +578,9 @@ impl Endpoint {
                }
                Err(e) => {
                    if attempt == MAX_ATTEMPTS {
-                        return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
+                        return Err(e).context(
+                            "timed out waiting to connect to compute_ctl HTTP; last error: {e}",
+                        );
                    }
                }
            }
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -101,7 +101,7 @@ impl SafekeeperNode {
        self.datadir_path().join("safekeeper.pid")
    }

-    pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
+    pub fn start(&self) -> anyhow::Result<Child> {
        print!(
            "Starting safekeeper at '{}' in '{}'",
            self.pg_connection_config.raw_address(),
@@ -161,28 +161,17 @@ impl SafekeeperNode {

        let key_path = self.env.base_data_dir.join("auth_public_key.pem");
        if self.conf.auth_enabled {
-            let key_path_string = key_path
-                .to_str()
-                .with_context(|| {
-                    format!("Key path {key_path:?} cannot be represented as a unicode string")
-                })?
-                .to_owned();
            args.extend([
-                "--pg-auth-public-key-path".to_owned(),
-                key_path_string.clone(),
-            ]);
-            args.extend([
-                "--pg-tenant-only-auth-public-key-path".to_owned(),
-                key_path_string.clone(),
-            ]);
-            args.extend([
-                "--http-auth-public-key-path".to_owned(),
-                key_path_string.clone(),
+                "--auth-validation-public-key-path".to_owned(),
+                key_path
+                    .to_str()
+                    .with_context(|| {
+                        format!("Key path {key_path:?} cannot be represented as a unicode string")
+                    })?
+                    .to_owned(),
            ]);
        }

-        args.extend(extra_opts);
-
        background_process::start_process(
            &format!("safekeeper-{id}"),
            &datadir,
--- a/docs/rfcs/024-extension-loading.md
+++ b/docs/rfcs/024-extension-loading.md
@@ -1,236 +0,0 @@
-# Supporting custom user Extensions (Dynamic Extension Loading)
-Created 2023-05-03
-
-## Motivation
-
-There are many extensions in the PostgreSQL ecosystem, and not all extensions
-are of a quality that we can confidently support them. Additionally, our
-current extension inclusion mechanism has several problems because we build all
-extensions into the primary Compute image: We build the extensions every time
-we build the compute image regardless of whether we actually need to rebuild
-the image, and the inclusion of these extensions in the image adds a hard
-dependency on all supported extensions - thus increasing the image size, and
-with it the time it takes to download that image - increasing first start
-latency.
-
-This RFC proposes a dynamic loading mechanism that solves most of these
-problems.
-
-## Summary
-
-`compute_ctl` is made responsible for loading extensions on-demand into
-the container's file system for dynamically loaded extensions, and will also
-make sure that the extensions in `shared_preload_libraries` are downloaded
-before the compute node starts.
-
-## Components
-
-compute_ctl, PostgreSQL, neon (extension), Compute Host Node, Extension Store
-
-## Requirements
-
-Compute nodes with no extra extensions should not be negatively impacted by
-the existence of support for many extensions.
-
-Installing an extension into PostgreSQL should be easy.
-
-Non-preloaded extensions shouldn't impact startup latency.
-
-Uninstalled extensions shouldn't impact query latency.
-
-A small latency penalty for dynamically loaded extensions is acceptable in
-the first seconds of compute startup, but not in steady-state operations.
-
-## Proposed implementation
-
-### On-demand, JIT-loading of extensions
-
-Before postgres starts we download 
- control files for all extensions available to that compute node;
- all `shared_preload_libraries`;
-
-After postgres is running, `compute_ctl` listens for requests to load files.
-When PostgreSQL requests a file, `compute_ctl` downloads it.
-
-PostgreSQL requests files in the following cases:
- When loading a preload library set in `local_preload_libraries`
- When explicitly loading a library with `LOAD`
- Wnen creating extension with `CREATE EXTENSION` (download sql scripts, (optional) extension data files and (optional) library files)))
-
-
-#### Summary
-
-Pros:
- - Startup is only as slow as it takes to load all (shared_)preload_libraries
- - Supports BYO Extension
-
-Cons:
- - O(sizeof(extensions)) IO requirement for loading all extensions.
-
-### Alternative solutions
-
-1. Allow users to add their extensions to the base image
-   
-   Pros:
-    - Easy to deploy
-
-   Cons:
-    - Doesn't scale - first start size is dependent on image size;
-    - All extensions are shared across all users: It doesn't allow users to
-      bring their own restrictive-licensed extensions
-
-2. Bring Your Own compute image
-   
-   Pros:
-    - Still easy to deploy
-    - User can bring own patched version of PostgreSQL
-
-   Cons:
-    - First start latency is O(sizeof(extensions image))
-    - Warm instance pool for skipping pod schedule latency is not feasible with
-      O(n) custom images
-    - Support channels are difficult to manage
-
-3. Download all user extensions in bulk on compute start
-   
-   Pros:
-    - Easy to deploy
-    - No startup latency issues for "clean" users.
-    - Warm instance pool for skipping pod schedule latency is possible
-
-   Cons:
-    - Downloading all extensions in advance takes a lot of time, thus startup
-      latency issues
-
-4. Store user's extensions in persistent storage
-   
-   Pros:
-    - Easy to deploy
-    - No startup latency issues
-    - Warm instance pool for skipping pod schedule latency is possible
-
-   Cons:
-    - EC2 instances have only limited number of attachments shared between EBS
-      volumes, direct-attached NVMe drives, and ENIs.
-    - Compute instance migration isn't trivially solved for EBS mounts (e.g.
-      the device is unavailable whilst moving the mount between instances).
-    - EBS can only mount on one instance at a time (except the expensive IO2
-      device type).
-
-5. Store user's extensions in network drive
-   
-   Pros:
-    - Easy to deploy
-    - Few startup latency issues
-    - Warm instance pool for skipping pod schedule latency is possible
-
-   Cons:
-    - We'd need networked drives, and a lot of them, which would store many
-      duplicate extensions.
-    - **UNCHECKED:** Compute instance migration may not work nicely with
-      networked IOs
-
-
-### Idea extensions
-
-The extension store does not have to be S3 directly, but could be a Node-local
-caching service on top of S3. This would reduce the load on the network for
-popular extensions.
-
-## Extension Storage implementation
-
-The layout of the S3 bucket is as follows:
-```
-5615610098 // this is an extension build number
-├── v14
-│   ├── extensions
-│   │   ├── anon.tar.zst
-│   │   └── embedding.tar.zst
-│   └── ext_index.json
-└── v15
-    ├── extensions
-    │   ├── anon.tar.zst
-    │   └── embedding.tar.zst
-    └── ext_index.json
-5615261079
-├── v14
-│   ├── extensions
-│   │   └── anon.tar.zst
-│   └── ext_index.json
-└── v15
-    ├── extensions
-    │   └── anon.tar.zst
-    └── ext_index.json
-5623261088
-├── v14
-│   ├── extensions
-│   │   └── embedding.tar.zst
-│   └── ext_index.json
-└── v15
-    ├── extensions
-    │   └── embedding.tar.zst
-    └── ext_index.json
-```
-
-Note that build number cannot be part of prefix because we might need extensions
-from other build numbers.
-
-`ext_index.json` stores the control files and location of extension archives. 
-It also stores a list of public extensions and a library_index
-
-We don't need to duplicate `extension.tar.zst`` files.
-We only need to upload a new one if it is updated.
-(Although currently we just upload every time anyways, hopefully will change
-this sometime)
-
-*access* is controlled by spec
-
-More specifically, here is an example ext_index.json
-```
-{
-    "public_extensions": [
-        "anon",
-        "pg_buffercache"
-    ],
-    "library_index": {
-        "anon": "anon",
-        "pg_buffercache": "pg_buffercache"
-        // for more complex extensions like postgis
-        // we might have something like:
-        // address_standardizer: postgis
-        // postgis_tiger: postgis
-    },
-    "extension_data": {
-        "pg_buffercache": {
-            "control_data": {
-                "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
-            },
-            "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
-        },
-        "anon": {
-            "control_data": {
-                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
-            },
-            "archive_path": "5670669815/v14/extensions/anon.tar.zst"
-        }
-    }
-}
-```
-
-### How to add new extension to the Extension Storage?
-
-Simply upload build artifacts to the S3 bucket.
-Implement a CI step for that. Splitting it from compute-node-image build.
-
-### How do we deal with extension versions and updates?
-
-Currently, we rebuild extensions on every compute-node-image build and store them in the <build-version> prefix.
-This is needed to ensure that `/share` and `/lib` files are in sync.
-
-For extension updates, we rely on the PostgreSQL extension versioning mechanism (sql update scripts) and extension authors to not break backwards compatibility within one major version of PostgreSQL.
-
-### Alternatives
-
-For extensions written on trusted languages we can also adopt
-`dbdev` PostgreSQL Package Manager based on `pg_tle` by Supabase.
-This will increase the amount supported extensions and decrease the amount of work required to support them.
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -10,9 +10,6 @@ chrono.workspace = true
 serde.workspace = true
 serde_with.workspace = true
 serde_json.workspace = true
-regex.workspace = true

 utils = { path = "../utils" }
-remote_storage = { version = "0.1", path = "../remote_storage/" }
-
 workspace_hack.workspace = true
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -68,45 +68,14 @@ where
 /// Response of the /metrics.json API
 #[derive(Clone, Debug, Default, Serialize)]
 pub struct ComputeMetrics {
-    /// Time spent waiting in pool
    pub wait_for_spec_ms: u64,
-
-    /// Time spent checking if safekeepers are synced
-    pub sync_sk_check_ms: u64,
-
-    /// Time spent syncing safekeepers (walproposer.c).
-    /// In most cases this should be zero.
    pub sync_safekeepers_ms: u64,
-
-    /// Time it took to establish a pg connection to the pageserver.
-    /// This is two roundtrips, so it's a good proxy for compute-pageserver
-    /// latency. The latency is usually 0.2ms, but it's not safe to assume
-    /// that.
-    pub pageserver_connect_micros: u64,
-
-    /// Time to get basebackup from pageserver and write it to disk.
+    pub sync_sk_check_ms: u64,
    pub basebackup_ms: u64,
-
-    /// Compressed size of basebackup received.
    pub basebackup_bytes: u64,
-
-    /// Time spent starting potgres. This includes initialization of shared
-    /// buffers, preloading extensions, and other pg operations.
    pub start_postgres_ms: u64,
-
-    /// Time spent applying pg catalog updates that were made in the console
-    /// UI. This should be 0 when startup time matters, since cplane tries
-    /// to do these updates eagerly, and passes the skip_pg_catalog_updates
-    /// when it's safe to skip this step.
    pub config_ms: u64,
-
-    /// Total time, from when we receive the spec to when we're ready to take
-    /// pg connections.
    pub total_startup_ms: u64,
-    pub load_ext_ms: u64,
-    pub num_ext_downloaded: u64,
-    pub largest_ext_size: u64, // these are measured in bytes
-    pub total_ext_download_size: u64,
 }

 /// Response of the `/computes/{compute_id}/spec` control-plane API.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -3,16 +3,11 @@
 //! The spec.json file is used to pass information to 'compute_ctl'. It contains
 //! all the information needed to start up the right version of PostgreSQL,
 //! and connect it to the storage nodes.
-use std::collections::HashMap;
-
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

-use regex::Regex;
-use remote_storage::RemotePath;
-
 /// String type alias representing Postgres identifier and
 /// intended to be used for DB / role names.
 pub type PgIdent = String;
@@ -65,56 +60,6 @@ pub struct ComputeSpec {
    /// If set, 'storage_auth_token' is used as the password to authenticate to
    /// the pageserver and safekeepers.
    pub storage_auth_token: Option<String>,
-
-    // information about available remote extensions
-    pub remote_extensions: Option<RemoteExtSpec>,
-}
-
-#[derive(Clone, Debug, Default, Deserialize, Serialize)]
-pub struct RemoteExtSpec {
-    pub public_extensions: Option<Vec<String>>,
-    pub custom_extensions: Option<Vec<String>>,
-    pub library_index: HashMap<String, String>,
-    pub extension_data: HashMap<String, ExtensionData>,
-}
-
-#[derive(Clone, Debug, Serialize, Deserialize)]
-pub struct ExtensionData {
-    pub control_data: HashMap<String, String>,
-    pub archive_path: String,
-}
-
-impl RemoteExtSpec {
-    pub fn get_ext(
-        &self,
-        ext_name: &str,
-        is_library: bool,
-    ) -> anyhow::Result<(String, RemotePath)> {
-        let mut real_ext_name = ext_name;
-        if is_library {
-            // sometimes library names might have a suffix like
-            // library.so or library.so.3. We strip this off
-            // because library_index is based on the name without the file extension
-            let strip_lib_suffix = Regex::new(r"\.so.*").unwrap();
-            let lib_raw_name = strip_lib_suffix.replace(real_ext_name, "").to_string();
-
-            real_ext_name = self
-                .library_index
-                .get(&lib_raw_name)
-                .ok_or(anyhow::anyhow!("library {} is not found", lib_raw_name))?;
-        }
-
-        match self.extension_data.get(real_ext_name) {
-            Some(ext_data) => Ok((
-                real_ext_name.to_string(),
-                RemotePath::from_string(&ext_data.archive_path)?,
-            )),
-            None => Err(anyhow::anyhow!(
-                "real_ext_name {} is not found",
-                real_ext_name
-            )),
-        }
-    }
 }

 #[serde_as]
--- a/libs/compute_api/tests/cluster_spec.json
+++ b/libs/compute_api/tests/cluster_spec.json
@@ -205,43 +205,5 @@
            "name": "zenith new",
            "new_name": "zenith \"new\""
        }
-    ],
-    "remote_extensions": {
-        "library_index": {
-          "anon": "anon",
-          "postgis-3": "postgis",
-          "libpgrouting-3.4": "postgis",
-          "postgis_raster-3": "postgis",
-          "postgis_sfcgal-3": "postgis",
-          "postgis_topology-3": "postgis",
-          "address_standardizer-3": "postgis"
-        },
-        "extension_data": {
-          "anon": {
-            "archive_path": "5834329303/v15/extensions/anon.tar.zst",
-            "control_data": {
-              "anon.control": "# PostgreSQL Anonymizer (anon) extension\ncomment = ''Data anonymization tools''\ndefault_version = ''1.1.0''\ndirectory=''extension/anon''\nrelocatable = false\nrequires = ''pgcrypto''\nsuperuser = false\nmodule_pathname = ''$libdir/anon''\ntrusted = true\n"
-            }
-          },
-          "postgis": {
-            "archive_path": "5834329303/v15/extensions/postgis.tar.zst",
-            "control_data": {
-              "postgis.control": "# postgis extension\ncomment = ''PostGIS geometry and geography spatial types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis-3''\nrelocatable = false\ntrusted = true\n",
-              "pgrouting.control": "# pgRouting Extension\ncomment = ''pgRouting Extension''\ndefault_version = ''3.4.2''\nmodule_pathname = ''$libdir/libpgrouting-3.4''\nrelocatable = true\nrequires = ''plpgsql''\nrequires = ''postgis''\ntrusted = true\n",
-              "postgis_raster.control": "# postgis_raster extension\ncomment = ''PostGIS raster types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis_raster-3''\nrelocatable = false\nrequires = postgis\ntrusted = true\n",
-              "postgis_sfcgal.control": "# postgis topology extension\ncomment = ''PostGIS SFCGAL functions''\ndefault_version = ''3.3.2''\nrelocatable = true\nrequires = postgis\ntrusted = true\n",
-              "postgis_topology.control": "# postgis topology extension\ncomment = ''PostGIS topology spatial types and functions''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = topology\nrequires = postgis\ntrusted = true\n",
-              "address_standardizer.control": "# address_standardizer extension\ncomment = ''Used to parse an address into constituent elements. Generally used to support geocoding address normalization step.''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n",
-              "postgis_tiger_geocoder.control": "# postgis tiger geocoder extension\ncomment = ''PostGIS tiger geocoder and reverse geocoder''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = tiger\nrequires = ''postgis,fuzzystrmatch''\nsuperuser= false\ntrusted = true\n",
-              "address_standardizer_data_us.control": "# address standardizer us dataset\ncomment = ''Address Standardizer US dataset example''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n"
-            }
-          }
-        },
-        "custom_extensions": [
-          "anon"
-        ],
-        "public_extensions": [
-          "postgis"
-        ]
-      }
+    ]
 }
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::Serialize;

-#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
+#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
 #[serde(tag = "type")]
 pub enum EventType {
    #[serde(rename = "absolute")]
@@ -17,32 +17,6 @@ pub enum EventType {
    },
 }

-impl EventType {
-    pub fn absolute_time(&self) -> Option<&DateTime<Utc>> {
-        use EventType::*;
-        match self {
-            Absolute { time } => Some(time),
-            _ => None,
-        }
-    }
-
-    pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
-        // these can most likely be thought of as Range or RangeFull
-        use EventType::*;
-        match self {
-            Incremental {
-                start_time,
-                stop_time,
-            } => Some(start_time..stop_time),
-            _ => None,
-        }
-    }
-
-    pub fn is_incremental(&self) -> bool {
-        matches!(self, EventType::Incremental { .. })
-    }
-}
-
 #[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
 pub struct Event<Extra> {
    #[serde(flatten)]
@@ -57,7 +31,7 @@ pub struct Event<Extra> {
    pub extra: Extra,
 }

-pub fn idempotency_key(node_id: &str) -> String {
+pub fn idempotency_key(node_id: String) -> String {
    format!(
        "{}-{}-{:04}",
        Utc::now(),
@@ -71,6 +45,6 @@ pub const CHUNK_SIZE: usize = 1000;
 // Just a wrapper around a slice of events
 // to serialize it as `{"events" : [ ] }
 #[derive(serde::Serialize)]
-pub struct EventChunk<'a, T: Clone> {
-    pub events: std::borrow::Cow<'a, [T]>,
+pub struct EventChunk<'a, T> {
+    pub events: &'a [T],
 }
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -20,7 +20,6 @@ tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
 tokio-util.workspace = true
 toml_edit.workspace = true
 tracing.workspace = true
-scopeguard.workspace = true
 metrics.workspace = true
 utils.workspace = true
 pin-project-lite.workspace = true
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -65,10 +65,6 @@ impl RemotePath {
        Ok(Self(relative_path.to_path_buf()))
    }

-    pub fn from_string(relative_path: &str) -> anyhow::Result<Self> {
-        Self::new(Path::new(relative_path))
-    }
-
    pub fn with_base(&self, base_path: &Path) -> PathBuf {
        base_path.join(&self.0)
    }
@@ -194,20 +190,6 @@ pub enum GenericRemoteStorage {
 }

 impl GenericRemoteStorage {
-    // A function for listing all the files in a "directory"
-    // Example:
-    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
-    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        match self {
-            Self::LocalFs(s) => s.list_files(folder).await,
-            Self::AwsS3(s) => s.list_files(folder).await,
-            Self::Unreliable(s) => s.list_files(folder).await,
-        }
-    }
-
-    // lists common *prefixes*, if any of files
-    // Example:
-    // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
    pub async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
@@ -219,6 +201,14 @@ impl GenericRemoteStorage {
        }
    }

+    pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
+        match self {
+            Self::LocalFs(s) => s.list_files(folder).await,
+            Self::AwsS3(s) => s.list_files(folder).await,
+            Self::Unreliable(s) => s.list_files(folder).await,
+        }
+    }
+
    pub async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -10,7 +10,6 @@ use anyhow::Context;
 use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
    imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
-    provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
 };
 use aws_credential_types::cache::CredentialsCache;
 use aws_sdk_s3::{
@@ -23,7 +22,6 @@ use aws_sdk_s3::{
 };
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
-use scopeguard::ScopeGuard;
 use tokio::{
    io::{self, AsyncRead},
    sync::Semaphore,
@@ -38,9 +36,82 @@ use crate::{

 const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;

-pub(super) mod metrics;
+pub(super) mod metrics {
+    use metrics::{register_int_counter_vec, IntCounterVec};
+    use once_cell::sync::Lazy;

-use self::metrics::{AttemptOutcome, RequestKind};
+    static S3_REQUESTS_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "remote_storage_s3_requests_count",
+            "Number of s3 requests of particular type",
+            &["request_type"],
+        )
+        .expect("failed to define a metric")
+    });
+
+    static S3_REQUESTS_FAIL_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "remote_storage_s3_failures_count",
+            "Number of failed s3 requests of particular type",
+            &["request_type"],
+        )
+        .expect("failed to define a metric")
+    });
+
+    pub fn inc_get_object() {
+        S3_REQUESTS_COUNT.with_label_values(&["get_object"]).inc();
+    }
+
+    pub fn inc_get_object_fail() {
+        S3_REQUESTS_FAIL_COUNT
+            .with_label_values(&["get_object"])
+            .inc();
+    }
+
+    pub fn inc_put_object() {
+        S3_REQUESTS_COUNT.with_label_values(&["put_object"]).inc();
+    }
+
+    pub fn inc_put_object_fail() {
+        S3_REQUESTS_FAIL_COUNT
+            .with_label_values(&["put_object"])
+            .inc();
+    }
+
+    pub fn inc_delete_object() {
+        S3_REQUESTS_COUNT
+            .with_label_values(&["delete_object"])
+            .inc();
+    }
+
+    pub fn inc_delete_objects(count: u64) {
+        S3_REQUESTS_COUNT
+            .with_label_values(&["delete_object"])
+            .inc_by(count);
+    }
+
+    pub fn inc_delete_object_fail() {
+        S3_REQUESTS_FAIL_COUNT
+            .with_label_values(&["delete_object"])
+            .inc();
+    }
+
+    pub fn inc_delete_objects_fail(count: u64) {
+        S3_REQUESTS_FAIL_COUNT
+            .with_label_values(&["delete_object"])
+            .inc_by(count);
+    }
+
+    pub fn inc_list_objects() {
+        S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
+    }
+
+    pub fn inc_list_objects_fail() {
+        S3_REQUESTS_FAIL_COUNT
+            .with_label_values(&["list_objects"])
+            .inc();
+    }
+}

 /// AWS S3 storage.
 pub struct S3Bucket {
@@ -68,29 +139,18 @@ impl S3Bucket {
            aws_config.bucket_name
        );

-        let region = Some(Region::new(aws_config.bucket_region.clone()));
-
        let credentials_provider = {
            // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
            CredentialsProviderChain::first_try(
                "env",
                EnvironmentVariableCredentialsProvider::new(),
            )
-            // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
-            // needed to access remote extensions bucket
-            .or_else("token", {
-                let provider_conf = ProviderConfig::without_region().with_region(region.clone());
-
-                WebIdentityTokenCredentialsProvider::builder()
-                    .configure(&provider_conf)
-                    .build()
-            })
            // uses imds v2
            .or_else("imds", ImdsCredentialsProvider::builder().build())
        };

        let mut config_builder = Config::builder()
-            .region(region)
+            .region(Region::new(aws_config.bucket_region.clone()))
            .credentials_cache(CredentialsCache::lazy())
            .credentials_provider(credentials_provider);

@@ -140,56 +200,25 @@ impl S3Bucket {
        )
    }

-    pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
-        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
-        let path_string = path
-            .get_path()
-            .to_string_lossy()
-            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR)
-            .to_string();
-        match &self.prefix_in_bucket {
-            Some(prefix) => prefix.clone() + "/" + &path_string,
-            None => path_string,
+    fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
+        let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default();
+        for segment in path.0.iter() {
+            full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+            full_path.push_str(&segment.to_string_lossy());
        }
+        full_path
    }

-    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
-        let started_at = start_counting_cancelled_wait(kind);
-        let permit = self
-            .concurrency_limiter
-            .acquire()
-            .await
-            .expect("semaphore is never closed");
-
-        let started_at = ScopeGuard::into_inner(started_at);
-        metrics::BUCKET_METRICS
-            .wait_seconds
-            .observe_elapsed(kind, started_at);
-
-        permit
-    }
-
-    async fn owned_permit(&self, kind: RequestKind) -> tokio::sync::OwnedSemaphorePermit {
-        let started_at = start_counting_cancelled_wait(kind);
+    async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
        let permit = self
            .concurrency_limiter
            .clone()
            .acquire_owned()
            .await
-            .expect("semaphore is never closed");
+            .context("Concurrency limiter semaphore got closed during S3 download")
+            .map_err(DownloadError::Other)?;

-        let started_at = ScopeGuard::into_inner(started_at);
-        metrics::BUCKET_METRICS
-            .wait_seconds
-            .observe_elapsed(kind, started_at);
-        permit
-    }
-
-    async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
-        let kind = RequestKind::Get;
-        let permit = self.owned_permit(kind).await;
-
-        let started_at = start_measuring_requests(kind);
+        metrics::inc_get_object();

        let get_object = self
            .client
@@ -200,33 +229,26 @@ impl S3Bucket {
            .send()
            .await;

-        let started_at = ScopeGuard::into_inner(started_at);
-
-        if get_object.is_err() {
-            metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-                kind,
-                AttemptOutcome::Err,
-                started_at,
-            );
-        }
-
        match get_object {
            Ok(object_output) => {
                let metadata = object_output.metadata().cloned().map(StorageMetadata);
                Ok(Download {
                    metadata,
-                    download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
-                        started_at,
-                        RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
+                    download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new(
+                        permit,
+                        object_output.body.into_async_read(),
                    ))),
                })
            }
            Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
                Err(DownloadError::NotFound)
            }
-            Err(e) => Err(DownloadError::Other(
-                anyhow::Error::new(e).context("download s3 object"),
-            )),
+            Err(e) => {
+                metrics::inc_get_object_fail();
+                Err(DownloadError::Other(
+                    anyhow::Error::new(e).context("Failed to download S3 object"),
+                ))
+            }
        }
    }
 }
@@ -257,54 +279,6 @@ impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
    }
 }

-pin_project_lite::pin_project! {
-    /// Times and tracks the outcome of the request.
-    struct TimedDownload<S> {
-        started_at: std::time::Instant,
-        outcome: metrics::AttemptOutcome,
-        #[pin]
-        inner: S
-    }
-
-    impl<S> PinnedDrop for TimedDownload<S> {
-        fn drop(mut this: Pin<&mut Self>) {
-            metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
-        }
-    }
-}
-
-impl<S: AsyncRead> TimedDownload<S> {
-    fn new(started_at: std::time::Instant, inner: S) -> Self {
-        TimedDownload {
-            started_at,
-            outcome: metrics::AttemptOutcome::Cancelled,
-            inner,
-        }
-    }
-}
-
-impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
-    fn poll_read(
-        self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut io::ReadBuf<'_>,
-    ) -> std::task::Poll<std::io::Result<()>> {
-        let this = self.project();
-        let before = buf.filled().len();
-        let read = std::task::ready!(this.inner.poll_read(cx, buf));
-
-        let read_eof = buf.filled().len() == before;
-
-        match read {
-            Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
-            Ok(()) => { /* still in progress */ }
-            Err(_) => *this.outcome = AttemptOutcome::Err,
-        }
-
-        std::task::Poll::Ready(read)
-    }
-}
-
 #[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
    /// See the doc for `RemoteStorage::list_prefixes`
@@ -313,8 +287,6 @@ impl RemoteStorage for S3Bucket {
        &self,
        prefix: Option<&RemotePath>,
    ) -> Result<Vec<RemotePath>, DownloadError> {
-        let kind = RequestKind::List;
-
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
            .map(|p| self.relative_path_to_s3_object(p))
@@ -331,10 +303,15 @@ impl RemoteStorage for S3Bucket {
        let mut document_keys = Vec::new();

        let mut continuation_token = None;
-
        loop {
-            let _guard = self.permit(kind).await;
-            let started_at = start_measuring_requests(kind);
+            let _guard = self
+                .concurrency_limiter
+                .acquire()
+                .await
+                .context("Concurrency limiter semaphore got closed during S3 list")
+                .map_err(DownloadError::Other)?;
+
+            metrics::inc_list_objects();

            let fetch_response = self
                .client
@@ -346,16 +323,12 @@ impl RemoteStorage for S3Bucket {
                .set_max_keys(self.max_keys_per_list_response)
                .send()
                .await
+                .map_err(|e| {
+                    metrics::inc_list_objects_fail();
+                    e
+                })
                .context("Failed to list S3 prefixes")
-                .map_err(DownloadError::Other);
-
-            let started_at = ScopeGuard::into_inner(started_at);
-
-            metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, &fetch_response, started_at);
-
-            let fetch_response = fetch_response?;
+                .map_err(DownloadError::Other)?;

            document_keys.extend(
                fetch_response
@@ -365,10 +338,10 @@ impl RemoteStorage for S3Bucket {
                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
            );

-            continuation_token = match fetch_response.next_continuation_token {
-                Some(new_token) => Some(new_token),
+            match fetch_response.next_continuation_token {
+                Some(new_token) => continuation_token = Some(new_token),
                None => break,
-            };
+            }
        }

        Ok(document_keys)
@@ -376,8 +349,6 @@ impl RemoteStorage for S3Bucket {

    /// See the doc for `RemoteStorage::list_files`
    async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
-        let kind = RequestKind::List;
-
        let folder_name = folder
            .map(|p| self.relative_path_to_s3_object(p))
            .or_else(|| self.prefix_in_bucket.clone());
@@ -386,8 +357,12 @@ impl RemoteStorage for S3Bucket {
        let mut continuation_token = None;
        let mut all_files = vec![];
        loop {
-            let _guard = self.permit(kind).await;
-            let started_at = start_measuring_requests(kind);
+            let _guard = self
+                .concurrency_limiter
+                .acquire()
+                .await
+                .context("Concurrency limiter semaphore got closed during S3 list_files")?;
+            metrics::inc_list_objects();

            let response = self
                .client
@@ -398,14 +373,11 @@ impl RemoteStorage for S3Bucket {
                .set_max_keys(self.max_keys_per_list_response)
                .send()
                .await
-                .context("Failed to list files in S3 bucket");
-
-            let started_at = ScopeGuard::into_inner(started_at);
-            metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, &response, started_at);
-
-            let response = response?;
+                .map_err(|e| {
+                    metrics::inc_list_objects_fail();
+                    e
+                })
+                .context("Failed to list files in S3 bucket")?;

            for object in response.contents().unwrap_or_default() {
                let object_path = object.key().expect("response does not contain a key");
@@ -427,16 +399,18 @@ impl RemoteStorage for S3Bucket {
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
    ) -> anyhow::Result<()> {
-        let kind = RequestKind::Put;
-        let _guard = self.permit(kind).await;
+        let _guard = self
+            .concurrency_limiter
+            .acquire()
+            .await
+            .context("Concurrency limiter semaphore got closed during S3 upload")?;

-        let started_at = start_measuring_requests(kind);
+        metrics::inc_put_object();

        let body = Body::wrap_stream(ReaderStream::new(from));
        let bytes_stream = ByteStream::new(SdkBody::from(body));

-        let res = self
-            .client
+        self.client
            .put_object()
            .bucket(self.bucket_name.clone())
            .key(self.relative_path_to_s3_object(to))
@@ -444,25 +418,19 @@ impl RemoteStorage for S3Bucket {
            .content_length(from_size_bytes.try_into()?)
            .body(bytes_stream)
            .send()
-            .await;
-
-        let started_at = ScopeGuard::into_inner(started_at);
-        metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
-
-        res?;
-
+            .await
+            .map_err(|e| {
+                metrics::inc_put_object_fail();
+                e
+            })?;
        Ok(())
    }

    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
-        // if prefix is not none then download file `prefix/from`
-        // if prefix is none then download file `from`
        self.download_object(GetObjectRequest {
            bucket: self.bucket_name.clone(),
            key: self.relative_path_to_s3_object(from),
-            range: None,
+            ..GetObjectRequest::default()
        })
        .await
    }
@@ -489,8 +457,11 @@ impl RemoteStorage for S3Bucket {
        .await
    }
    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
-        let kind = RequestKind::Delete;
-        let _guard = self.permit(kind).await;
+        let _guard = self
+            .concurrency_limiter
+            .acquire()
+            .await
+            .context("Concurrency limiter semaphore got closed during S3 delete")?;

        let mut delete_objects = Vec::with_capacity(paths.len());
        for path in paths {
@@ -501,7 +472,7 @@ impl RemoteStorage for S3Bucket {
        }

        for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
-            let started_at = start_measuring_requests(kind);
+            metrics::inc_delete_objects(chunk.len() as u64);

            let resp = self
                .client
@@ -511,17 +482,10 @@ impl RemoteStorage for S3Bucket {
                .send()
                .await;

-            let started_at = ScopeGuard::into_inner(started_at);
-            metrics::BUCKET_METRICS
-                .req_seconds
-                .observe_elapsed(kind, &resp, started_at);
-
            match resp {
                Ok(resp) => {
-                    metrics::BUCKET_METRICS
-                        .deleted_objects_total
-                        .inc_by(chunk.len() as u64);
                    if let Some(errors) = resp.errors {
+                        metrics::inc_delete_objects_fail(errors.len() as u64);
                        return Err(anyhow::format_err!(
                            "Failed to delete {} objects",
                            errors.len()
@@ -529,6 +493,7 @@ impl RemoteStorage for S3Bucket {
                    }
                }
                Err(e) => {
+                    metrics::inc_delete_objects_fail(chunk.len() as u64);
                    return Err(e.into());
                }
            }
@@ -537,89 +502,24 @@ impl RemoteStorage for S3Bucket {
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        let paths = std::array::from_ref(path);
-        self.delete_objects(paths).await
-    }
-}
-
-/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
-fn start_counting_cancelled_wait(
-    kind: RequestKind,
-) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
-    scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
-        metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc()
-    })
-}
-
-/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`].
-fn start_measuring_requests(
-    kind: RequestKind,
-) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
-    scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
-        metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
-            kind,
-            AttemptOutcome::Cancelled,
-            started_at,
-        )
-    })
-}
-
-#[cfg(test)]
-mod tests {
-    use std::num::NonZeroUsize;
-    use std::path::Path;
-
-    use crate::{RemotePath, S3Bucket, S3Config};
-
-    #[test]
-    fn relative_path() {
-        let all_paths = vec!["", "some/path", "some/path/"];
-        let all_paths: Vec<RemotePath> = all_paths
-            .iter()
-            .map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
-            .collect();
-        let prefixes = [
-            None,
-            Some(""),
-            Some("test/prefix"),
-            Some("test/prefix/"),
-            Some("/test/prefix/"),
-        ];
-        let expected_outputs = vec![
-            vec!["", "some/path", "some/path"],
-            vec!["/", "/some/path", "/some/path"],
-            vec![
-                "test/prefix/",
-                "test/prefix/some/path",
-                "test/prefix/some/path",
-            ],
-            vec![
-                "test/prefix/",
-                "test/prefix/some/path",
-                "test/prefix/some/path",
-            ],
-            vec![
-                "test/prefix/",
-                "test/prefix/some/path",
-                "test/prefix/some/path",
-            ],
-        ];
-
-        for (prefix_idx, prefix) in prefixes.iter().enumerate() {
-            let config = S3Config {
-                bucket_name: "bucket".to_owned(),
-                bucket_region: "region".to_owned(),
-                prefix_in_bucket: prefix.map(str::to_string),
-                endpoint: None,
-                concurrency_limit: NonZeroUsize::new(100).unwrap(),
-                max_keys_per_list_response: Some(5),
-            };
-            let storage = S3Bucket::new(&config).expect("remote storage init");
-            for (test_path_idx, test_path) in all_paths.iter().enumerate() {
-                let result = storage.relative_path_to_s3_object(test_path);
-                let expected = expected_outputs[prefix_idx][test_path_idx];
-                assert_eq!(result, expected);
-            }
-        }
+        let _guard = self
+            .concurrency_limiter
+            .acquire()
+            .await
+            .context("Concurrency limiter semaphore got closed during S3 delete")?;
+
+        metrics::inc_delete_object();
+
+        self.client
+            .delete_object()
+            .bucket(self.bucket_name.clone())
+            .key(self.relative_path_to_s3_object(path))
+            .send()
+            .await
+            .map_err(|e| {
+                metrics::inc_delete_object_fail();
+                e
+            })?;
+        Ok(())
    }
 }
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -1,191 +0,0 @@
-use metrics::{
-    register_histogram_vec, register_int_counter, register_int_counter_vec, Histogram, IntCounter,
-};
-use once_cell::sync::Lazy;
-
-pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);
-
-#[derive(Clone, Copy, Debug)]
-pub(super) enum RequestKind {
-    Get = 0,
-    Put = 1,
-    Delete = 2,
-    List = 3,
-}
-
-use RequestKind::*;
-
-impl RequestKind {
-    const fn as_str(&self) -> &'static str {
-        match self {
-            Get => "get_object",
-            Put => "put_object",
-            Delete => "delete_object",
-            List => "list_objects",
-        }
-    }
-    const fn as_index(&self) -> usize {
-        *self as usize
-    }
-}
-
-pub(super) struct RequestTyped<C>([C; 4]);
-
-impl<C> RequestTyped<C> {
-    pub(super) fn get(&self, kind: RequestKind) -> &C {
-        &self.0[kind.as_index()]
-    }
-
-    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
-        use RequestKind::*;
-        let mut it = [Get, Put, Delete, List].into_iter();
-        let arr = std::array::from_fn::<C, 4, _>(|index| {
-            let next = it.next().unwrap();
-            assert_eq!(index, next.as_index());
-            f(next)
-        });
-
-        if let Some(next) = it.next() {
-            panic!("unexpected {next:?}");
-        }
-
-        RequestTyped(arr)
-    }
-}
-
-impl RequestTyped<Histogram> {
-    pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
-        self.get(kind).observe(started_at.elapsed().as_secs_f64())
-    }
-}
-
-pub(super) struct PassFailCancelledRequestTyped<C> {
-    success: RequestTyped<C>,
-    fail: RequestTyped<C>,
-    cancelled: RequestTyped<C>,
-}
-
-#[derive(Debug, Clone, Copy)]
-pub(super) enum AttemptOutcome {
-    Ok,
-    Err,
-    Cancelled,
-}
-
-impl<T, E> From<&Result<T, E>> for AttemptOutcome {
-    fn from(value: &Result<T, E>) -> Self {
-        match value {
-            Ok(_) => AttemptOutcome::Ok,
-            Err(_) => AttemptOutcome::Err,
-        }
-    }
-}
-
-impl AttemptOutcome {
-    pub(super) fn as_str(&self) -> &'static str {
-        match self {
-            AttemptOutcome::Ok => "ok",
-            AttemptOutcome::Err => "err",
-            AttemptOutcome::Cancelled => "cancelled",
-        }
-    }
-}
-
-impl<C> PassFailCancelledRequestTyped<C> {
-    pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
-        let target = match outcome {
-            AttemptOutcome::Ok => &self.success,
-            AttemptOutcome::Err => &self.fail,
-            AttemptOutcome::Cancelled => &self.cancelled,
-        };
-        target.get(kind)
-    }
-
-    fn build_with(mut f: impl FnMut(RequestKind, AttemptOutcome) -> C) -> Self {
-        let success = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Ok));
-        let fail = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Err));
-        let cancelled = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Cancelled));
-
-        PassFailCancelledRequestTyped {
-            success,
-            fail,
-            cancelled,
-        }
-    }
-}
-
-impl PassFailCancelledRequestTyped<Histogram> {
-    pub(super) fn observe_elapsed(
-        &self,
-        kind: RequestKind,
-        outcome: impl Into<AttemptOutcome>,
-        started_at: std::time::Instant,
-    ) {
-        self.get(kind, outcome.into())
-            .observe(started_at.elapsed().as_secs_f64())
-    }
-}
-
-pub(super) struct BucketMetrics {
-    /// Full request duration until successful completion, error or cancellation.
-    pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
-    /// Total amount of seconds waited on queue.
-    pub(super) wait_seconds: RequestTyped<Histogram>,
-
-    /// Track how many semaphore awaits were cancelled per request type.
-    ///
-    /// This is in case cancellations are happening more than expected.
-    pub(super) cancelled_waits: RequestTyped<IntCounter>,
-
-    /// Total amount of deleted objects in batches or single requests.
-    pub(super) deleted_objects_total: IntCounter,
-}
-
-impl Default for BucketMetrics {
-    fn default() -> Self {
-        let buckets = [0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
-
-        let req_seconds = register_histogram_vec!(
-            "remote_storage_s3_request_seconds",
-            "Seconds to complete a request",
-            &["request_type", "result"],
-            buckets.to_vec(),
-        )
-        .unwrap();
-        let req_seconds = PassFailCancelledRequestTyped::build_with(|kind, outcome| {
-            req_seconds.with_label_values(&[kind.as_str(), outcome.as_str()])
-        });
-
-        let wait_seconds = register_histogram_vec!(
-            "remote_storage_s3_wait_seconds",
-            "Seconds rate limited",
-            &["request_type"],
-            buckets.to_vec(),
-        )
-        .unwrap();
-        let wait_seconds =
-            RequestTyped::build_with(|kind| wait_seconds.with_label_values(&[kind.as_str()]));
-
-        let cancelled_waits = register_int_counter_vec!(
-            "remote_storage_s3_cancelled_waits_total",
-            "Times a semaphore wait has been cancelled per request type",
-            &["request_type"],
-        )
-        .unwrap();
-        let cancelled_waits =
-            RequestTyped::build_with(|kind| cancelled_waits.with_label_values(&[kind.as_str()]));
-
-        let deleted_objects_total = register_int_counter!(
-            "remote_storage_s3_deleted_objects_total",
-            "Amount of deleted objects in total",
-        )
-        .unwrap();
-
-        Self {
-            req_seconds,
-            wait_seconds,
-            cancelled_waits,
-            deleted_objects_total,
-        }
-    }
-}
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -71,13 +71,6 @@ impl UnreliableWrapper {
            }
        }
    }
-
-    async fn delete_inner(&self, path: &RemotePath, attempt: bool) -> anyhow::Result<()> {
-        if attempt {
-            self.attempt(RemoteOp::Delete(path.clone()))?;
-        }
-        self.inner.delete(path).await
-    }
 }

 #[async_trait::async_trait]
@@ -129,15 +122,16 @@ impl RemoteStorage for UnreliableWrapper {
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        self.delete_inner(path, true).await
+        self.attempt(RemoteOp::Delete(path.clone()))?;
+        self.inner.delete(path).await
    }

    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
        self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
        let mut error_counter = 0;
        for path in paths {
            // Dont record attempt because it was already recorded above
-            if (self.delete_inner(path, false).await).is_err() {
+            if (self.inner.delete(path).await).is_err() {
                error_counter += 1;
            }
        }
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -19,7 +19,7 @@ static LOGGING_DONE: OnceCell<()> = OnceCell::new();

 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";

-const BASE_PREFIX: &str = "test";
+const BASE_PREFIX: &str = "test/";

 /// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
 /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
--- a/libs/utils/src/backoff.rs
+++ b/libs/utils/src/backoff.rs
@@ -1,188 +0,0 @@
-use std::fmt::{Debug, Display};
-
-use futures::Future;
-
-pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
-pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
-
-pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
-    let backoff_duration_seconds =
-        exponential_backoff_duration_seconds(n, base_increment, max_seconds);
-    if backoff_duration_seconds > 0.0 {
-        tracing::info!(
-            "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
-        );
-        tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
-    }
-}
-
-pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
-    if n == 0 {
-        0.0
-    } else {
-        (1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
-    }
-}
-
-/// retries passed operation until one of the following conditions are met:
-/// Encountered error is considered as permanent (non-retryable)
-/// Retries have been exhausted.
-/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors
-/// When attempts cross `warn_threshold` function starts to emit log warnings.
-/// `description` argument is added to log messages. Its value should identify the `op` is doing
-pub async fn retry<T, O, F, E>(
-    mut op: O,
-    is_permanent: impl Fn(&E) -> bool,
-    warn_threshold: u32,
-    max_retries: u32,
-    description: &str,
-) -> Result<T, E>
-where
-    // Not std::error::Error because anyhow::Error doesnt implement it.
-    // For context see https://github.com/dtolnay/anyhow/issues/63
-    E: Display + Debug,
-    O: FnMut() -> F,
-    F: Future<Output = Result<T, E>>,
-{
-    let mut attempts = 0;
-    loop {
-        let result = op().await;
-        match result {
-            Ok(_) => {
-                if attempts > 0 {
-                    tracing::info!("{description} succeeded after {attempts} retries");
-                }
-                return result;
-            }
-
-            // These are "permanent" errors that should not be retried.
-            Err(ref e) if is_permanent(e) => {
-                return result;
-            }
-            // Assume that any other failure might be transient, and the operation might
-            // succeed if we just keep trying.
-            Err(err) if attempts < warn_threshold => {
-                tracing::info!("{description} failed, will retry (attempt {attempts}): {err:#}");
-            }
-            Err(err) if attempts < max_retries => {
-                tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
-            }
-            Err(ref err) => {
-                // Operation failed `max_attempts` times. Time to give up.
-                tracing::warn!(
-                    "{description} still failed after {attempts} retries, giving up: {err:?}"
-                );
-                return result;
-            }
-        }
-        // sleep and retry
-        exponential_backoff(
-            attempts,
-            DEFAULT_BASE_BACKOFF_SECONDS,
-            DEFAULT_MAX_BACKOFF_SECONDS,
-        )
-        .await;
-        attempts += 1;
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::io;
-
-    use tokio::sync::Mutex;
-
-    use super::*;
-
-    #[test]
-    fn backoff_defaults_produce_growing_backoff_sequence() {
-        let mut current_backoff_value = None;
-
-        for i in 0..10_000 {
-            let new_backoff_value = exponential_backoff_duration_seconds(
-                i,
-                DEFAULT_BASE_BACKOFF_SECONDS,
-                DEFAULT_MAX_BACKOFF_SECONDS,
-            );
-
-            if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
-                assert!(
-                    old_backoff_value <= new_backoff_value,
-                    "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
-                )
-            }
-        }
-
-        assert_eq!(
-            current_backoff_value.expect("Should have produced backoff values to compare"),
-            DEFAULT_MAX_BACKOFF_SECONDS,
-            "Given big enough of retries, backoff should reach its allowed max value"
-        );
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn retry_always_error() {
-        let count = Mutex::new(0);
-        let err_result = retry(
-            || async {
-                *count.lock().await += 1;
-                Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other))
-            },
-            |_e| false,
-            1,
-            1,
-            "work",
-        )
-        .await;
-
-        assert!(err_result.is_err());
-
-        assert_eq!(*count.lock().await, 2);
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn retry_ok_after_err() {
-        let count = Mutex::new(0);
-        retry(
-            || async {
-                let mut locked = count.lock().await;
-                if *locked > 1 {
-                    Ok(())
-                } else {
-                    *locked += 1;
-                    Err(io::Error::from(io::ErrorKind::Other))
-                }
-            },
-            |_e| false,
-            2,
-            2,
-            "work",
-        )
-        .await
-        .unwrap();
-    }
-
-    #[tokio::test(start_paused = true)]
-    async fn dont_retry_permanent_errors() {
-        let count = Mutex::new(0);
-        let _ = retry(
-            || async {
-                let mut locked = count.lock().await;
-                if *locked > 1 {
-                    Ok(())
-                } else {
-                    *locked += 1;
-                    Err(io::Error::from(io::ErrorKind::Other))
-                }
-            },
-            |_e| true,
-            2,
-            2,
-            "work",
-        )
-        .await
-        .unwrap_err();
-
-        assert_eq!(*count.lock().await, 1);
-    }
-}
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -111,10 +111,6 @@ pub fn fsync(path: &Path) -> io::Result<()> {
        .map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}")))
 }

-pub async fn fsync_async(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
-    tokio::fs::File::open(path).await?.sync_all().await
-}
-
 #[cfg(test)]
 mod tests {
    use tempfile::tempdir;
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -24,20 +24,6 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool>
    Ok(dir.next_entry().await?.is_none())
 }

-pub async fn list_dir(path: impl AsRef<Path>) -> anyhow::Result<Vec<String>> {
-    let mut dir = tokio::fs::read_dir(&path)
-        .await
-        .context(format!("read_dir({})", path.as_ref().display()))?;
-
-    let mut content = vec![];
-    while let Some(next) = dir.next_entry().await? {
-        let file_name = next.file_name();
-        content.push(file_name.to_string_lossy().to_string());
-    }
-
-    Ok(content)
-}
-
 pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
    if e.kind() == io::ErrorKind::NotFound {
        Ok(())
@@ -57,7 +43,7 @@ where
 mod test {
    use std::path::PathBuf;

-    use crate::fs_ext::{is_directory_empty, list_dir};
+    use crate::fs_ext::is_directory_empty;

    use super::ignore_absent_files;

@@ -123,25 +109,4 @@ mod test {

        assert!(!file_path.exists());
    }
-
-    #[tokio::test]
-    async fn list_dir_works() {
-        let dir = tempfile::tempdir().unwrap();
-        let dir_path = dir.path();
-
-        assert!(list_dir(dir_path).await.unwrap().is_empty());
-
-        let file_path: PathBuf = dir_path.join("testfile");
-        let _ = std::fs::File::create(&file_path).unwrap();
-
-        assert_eq!(&list_dir(dir_path).await.unwrap(), &["testfile"]);
-
-        let another_dir_path: PathBuf = dir_path.join("testdir");
-        std::fs::create_dir(another_dir_path).unwrap();
-
-        let expected = &["testdir", "testfile"];
-        let mut actual = list_dir(dir_path).await.unwrap();
-        actual.sort();
-        assert_eq!(actual, expected);
-    }
 }
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -1,8 +1,6 @@
 //! `utils` is intended to be a place to put code that is shared
 //! between other crates in this repository.

-pub mod backoff;
-
 /// `Lsn` type implements common tasks on Log Sequence Numbers
 pub mod lsn;
 /// SeqWait allows waiting for a future sequence number to arrive
--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -23,7 +23,6 @@
 //!      <https://grafana.com/tutorials/build-a-panel-plugin/>
 use anyhow::Result;
 use pageserver::repository::Key;
-use pageserver::METADATA_FILE_NAME;
 use std::cmp::Ordering;
 use std::io::{self, BufRead};
 use std::path::PathBuf;
@@ -72,10 +71,6 @@ pub fn main() -> Result<()> {
        let line = PathBuf::from_str(&line).unwrap();
        let filename = line.file_name().unwrap();
        let filename = filename.to_str().unwrap();
-        if filename == METADATA_FILE_NAME {
-            // Don't try and parse "metadata" like a key-lsn range
-            continue;
-        }
        let range = parse_filename(filename);
        ranges.push(range);
    }
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -107,25 +107,23 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
    // min-heap (reserve space for one more element added before eviction)
    let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
    let mut prev_key: Option<Key> = None;
-    tree_reader
-        .visit(
-            &[0u8; DELTA_KEY_SIZE],
-            VisitDirection::Forwards,
-            |key, _value| {
-                let curr = Key::from_slice(&key[..KEY_SIZE]);
-                if let Some(prev) = prev_key {
-                    if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH {
-                        heap.push(Hole(prev..curr));
-                        if heap.len() > max_holes {
-                            heap.pop(); // remove smallest hole
-                        }
+    tree_reader.visit(
+        &[0u8; DELTA_KEY_SIZE],
+        VisitDirection::Forwards,
+        |key, _value| {
+            let curr = Key::from_slice(&key[..KEY_SIZE]);
+            if let Some(prev) = prev_key {
+                if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH {
+                    heap.push(Hole(prev..curr));
+                    if heap.len() > max_holes {
+                        heap.pop(); // remove smallest hole
                    }
                }
-                prev_key = Some(curr.next());
-                true
-            },
-        )
-        .await?;
+            }
+            prev_key = Some(curr.next());
+            true
+        },
+    )?;
    let mut holes = heap.into_vec();
    holes.sort_by_key(|hole| hole.0.start);
    Ok(holes)
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -59,20 +59,18 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
    );
    // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
    let mut all = vec![];
-    tree_reader
-        .visit(
-            &[0u8; DELTA_KEY_SIZE],
-            VisitDirection::Forwards,
-            |key, value_offset| {
-                let curr = Key::from_slice(&key[..KEY_SIZE]);
-                all.push((curr, BlobRef(value_offset)));
-                true
-            },
-        )
-        .await?;
-    let cursor = BlockCursor::new(&file);
+    tree_reader.visit(
+        &[0u8; DELTA_KEY_SIZE],
+        VisitDirection::Forwards,
+        |key, value_offset| {
+            let curr = Key::from_slice(&key[..KEY_SIZE]);
+            all.push((curr, BlobRef(value_offset)));
+            true
+        },
+    )?;
+    let mut cursor = BlockCursor::new(&file);
    for (k, v) in all {
-        let value = cursor.read_blob(v.pos()).await?;
+        let value = cursor.read_blob(v.pos())?;
        println!("key:{} value_len:{}", k, value.len());
    }
    // TODO(chi): special handling for last key?
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -9,10 +9,8 @@ use clap::{Arg, ArgAction, Command};
 use fail::FailScenario;
 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
-use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
 use remote_storage::GenericRemoteStorage;
-use tokio::time::Instant;
 use tracing::*;

 use metrics::set_build_info_metric;
@@ -40,6 +38,8 @@ const PID_FILE_NAME: &str = "pageserver.pid";
 const FEATURES: &[&str] = &[
    #[cfg(feature = "testing")]
    "testing",
+    #[cfg(feature = "fail/failpoints")]
+    "fail/failpoints",
 ];

 fn version() -> String {
@@ -226,19 +226,6 @@ fn start_pageserver(
    launch_ts: &'static LaunchTimestamp,
    conf: &'static PageServerConf,
 ) -> anyhow::Result<()> {
-    // Monotonic time for later calculating startup duration
-    let started_startup_at = Instant::now();
-
-    let startup_checkpoint = move |phase: &str, human_phase: &str| {
-        let elapsed = started_startup_at.elapsed();
-        let secs = elapsed.as_secs_f64();
-        STARTUP_DURATION.with_label_values(&[phase]).set(secs);
-        info!(
-            elapsed_ms = elapsed.as_millis(),
-            "{human_phase} ({secs:.3}s since start)"
-        )
-    };
-
    // Print version and launch timestamp to the log,
    // and expose them as prometheus metrics.
    // A changed version string indicates changed software.
@@ -348,11 +335,6 @@ fn start_pageserver(
    // Set up remote storage client
    let remote_storage = create_remote_storage_client(conf)?;

-    // Up to this point no significant I/O has been done: this should have been fast.  Record
-    // duration prior to starting I/O intensive phase of startup.
-    startup_checkpoint("initial", "Starting loading tenants");
-    STARTUP_IS_LOADING.set(1);
-
    // Startup staging or optimizing:
    //
    // We want to minimize downtime for `page_service` connections, and trying not to overload
@@ -373,11 +355,12 @@ fn start_pageserver(
    let order = pageserver::InitializationOrder {
        initial_tenant_load: Some(init_done_tx),
        initial_logical_size_can_start: init_done_rx.clone(),
-        initial_logical_size_attempt: Some(init_logical_size_done_tx),
+        initial_logical_size_attempt: init_logical_size_done_tx,
        background_jobs_can_start: background_jobs_barrier.clone(),
    };

    // Scan the local 'tenants/' directory and start loading the tenants
+    let init_started_at = std::time::Instant::now();
    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();

    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
@@ -395,13 +378,18 @@ fn start_pageserver(
            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));

            init_done_rx.wait().await;
-            startup_checkpoint("initial_tenant_load", "Initial load completed");
-            STARTUP_IS_LOADING.set(0);
-
            // initial logical sizes can now start, as they were waiting on init_done_rx.

            scopeguard::ScopeGuard::into_inner(guard);

+            let init_done = std::time::Instant::now();
+            let elapsed = init_done - init_started_at;
+
+            tracing::info!(
+                elapsed_millis = elapsed.as_millis(),
+                "Initial load completed"
+            );
+
            let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());

            let timeout = conf.background_task_maximum_delay;
@@ -410,7 +398,12 @@ fn start_pageserver(

            let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
                Ok(_) => {
-                    startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
+                    let now = std::time::Instant::now();
+                    tracing::info!(
+                        from_init_done_millis = (now - init_done).as_millis(),
+                        from_init_millis = (now - init_started_at).as_millis(),
+                        "Initial logical sizes completed"
+                    );
                    None
                }
                Err(_) => {
@@ -426,7 +419,6 @@ fn start_pageserver(

            // allow background jobs to start
            drop(background_jobs_can_start);
-            startup_checkpoint("background_jobs_can_start", "Starting background jobs");

            if let Some(init_sizes_done) = init_sizes_done {
                // ending up here is not a bug; at the latest logical sizes will be queried by
@@ -436,11 +428,14 @@ fn start_pageserver(

                scopeguard::ScopeGuard::into_inner(guard);

-                startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");
+                let now = std::time::Instant::now();
+                tracing::info!(
+                    from_init_done_millis = (now - init_done).as_millis(),
+                    from_init_millis = (now - init_started_at).as_millis(),
+                    "Initial logical sizes completed after timeout (background jobs already started)"
+                );

            }
-
-            startup_checkpoint("complete", "Startup complete");
        };

        async move {
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -31,9 +31,7 @@ use utils::{
 use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::{
-    TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
-};
+use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
 use crate::{
    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
    TIMELINE_UNINIT_MARK_SUFFIX,
@@ -615,11 +613,6 @@ impl PageServerConf {
        )
    }

-    pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
-        self.tenant_path(tenant_id)
-            .join(TENANT_DELETED_MARKER_FILE_NAME)
-    }
-
    pub fn traces_path(&self) -> PathBuf {
        self.workdir.join("traces")
    }
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -7,23 +7,27 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::{mgr, LogicalSizeCalculationCause};
 use anyhow;
-use chrono::{DateTime, Utc};
+use chrono::Utc;
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use pageserver_api::models::TenantState;
 use reqwest::Url;
 use serde::Serialize;
 use serde_with::{serde_as, DisplayFromStr};
 use std::collections::HashMap;
-use std::sync::Arc;
-use std::time::{Duration, SystemTime};
+use std::time::Duration;
 use tracing::*;
 use utils::id::{NodeId, TenantId, TimelineId};
-use utils::lsn::Lsn;
+
+const WRITTEN_SIZE: &str = "written_size";
+const SYNTHETIC_STORAGE_SIZE: &str = "synthetic_storage_size";
+const RESIDENT_SIZE: &str = "resident_size";
+const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
+const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";

 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);

 #[serde_as]
-#[derive(Serialize, Debug, Clone, Copy)]
+#[derive(Serialize, Debug)]
 struct Ids {
    #[serde_as(as = "DisplayFromStr")]
    tenant_id: TenantId,
@@ -34,142 +38,10 @@ struct Ids {

 /// Key that uniquely identifies the object, this metric describes.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
-struct MetricsKey {
-    tenant_id: TenantId,
-    timeline_id: Option<TimelineId>,
-    metric: &'static str,
-}
-
-impl MetricsKey {
-    const fn absolute_values(self) -> AbsoluteValueFactory {
-        AbsoluteValueFactory(self)
-    }
-    const fn incremental_values(self) -> IncrementalValueFactory {
-        IncrementalValueFactory(self)
-    }
-}
-
-/// Helper type which each individual metric kind can return to produce only absolute values.
-struct AbsoluteValueFactory(MetricsKey);
-
-impl AbsoluteValueFactory {
-    fn at(self, time: DateTime<Utc>, val: u64) -> (MetricsKey, (EventType, u64)) {
-        let key = self.0;
-        (key, (EventType::Absolute { time }, val))
-    }
-}
-
-/// Helper type which each individual metric kind can return to produce only incremental values.
-struct IncrementalValueFactory(MetricsKey);
-
-impl IncrementalValueFactory {
-    #[allow(clippy::wrong_self_convention)]
-    fn from_previous_up_to(
-        self,
-        prev_end: DateTime<Utc>,
-        up_to: DateTime<Utc>,
-        val: u64,
-    ) -> (MetricsKey, (EventType, u64)) {
-        let key = self.0;
-        // cannot assert prev_end < up_to because these are realtime clock based
-        (
-            key,
-            (
-                EventType::Incremental {
-                    start_time: prev_end,
-                    stop_time: up_to,
-                },
-                val,
-            ),
-        )
-    }
-
-    fn key(&self) -> &MetricsKey {
-        &self.0
-    }
-}
-
-// the static part of a MetricsKey
-impl MetricsKey {
-    /// Absolute value of [`Timeline::get_last_record_lsn`].
-    ///
-    /// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
-    const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            metric: "written_size",
-        }
-        .absolute_values()
-    }
-
-    /// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
-    /// previously sent, starting from the previously sent incremental time range ending at the
-    /// latest absolute measurement.
-    const fn written_size_delta(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> IncrementalValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            // the name here is correctly about data not size, because that is what is wanted by
-            // downstream pipeline
-            metric: "written_data_bytes_delta",
-        }
-        .incremental_values()
-    }
-
-    /// Exact [`Timeline::get_current_logical_size`].
-    ///
-    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
-    const fn timeline_logical_size(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: Some(timeline_id),
-            metric: "timeline_logical_size",
-        }
-        .absolute_values()
-    }
-
-    /// [`Tenant::remote_size`]
-    ///
-    /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
-    const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "remote_storage_size",
-        }
-        .absolute_values()
-    }
-
-    /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
-    ///
-    /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
-    const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "resident_size",
-        }
-        .absolute_values()
-    }
-
-    /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
-    ///
-    /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
-    const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
-        MetricsKey {
-            tenant_id,
-            timeline_id: None,
-            metric: "synthetic_storage_size",
-        }
-        .absolute_values()
-    }
+pub struct PageserverConsumptionMetricsKey {
+    pub tenant_id: TenantId,
+    pub timeline_id: Option<TimelineId>,
+    pub metric: &'static str,
 }

 /// Main thread that serves metrics collection
@@ -207,7 +79,7 @@ pub async fn collect_metrics(
        .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
        .build()
        .expect("Failed to create http client with timeout");
-    let mut cached_metrics = HashMap::new();
+    let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
    let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();

    loop {
@@ -247,15 +119,15 @@ pub async fn collect_metrics(
 ///
 /// TODO
 /// - refactor this function (chunking+sending part) to reuse it in proxy module;
-async fn collect_metrics_iteration(
+pub async fn collect_metrics_iteration(
    client: &reqwest::Client,
-    cached_metrics: &mut HashMap<MetricsKey, (EventType, u64)>,
+    cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
    metric_collection_endpoint: &reqwest::Url,
    node_id: NodeId,
    ctx: &RequestContext,
    send_cached: bool,
 ) {
-    let mut current_metrics: Vec<(MetricsKey, (EventType, u64))> = Vec::new();
+    let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
    trace!(
        "starting collect_metrics_iteration. metric_collection_endpoint: {}",
        metric_collection_endpoint
@@ -289,65 +161,99 @@ async fn collect_metrics_iteration(
        let mut tenant_resident_size = 0;

        // iterate through list of timelines in tenant
-        for timeline in tenant.list_timelines() {
+        for timeline in tenant.list_timelines().iter() {
            // collect per-timeline metrics only for active timelines
+            if timeline.is_active() {
+                let timeline_written_size = u64::from(timeline.get_last_record_lsn());

-            let timeline_id = timeline.timeline_id;
-
-            match TimelineSnapshot::collect(&timeline, ctx) {
-                Ok(Some(snap)) => {
-                    snap.to_metrics(
+                current_metrics.push((
+                    PageserverConsumptionMetricsKey {
                        tenant_id,
-                        timeline_id,
-                        Utc::now(),
-                        &mut current_metrics,
-                        cached_metrics,
-                    );
-                }
-                Ok(None) => {}
-                Err(e) => {
-                    error!(
-                        "failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
-                        timeline.timeline_id
-                    );
-                    continue;
-                }
+                        timeline_id: Some(timeline.timeline_id),
+                        metric: WRITTEN_SIZE,
+                    },
+                    timeline_written_size,
+                ));
+
+                let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
+                match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
+                    // Only send timeline logical size when it is fully calculated.
+                    Ok((size, is_exact)) if is_exact => {
+                        current_metrics.push((
+                            PageserverConsumptionMetricsKey {
+                                tenant_id,
+                                timeline_id: Some(timeline.timeline_id),
+                                metric: TIMELINE_LOGICAL_SIZE,
+                            },
+                            size,
+                        ));
+                    }
+                    Ok((_, _)) => {}
+                    Err(err) => {
+                        error!(
+                            "failed to get current logical size for timeline {}: {err:?}",
+                            timeline.timeline_id
+                        );
+                        continue;
+                    }
+                };
            }

-            tenant_resident_size += timeline.resident_physical_size();
+            let timeline_resident_size = timeline.get_resident_physical_size();
+            tenant_resident_size += timeline_resident_size;
        }

-        current_metrics
-            .push(MetricsKey::remote_storage_size(tenant_id).at(Utc::now(), tenant.remote_size()));
+        match tenant.get_remote_size().await {
+            Ok(tenant_remote_size) => {
+                current_metrics.push((
+                    PageserverConsumptionMetricsKey {
+                        tenant_id,
+                        timeline_id: None,
+                        metric: REMOTE_STORAGE_SIZE,
+                    },
+                    tenant_remote_size,
+                ));
+            }
+            Err(err) => {
+                error!(
+                    "failed to get remote size for tenant {}: {err:?}",
+                    tenant_id
+                );
+            }
+        }

-        current_metrics
-            .push(MetricsKey::resident_size(tenant_id).at(Utc::now(), tenant_resident_size));
+        current_metrics.push((
+            PageserverConsumptionMetricsKey {
+                tenant_id,
+                timeline_id: None,
+                metric: RESIDENT_SIZE,
+            },
+            tenant_resident_size,
+        ));

        // Note that this metric is calculated in a separate bgworker
        // Here we only use cached value, which may lag behind the real latest one
-        let synthetic_size = tenant.cached_synthetic_size();
+        let tenant_synthetic_size = tenant.get_cached_synthetic_size();

-        if synthetic_size != 0 {
+        if tenant_synthetic_size != 0 {
            // only send non-zeroes because otherwise these show up as errors in logs
-            current_metrics
-                .push(MetricsKey::synthetic_size(tenant_id).at(Utc::now(), synthetic_size));
+            current_metrics.push((
+                PageserverConsumptionMetricsKey {
+                    tenant_id,
+                    timeline_id: None,
+                    metric: SYNTHETIC_STORAGE_SIZE,
+                },
+                tenant_synthetic_size,
+            ));
        }
    }

    // Filter metrics, unless we want to send all metrics, including cached ones.
    // See: https://github.com/neondatabase/neon/issues/3485
    if !send_cached {
-        current_metrics.retain(|(curr_key, (kind, curr_val))| {
-            if kind.is_incremental() {
-                // incremental values (currently only written_size_delta) should not get any cache
-                // deduplication because they will be used by upstream for "is still alive."
-                true
-            } else {
-                match cached_metrics.get(curr_key) {
-                    Some((_, val)) => val != curr_val,
-                    None => true,
-                }
-            }
+        current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
+            Some(val) => val != curr_val,
+            None => true,
        });
    }

@@ -362,16 +268,14 @@ async fn collect_metrics_iteration(

    let mut chunk_to_send: Vec<Event<Ids>> = Vec::with_capacity(CHUNK_SIZE);

-    let node_id = node_id.to_string();
-
    for chunk in chunks {
        chunk_to_send.clear();

        // enrich metrics with type,timestamp and idempotency key before sending
-        chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
-            kind: *when,
+        chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| Event {
+            kind: EventType::Absolute { time: Utc::now() },
            metric: curr_key.metric,
-            idempotency_key: idempotency_key(&node_id),
+            idempotency_key: idempotency_key(node_id.to_string()),
            value: *curr_val,
            extra: Ids {
                tenant_id: curr_key.tenant_id,
@@ -379,14 +283,17 @@ async fn collect_metrics_iteration(
            },
        }));

+        let chunk_json = serde_json::value::to_raw_value(&EventChunk {
+            events: &chunk_to_send,
+        })
+        .expect("PageserverConsumptionMetric should not fail serialization");
+
        const MAX_RETRIES: u32 = 3;

        for attempt in 0..MAX_RETRIES {
            let res = client
                .post(metric_collection_endpoint.clone())
-                .json(&EventChunk {
-                    events: (&chunk_to_send).into(),
-                })
+                .json(&chunk_json)
                .send()
                .await;

@@ -422,130 +329,6 @@ async fn collect_metrics_iteration(
    }
 }

-/// Internal type to make timeline metric production testable.
-///
-/// As this value type contains all of the information needed from a timeline to produce the
-/// metrics, it can easily be created with different values in test.
-struct TimelineSnapshot {
-    loaded_at: (Lsn, SystemTime),
-    last_record_lsn: Lsn,
-    current_exact_logical_size: Option<u64>,
-}
-
-impl TimelineSnapshot {
-    /// Collect the metrics from an actual timeline.
-    ///
-    /// Fails currently only when [`Timeline::get_current_logical_size`] fails.
-    ///
-    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
-    fn collect(
-        t: &Arc<crate::tenant::Timeline>,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Option<Self>> {
-        use anyhow::Context;
-
-        if !t.is_active() {
-            // no collection for broken or stopping needed, we will still keep the cached values
-            // though at the caller.
-            Ok(None)
-        } else {
-            let loaded_at = t.loaded_at;
-            let last_record_lsn = t.get_last_record_lsn();
-
-            let current_exact_logical_size = {
-                let span = info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
-                let res = span
-                    .in_scope(|| t.get_current_logical_size(ctx))
-                    .context("get_current_logical_size");
-                match res? {
-                    // Only send timeline logical size when it is fully calculated.
-                    (size, is_exact) if is_exact => Some(size),
-                    (_, _) => None,
-                }
-            };
-
-            Ok(Some(TimelineSnapshot {
-                loaded_at,
-                last_record_lsn,
-                current_exact_logical_size,
-            }))
-        }
-    }
-
-    /// Produce the timeline consumption metrics into the `metrics` argument.
-    fn to_metrics(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        now: DateTime<Utc>,
-        metrics: &mut Vec<(MetricsKey, (EventType, u64))>,
-        cache: &HashMap<MetricsKey, (EventType, u64)>,
-    ) {
-        let timeline_written_size = u64::from(self.last_record_lsn);
-
-        let (key, written_size_now) =
-            MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
-
-        // last_record_lsn can only go up, right now at least, TODO: #2592 or related
-        // features might change this.
-
-        let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
-
-        // use this when available, because in a stream of incremental values, it will be
-        // accurate where as when last_record_lsn stops moving, we will only cache the last
-        // one of those.
-        let last_stop_time = cache
-            .get(written_size_delta_key.key())
-            .map(|(until, _val)| {
-                until
-                    .incremental_timerange()
-                    .expect("never create EventType::Absolute for written_size_delta")
-                    .end
-            });
-
-        // by default, use the last sent written_size as the basis for
-        // calculating the delta. if we don't yet have one, use the load time value.
-        let prev = cache
-            .get(&key)
-            .map(|(prev_at, prev)| {
-                // use the prev time from our last incremental update, or default to latest
-                // absolute update on the first round.
-                let prev_at = prev_at
-                    .absolute_time()
-                    .expect("never create EventType::Incremental for written_size");
-                let prev_at = last_stop_time.unwrap_or(prev_at);
-                (*prev_at, *prev)
-            })
-            .unwrap_or_else(|| {
-                // if we don't have a previous point of comparison, compare to the load time
-                // lsn.
-                let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
-                (DateTime::from(*loaded_at), disk_consistent_lsn.0)
-            });
-
-        // written_size_bytes_delta
-        metrics.extend(
-            if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
-                let up_to = written_size_now
-                    .0
-                    .absolute_time()
-                    .expect("never create EventType::Incremental for written_size");
-                let key_value = written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
-                Some(key_value)
-            } else {
-                None
-            },
-        );
-
-        // written_size
-        metrics.push((key, written_size_now));
-
-        if let Some(size) = self.current_exact_logical_size {
-            metrics.push(MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, size));
-        }
-    }
-}
-
 /// Caclculate synthetic size for each active tenant
 pub async fn calculate_synthetic_size_worker(
    synthetic_size_calculation_interval: Duration,
@@ -560,7 +343,7 @@ pub async fn calculate_synthetic_size_worker(
            _ = task_mgr::shutdown_watcher() => {
                return Ok(());
            },
-            tick_at = ticker.tick() => {
+        tick_at = ticker.tick() => {

                let tenants = match mgr::list_tenants().await {
                    Ok(tenants) => tenants,
@@ -596,149 +379,3 @@ pub async fn calculate_synthetic_size_worker(
        }
    }
 }
-
-#[cfg(test)]
-mod tests {
-    use std::collections::HashMap;
-
-    use std::time::SystemTime;
-    use utils::{
-        id::{TenantId, TimelineId},
-        lsn::Lsn,
-    };
-
-    use crate::consumption_metrics::MetricsKey;
-
-    use super::TimelineSnapshot;
-    use chrono::{DateTime, Utc};
-
-    #[test]
-    fn startup_collected_timeline_metrics_before_advancing() {
-        let tenant_id = TenantId::generate();
-        let timeline_id = TimelineId::generate();
-
-        let mut metrics = Vec::new();
-        let cache = HashMap::new();
-
-        let initdb_lsn = Lsn(0x10000);
-        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
-
-        let snap = TimelineSnapshot {
-            loaded_at: (disk_consistent_lsn, SystemTime::now()),
-            last_record_lsn: disk_consistent_lsn,
-            current_exact_logical_size: Some(0x42000),
-        };
-
-        let now = DateTime::<Utc>::from(SystemTime::now());
-
-        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
-
-        assert_eq!(
-            metrics,
-            &[
-                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
-                    snap.loaded_at.1.into(),
-                    now,
-                    0
-                ),
-                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
-            ]
-        );
-    }
-
-    #[test]
-    fn startup_collected_timeline_metrics_second_round() {
-        let tenant_id = TenantId::generate();
-        let timeline_id = TimelineId::generate();
-
-        let [now, before, init] = time_backwards();
-
-        let now = DateTime::<Utc>::from(now);
-        let before = DateTime::<Utc>::from(before);
-
-        let initdb_lsn = Lsn(0x10000);
-        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
-
-        let mut metrics = Vec::new();
-        let cache = HashMap::from([
-            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
-        ]);
-
-        let snap = TimelineSnapshot {
-            loaded_at: (disk_consistent_lsn, init),
-            last_record_lsn: disk_consistent_lsn,
-            current_exact_logical_size: Some(0x42000),
-        };
-
-        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
-
-        assert_eq!(
-            metrics,
-            &[
-                MetricsKey::written_size_delta(tenant_id, timeline_id)
-                    .from_previous_up_to(before, now, 0),
-                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
-            ]
-        );
-    }
-
-    #[test]
-    fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
-        let tenant_id = TenantId::generate();
-        let timeline_id = TimelineId::generate();
-
-        let [now, just_before, before, init] = time_backwards();
-
-        let now = DateTime::<Utc>::from(now);
-        let just_before = DateTime::<Utc>::from(just_before);
-        let before = DateTime::<Utc>::from(before);
-
-        let initdb_lsn = Lsn(0x10000);
-        let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
-
-        let mut metrics = Vec::new();
-        let cache = HashMap::from([
-            // at t=before was the last time the last_record_lsn changed
-            MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
-            // end time of this event is used for the next ones
-            MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
-                before,
-                just_before,
-                0,
-            ),
-        ]);
-
-        let snap = TimelineSnapshot {
-            loaded_at: (disk_consistent_lsn, init),
-            last_record_lsn: disk_consistent_lsn,
-            current_exact_logical_size: Some(0x42000),
-        };
-
-        snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
-
-        assert_eq!(
-            metrics,
-            &[
-                MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
-                    just_before,
-                    now,
-                    0
-                ),
-                MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-                MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
-            ]
-        );
-    }
-
-    fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
-        let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
-        times[0] = std::time::SystemTime::now();
-        for behind in 1..N {
-            times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
-        }
-
-        times
-    }
-}
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -85,7 +85,6 @@
 //! The solution is that all code paths are infected with precisely one
 //! [`RequestContext`] argument. Functions in the middle of the call chain
 //! only need to pass it on.
-
 use crate::task_mgr::TaskKind;

 // The main structure of this module, see module-level comment.
@@ -93,7 +92,6 @@ use crate::task_mgr::TaskKind;
 pub struct RequestContext {
    task_kind: TaskKind,
    download_behavior: DownloadBehavior,
-    access_stats_behavior: AccessStatsBehavior,
 }

 /// Desired behavior if the operation requires an on-demand download
@@ -111,67 +109,6 @@ pub enum DownloadBehavior {
    Error,
 }

-/// Whether this request should update access times used in LRU eviction
-#[derive(Clone, Copy, PartialEq, Eq, Debug)]
-pub(crate) enum AccessStatsBehavior {
-    /// Update access times: this request's access to data should be taken
-    /// as a hint that the accessed layer is likely to be accessed again
-    Update,
-
-    /// Do not update access times: this request is accessing the layer
-    /// but does not want to indicate that the layer should be retained in cache,
-    /// perhaps because the requestor is a compaction routine that will soon cover
-    /// this layer with another.
-    Skip,
-}
-
-pub struct RequestContextBuilder {
-    inner: RequestContext,
-}
-
-impl RequestContextBuilder {
-    /// A new builder with default settings
-    pub fn new(task_kind: TaskKind) -> Self {
-        Self {
-            inner: RequestContext {
-                task_kind,
-                download_behavior: DownloadBehavior::Download,
-                access_stats_behavior: AccessStatsBehavior::Update,
-            },
-        }
-    }
-
-    pub fn extend(original: &RequestContext) -> Self {
-        Self {
-            // This is like a Copy, but avoid implementing Copy because ordinary users of
-            // RequestContext should always move or ref it.
-            inner: RequestContext {
-                task_kind: original.task_kind,
-                download_behavior: original.download_behavior,
-                access_stats_behavior: original.access_stats_behavior,
-            },
-        }
-    }
-
-    /// Configure the DownloadBehavior of the context: whether to
-    /// download missing layers, and/or warn on the download.
-    pub fn download_behavior(mut self, b: DownloadBehavior) -> Self {
-        self.inner.download_behavior = b;
-        self
-    }
-
-    /// Configure the AccessStatsBehavior of the context: whether layer
-    /// accesses should update the access time of the layer.
-    pub(crate) fn access_stats_behavior(mut self, b: AccessStatsBehavior) -> Self {
-        self.inner.access_stats_behavior = b;
-        self
-    }
-
-    pub fn build(self) -> RequestContext {
-        self.inner
-    }
-}
-
 impl RequestContext {
    /// Create a new RequestContext that has no parent.
    ///
@@ -186,9 +123,10 @@ impl RequestContext {
    /// because someone explicitly canceled it.
    /// It has no parent, so it cannot inherit cancellation from there.
    pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
-        RequestContextBuilder::new(task_kind)
-            .download_behavior(download_behavior)
-            .build()
+        RequestContext {
+            task_kind,
+            download_behavior,
+        }
    }

    /// Create a detached child context for a task that may outlive `self`.
@@ -249,7 +187,10 @@ impl RequestContext {
    }

    fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
-        Self::new(task_kind, download_behavior)
+        RequestContext {
+            task_kind,
+            download_behavior,
+        }
    }

    pub fn task_kind(&self) -> TaskKind {
@@ -259,8 +200,4 @@ impl RequestContext {
    pub fn download_behavior(&self) -> DownloadBehavior {
        self.download_behavior
    }
-
-    pub(crate) fn access_stats_behavior(&self) -> AccessStatsBehavior {
-        self.access_stats_behavior
-    }
 }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -304,18 +304,17 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // Debug-log the list of candidates
    let now = SystemTime::now();
    for (i, (partition, candidate)) in candidates.iter().enumerate() {
-        let desc = candidate.layer.layer_desc();
        debug!(
            "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
            i + 1,
            candidates.len(),
-            desc.file_size,
+            candidate.layer.file_size(),
            now.duration_since(candidate.last_activity_ts)
                .unwrap()
                .as_micros(),
            partition,
-            desc.tenant_id,
-            desc.timeline_id,
+            candidate.layer.get_tenant_id(),
+            candidate.layer.get_timeline_id(),
            candidate.layer,
        );
    }
@@ -347,7 +346,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
            warned = Some(usage_planned);
        }

-        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
+        usage_planned.add_available_bytes(candidate.layer.file_size());

        batched
            .entry(TimelineKey(candidate.timeline))
@@ -390,16 +389,15 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                Ok(results) => {
                    assert_eq!(results.len(), batch.len());
                    for (result, layer) in results.into_iter().zip(batch.iter()) {
-                        let file_size = layer.layer_desc().file_size;
                        match result {
                            Some(Ok(())) => {
-                                usage_assumed.add_available_bytes(file_size);
+                                usage_assumed.add_available_bytes(layer.file_size());
                            }
                            Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
                                unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
                            }
                            Some(Err(EvictionError::FileNotFound)) => {
-                                evictions_failed.file_sizes += file_size;
+                                evictions_failed.file_sizes += layer.file_size();
                                evictions_failed.count += 1;
                            }
                            Some(Err(
@@ -408,7 +406,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                            )) => {
                                let e = utils::error::report_compact_sources(&e);
                                warn!(%layer, "failed to evict layer: {e}");
-                                evictions_failed.file_sizes += file_size;
+                                evictions_failed.file_sizes += layer.file_size();
                                evictions_failed.count += 1;
                            }
                            None => {
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -93,47 +93,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-    delete:
-      description: |
-        Attempts to delete specified tenant. 500 and 409 errors should be retried until 404 is retrieved.
-        404 means that deletion successfully finished"
-      responses:
-        "400":
-          description: Error when no tenant id found in path
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "404":
-          description: Tenant not found
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/NotFoundError"
-        "409":
-          description: Deletion is already in progress, continue polling
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ConflictError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"

  /v1/tenant/{tenant_id}/timeline:
    parameters:
@@ -861,7 +820,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-
  /v1/tenant/config:
    put:
      description: |
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -187,7 +187,7 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
                format!("Cannot delete timeline which has child timelines: {children:?}")
                    .into_boxed_str(),
            ),
-            a @ AlreadyInProgress(_) => ApiError::Conflict(a.to_string()),
+            a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
            Other(e) => ApiError::InternalServerError(e),
        }
    }
@@ -208,19 +208,6 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
    }
 }

-impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
-    fn from(value: crate::tenant::delete::DeleteTenantError) -> Self {
-        use crate::tenant::delete::DeleteTenantError::*;
-        match value {
-            Get(g) => ApiError::from(g),
-            e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
-            Timeline(t) => ApiError::from(t),
-            Other(o) => ApiError::InternalServerError(o),
-            e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
-        }
-    }
-}
-
 // Helper function to construct a TimelineInfo struct for a timeline
 async fn build_timeline_info(
    timeline: &Arc<Timeline>,
@@ -630,23 +617,6 @@ async fn tenant_status(
    json_response(StatusCode::OK, tenant_info)
 }

-async fn tenant_delete_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    // TODO openapi spec
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
-
-    let state = get_state(&request);
-
-    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id)
-        .instrument(info_span!("tenant_delete_handler", %tenant_id))
-        .await?;
-
-    json_response(StatusCode::ACCEPTED, ())
-}
-
 /// HTTP endpoint to query the current tenant_size of a tenant.
 ///
 /// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used
@@ -1375,9 +1345,6 @@ pub fn make_router(
        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
-        .delete("/v1/tenant/:tenant_id", |r| {
-            api_handler(r, tenant_delete_handler)
-        })
        .get("/v1/tenant/:tenant_id/synthetic_size", |r| {
            api_handler(r, tenant_size_handler)
        })
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -7,7 +7,7 @@ pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub mod keyspace;
-pub mod metrics;
+pub(crate) mod metrics;
 pub mod page_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
@@ -47,54 +47,50 @@ pub use crate::metrics::preinitialize_metrics;

 #[tracing::instrument]
 pub async fn shutdown_pageserver(exit_code: i32) {
-    use std::time::Duration;
    // Shut down the libpq endpoint task. This prevents new connections from
    // being accepted.
-    timed(
-        task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None),
-        "shutdown LibpqEndpointListener",
-        Duration::from_secs(1),
-    )
-    .await;
+    task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await;

    // Shut down any page service tasks.
-    timed(
-        task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
-        "shutdown PageRequestHandlers",
-        Duration::from_secs(1),
-    )
-    .await;
+    task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await;

    // Shut down all the tenants. This flushes everything to disk and kills
    // the checkpoint and GC tasks.
-    timed(
-        tenant::mgr::shutdown_all_tenants(),
-        "shutdown all tenants",
-        Duration::from_secs(5),
-    )
-    .await;
+    tenant::mgr::shutdown_all_tenants().await;

    // Shut down the HTTP endpoint last, so that you can still check the server's
    // status while it's shutting down.
    // FIXME: We should probably stop accepting commands like attach/detach earlier.
-    timed(
-        task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None),
-        "shutdown http",
-        Duration::from_secs(1),
-    )
-    .await;
+    task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await;

    // There should be nothing left, but let's be sure
-    timed(
-        task_mgr::shutdown_tasks(None, None, None),
-        "shutdown leftovers",
-        Duration::from_secs(1),
-    )
-    .await;
+    task_mgr::shutdown_tasks(None, None, None).await;
    info!("Shut down successfully completed");
    std::process::exit(exit_code);
 }

+const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
+const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
+
+async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
+    let backoff_duration_seconds =
+        exponential_backoff_duration_seconds(n, base_increment, max_seconds);
+    if backoff_duration_seconds > 0.0 { // 0.0 is what n == 0 yields: no log line, no sleep
+        info!(
+            "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
+        );
+        tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
+    }
+}
+
+pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
+    match n {
+        // The very first attempt (n == 0) is never delayed.
+        0 => 0.0,
+        retries => (1.0 + base_increment).powf(f64::from(retries)).min(max_seconds),
+    }
+}
+
 /// The name of the metadata file pageserver creates per timeline.
 /// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
 pub const METADATA_FILE_NAME: &str = "metadata";
@@ -168,7 +164,7 @@ pub struct InitializationOrder {

    /// Each timeline owns a clone of this to be consumed on the initial logical size calculation
    /// attempt. It is important to drop this once the attempt has completed.
-    pub initial_logical_size_attempt: Option<utils::completion::Completion>,
+    pub initial_logical_size_attempt: utils::completion::Completion,

    /// Barrier for when we can start any background jobs.
    ///
@@ -176,75 +172,33 @@ pub struct InitializationOrder {
    pub background_jobs_can_start: utils::completion::Barrier,
 }

-/// Time the future with a warning when it exceeds a threshold.
-async fn timed<Fut: std::future::Future>(
-    fut: Fut,
-    name: &str,
-    warn_at: std::time::Duration,
-) -> <Fut as std::future::Future>::Output {
-    let started = std::time::Instant::now();
-
-    let mut fut = std::pin::pin!(fut);
-
-    match tokio::time::timeout(warn_at, &mut fut).await {
-        Ok(ret) => {
-            tracing::info!(
-                task = name,
-                elapsed_ms = started.elapsed().as_millis(),
-                "completed"
-            );
-            ret
-        }
-        Err(_) => {
-            tracing::info!(
-                task = name,
-                elapsed_ms = started.elapsed().as_millis(),
-                "still waiting, taking longer than expected..."
-            );
-
-            let ret = fut.await;
-
-            // this has a global allowed_errors
-            tracing::warn!(
-                task = name,
-                elapsed_ms = started.elapsed().as_millis(),
-                "completed, took longer than expected"
-            );
-
-            ret
-        }
-    }
-}
-
 #[cfg(test)]
-mod timed_tests {
-    use super::timed;
-    use std::time::Duration;
+mod backoff_defaults_tests {
+    use super::*;

-    #[tokio::test]
-    async fn timed_completes_when_inner_future_completes() {
-        // A future that completes on time should have its result returned
-        let r1 = timed(
-            async move {
-                tokio::time::sleep(Duration::from_millis(10)).await;
-                123
-            },
-            "test 1",
-            Duration::from_millis(50),
-        )
-        .await;
-        assert_eq!(r1, 123);
+    #[test]
+    fn backoff_defaults_produce_growing_backoff_sequence() {
+        let mut current_backoff_value = None;

-        // A future that completes too slowly should also have its result returned
-        let r1 = timed(
-            async move {
-                tokio::time::sleep(Duration::from_millis(50)).await;
-                456
-            },
-            "test 1",
-            Duration::from_millis(10),
-        )
-        .await;
-        assert_eq!(r1, 456);
+        for i in 0..10_000 {
+            let new_backoff_value = exponential_backoff_duration_seconds(
+                i,
+                DEFAULT_BASE_BACKOFF_SECONDS,
+                DEFAULT_MAX_BACKOFF_SECONDS,
+            );
+
+            if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
+                assert!(
+                    old_backoff_value <= new_backoff_value,
+                    "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
+                )
+            }
+        }
+
+        assert_eq!(
+            current_backoff_value.expect("Should have produced backoff values to compare"),
+            DEFAULT_MAX_BACKOFF_SECONDS,
+            "Given big enough of retries, backoff should reach its allowed max value"
+        );
    }
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,9 +1,9 @@
 use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
-    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
-    register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
-    register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
-    HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
+    register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
+    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge,
+    register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter,
+    IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
 use strum::VariantNames;
@@ -394,35 +394,6 @@ pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(||
    .expect("failed to define a metric")
 });

-/// How long did we take to start up?  Broken down by labels to describe
-/// different phases of startup.
-pub static STARTUP_DURATION: Lazy<GaugeVec> = Lazy::new(|| {
-    register_gauge_vec!(
-        "pageserver_startup_duration_seconds",
-        "Time taken by phases of pageserver startup, in seconds",
-        &["phase"]
-    )
-    .expect("Failed to register pageserver_startup_duration_seconds metric")
-});
-
-pub static STARTUP_IS_LOADING: Lazy<UIntGauge> = Lazy::new(|| {
-    register_uint_gauge!(
-        "pageserver_startup_is_loading",
-        "1 while in initial startup load of tenants, 0 at other times"
-    )
-    .expect("Failed to register pageserver_startup_is_loading")
-});
-
-/// How long did tenants take to go from construction to active state?
-pub(crate) static TENANT_ACTIVATION: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "pageserver_tenant_activation_seconds",
-        "Time taken by tenants to activate, in seconds",
-        CRITICAL_OP_BUCKETS.into()
-    )
-    .expect("Failed to register pageserver_tenant_activation_seconds metric")
-});
-
 /// Each `Timeline`'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
 #[derive(Debug)]
 pub struct EvictionsWithLowResidenceDuration {
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -53,7 +53,7 @@ use utils::{
    lsn::Lsn,
 };

-use crate::tenant::{block_io, ephemeral_file, writeback_ephemeral_file};
+use crate::tenant::writeback_ephemeral_file;
 use crate::{metrics::PageCacheSizeMetrics, repository::Key};

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -98,11 +98,11 @@ enum CacheKey {
        lsn: Lsn,
    },
    EphemeralPage {
-        file_id: ephemeral_file::FileId,
+        file_id: u64,
        blkno: u32,
    },
    ImmutableFilePage {
-        file_id: block_io::FileId,
+        file_id: u64,
        blkno: u32,
    },
 }
@@ -177,9 +177,9 @@ pub struct PageCache {
    /// can have a separate mapping map, next to this field.
    materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,

-    ephemeral_page_map: RwLock<HashMap<(ephemeral_file::FileId, u32), usize>>,
+    ephemeral_page_map: RwLock<HashMap<(u64, u32), usize>>,

-    immutable_page_map: RwLock<HashMap<(block_io::FileId, u32), usize>>,
+    immutable_page_map: RwLock<HashMap<(u64, u32), usize>>,

    /// The actual buffers with their metadata.
    slots: Box<[Slot]>,
@@ -390,28 +390,20 @@ impl PageCache {

    // Section 1.2: Public interface functions for working with Ephemeral pages.

-    pub fn read_ephemeral_buf(
-        &self,
-        file_id: ephemeral_file::FileId,
-        blkno: u32,
-    ) -> anyhow::Result<ReadBufResult> {
+    pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
        let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };

        self.lock_for_read(&mut cache_key)
    }

-    pub fn write_ephemeral_buf(
-        &self,
-        file_id: ephemeral_file::FileId,
-        blkno: u32,
-    ) -> anyhow::Result<WriteBufResult> {
+    pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<WriteBufResult> {
        let cache_key = CacheKey::EphemeralPage { file_id, blkno };

        self.lock_for_write(&cache_key)
    }

    /// Immediately drop all buffers belonging to given file, without writeback
-    pub fn drop_buffers_for_ephemeral(&self, drop_file_id: ephemeral_file::FileId) {
+    pub fn drop_buffers_for_ephemeral(&self, drop_file_id: u64) {
        for slot_idx in 0..self.slots.len() {
            let slot = &self.slots[slot_idx];

@@ -432,18 +424,14 @@ impl PageCache {

    // Section 1.3: Public interface functions for working with immutable file pages.

-    pub fn read_immutable_buf(
-        &self,
-        file_id: block_io::FileId,
-        blkno: u32,
-    ) -> anyhow::Result<ReadBufResult> {
+    pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
        let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };

        self.lock_for_read(&mut cache_key)
    }

    /// Immediately drop all buffers belonging to given file, without writeback
-    pub fn drop_buffers_for_immutable(&self, drop_file_id: block_io::FileId) {
+    pub fn drop_buffers_for_immutable(&self, drop_file_id: u64) {
        for slot_idx in 0..self.slots.len() {
            let slot = &self.slots[slot_idx];

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -28,7 +28,6 @@ use std::cmp::min;
 use std::collections::hash_map::Entry;
 use std::collections::BTreeSet;
 use std::collections::HashMap;
-use std::fmt::Debug;
 use std::fs;
 use std::fs::File;
 use std::fs::OpenOptions;
@@ -47,10 +46,9 @@ use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};

 use self::config::TenantConf;
-use self::delete::DeleteTenantFlow;
+use self::delete::DeleteTimelineFlow;
 use self::metadata::LoadMetadataError;
 use self::metadata::TimelineMetadata;
-use self::mgr::TenantsMap;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineUninitMark;
 use self::timeline::uninit::UninitializedTimeline;
@@ -59,7 +57,6 @@ use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir;
 use crate::is_uninit_mark;
-use crate::metrics::TENANT_ACTIVATION;
 use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
 use crate::repository::GcResult;
 use crate::task_mgr;
@@ -73,7 +70,6 @@ use crate::tenant::storage_layer::ImageLayer;
 use crate::tenant::storage_layer::Layer;
 use crate::InitializationOrder;

-use crate::tenant::timeline::delete::DeleteTimelineFlow;
 use crate::tenant::timeline::uninit::cleanup_timeline_directory;
 use crate::virtual_file::VirtualFile;
 use crate::walredo::PostgresRedoManager;
@@ -109,7 +105,6 @@ macro_rules! pausable_failpoint {

 pub mod blob_io;
 pub mod block_io;
-
 pub mod disk_btree;
 pub(crate) mod ephemeral_file;
 pub mod layer_map;
@@ -150,8 +145,6 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines";

 pub const TENANT_ATTACHING_MARKER_FILENAME: &str = "attaching";

-pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
-
 ///
 /// Tenant consists of multiple timelines. Keep them in a hash table.
 ///
@@ -190,8 +183,6 @@ pub struct Tenant {
    cached_synthetic_tenant_size: Arc<AtomicU64>,

    eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
-
-    pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
 }

 // We should not blindly overwrite local metadata with remote one.
@@ -283,7 +274,7 @@ pub enum LoadLocalTimelineError {
    ResumeDeletion(#[source] anyhow::Error),
 }

-#[derive(thiserror::Error)]
+#[derive(Debug, thiserror::Error)]
 pub enum DeleteTimelineError {
    #[error("NotFound")]
    NotFound,
@@ -292,37 +283,17 @@ pub enum DeleteTimelineError {
    HasChildren(Vec<TimelineId>),

    #[error("Timeline deletion is already in progress")]
-    AlreadyInProgress(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>),
+    AlreadyInProgress,

    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }

-impl Debug for DeleteTimelineError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            Self::NotFound => write!(f, "NotFound"),
-            Self::HasChildren(c) => f.debug_tuple("HasChildren").field(c).finish(),
-            Self::AlreadyInProgress(_) => f.debug_tuple("AlreadyInProgress").finish(),
-            Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
-        }
-    }
-}
-
 pub enum SetStoppingError {
    AlreadyStopping(completion::Barrier),
    Broken,
 }

-impl Debug for SetStoppingError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            Self::AlreadyStopping(_) => f.debug_tuple("AlreadyStopping").finish(),
-            Self::Broken => write!(f, "Broken"),
-        }
-    }
-}
-
 struct RemoteStartupData {
    index_part: IndexPart,
    remote_metadata: TimelineMetadata,
@@ -645,7 +616,7 @@ impl Tenant {
        // For every timeline, download the metadata file, scan the local directory,
        // and build a layer map that contains an entry for each remote and local
        // layer file.
-        let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?;
+        let sorted_timelines = tree_sort_timelines(timeline_ancestors)?;
        for (timeline_id, remote_metadata) in sorted_timelines {
            let (index_part, remote_client) = remote_index_and_client
                .remove(&timeline_id)
@@ -674,19 +645,20 @@ impl Tenant {
        Ok(())
    }

-    /// Get sum of all remote timelines sizes
+    /// Get the total size of all remote timelines
    ///
    /// This function relies on the index_part instead of listing the remote storage
-    pub fn remote_size(&self) -> u64 {
+    ///
+    pub async fn get_remote_size(&self) -> anyhow::Result<u64> {
        let mut size = 0;

-        for timeline in self.list_timelines() {
+        for timeline in self.list_timelines().iter() {
            if let Some(remote_client) = &timeline.remote_client {
                size += remote_client.get_remote_physical_size();
            }
        }

-        size
+        Ok(size)
    }

    #[instrument(skip_all, fields(timeline_id=%timeline_id))]
@@ -769,13 +741,12 @@ impl Tenant {
    /// If the loading fails for some reason, the Tenant will go into Broken
    /// state.
    #[instrument(skip_all, fields(tenant_id=%tenant_id))]
-    pub(crate) fn spawn_load(
+    pub fn spawn_load(
        conf: &'static PageServerConf,
        tenant_id: TenantId,
        broker_client: storage_broker::BrokerClientChannel,
        remote_storage: Option<GenericRemoteStorage>,
        init_order: Option<InitializationOrder>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        ctx: &RequestContext,
    ) -> Arc<Tenant> {
        span::debug_assert_current_span_has_tenant_id();
@@ -795,7 +766,7 @@ impl Tenant {
            tenant_conf,
            wal_redo_manager,
            tenant_id,
-            remote_storage.clone(),
+            remote_storage,
        );
        let tenant = Arc::new(tenant);

@@ -811,83 +782,27 @@ impl Tenant {
            "initial tenant load",
            false,
            async move {
-                let make_broken = |t: &Tenant, err: anyhow::Error| {
-                    error!("load failed, setting tenant state to Broken: {err:?}");
-                    t.state.send_modify(|state| {
-                        assert!(
-                            matches!(*state, TenantState::Loading | TenantState::Stopping { .. }),
-                            "the loading task owns the tenant state until activation is complete"
-                        );
-                        *state = TenantState::broken_from_reason(err.to_string());
-                    });
-                };
-
                let mut init_order = init_order;

                // take the completion because initial tenant loading will complete when all of
                // these tasks complete.
-                let _completion = init_order
-                    .as_mut()
-                    .and_then(|x| x.initial_tenant_load.take());
-
-                // Dont block pageserver startup on figuring out deletion status
-                let pending_deletion = {
-                    match DeleteTenantFlow::should_resume_deletion(
-                        conf,
-                        remote_storage.as_ref(),
-                        &tenant_clone,
-                    )
-                    .await
-                    {
-                        Ok(should_resume_deletion) => should_resume_deletion,
-                        Err(err) => {
-                            make_broken(&tenant_clone, anyhow::anyhow!(err));
-                            return Ok(());
-                        }
-                    }
-                };
-
-                info!("pending deletion {}", pending_deletion.is_some());
-
-                if let Some(deletion) = pending_deletion {
-                    // as we are no longer loading, signal completion by dropping
-                    // the completion while we resume deletion
-                    drop(_completion);
-                    // do not hold to initial_logical_size_attempt as it will prevent loading from proceeding without timeout
-                    let _ = init_order
-                        .as_mut()
-                        .and_then(|x| x.initial_logical_size_attempt.take());
-
-                    match DeleteTenantFlow::resume(
-                        deletion,
-                        &tenant_clone,
-                        init_order.as_ref(),
-                        tenants,
-                        &ctx,
-                    )
-                    .await
-                    {
-                        Err(err) => {
-                            make_broken(&tenant_clone, anyhow::anyhow!(err));
-                            return Ok(());
-                        }
-                        Ok(()) => return Ok(()),
-                    }
-                }
-
-                let background_jobs_can_start =
-                    init_order.as_ref().map(|x| &x.background_jobs_can_start);
+                let _completion = init_order.as_mut().and_then(|x| x.initial_tenant_load.take());

                match tenant_clone.load(init_order.as_ref(), &ctx).await {
                    Ok(()) => {
-                        debug!("load finished",);
-
+                        debug!("load finished, activating");
+                        let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
                        tenant_clone.activate(broker_client, background_jobs_can_start, &ctx);
                    }
-                    Err(err) => make_broken(&tenant_clone, err),
+                    Err(err) => {
+                        error!("load failed, setting tenant state to Broken: {err:?}");
+                        tenant_clone.state.send_modify(|state| {
+                            assert_eq!(*state, TenantState::Loading, "the loading task owns the tenant state until activation is complete");
+                            *state = TenantState::broken_from_reason(err.to_string());
+                        });
+                    }
                }
-
-                Ok(())
+               Ok(())
            }
            .instrument({
                let span = tracing::info_span!(parent: None, "load", tenant_id=%tenant_id);
@@ -963,8 +878,6 @@ impl Tenant {
                        )
                    })?;

-                info!("Found deletion mark for timeline {}", timeline_id);
-
                match load_metadata(self.conf, &self.tenant_id, &timeline_id) {
                    Ok(metadata) => {
                        timelines_to_resume_deletion.push((timeline_id, Some(metadata)))
@@ -1054,11 +967,9 @@ impl Tenant {

        // Sort the array of timeline IDs into tree-order, so that parent comes before
        // all its children.
-        tree_sort_timelines(timelines_to_load, |m| m.ancestor_timeline()).map(|sorted_timelines| {
-            TenantDirectoryScan {
-                sorted_timelines_to_load: sorted_timelines,
-                timelines_to_resume_deletion,
-            }
+        tree_sort_timelines(timelines_to_load).map(|sorted_timelines| TenantDirectoryScan {
+            sorted_timelines_to_load: sorted_timelines,
+            timelines_to_resume_deletion,
        })
    }

@@ -1104,9 +1015,8 @@ impl Tenant {
            {
                match e {
                    LoadLocalTimelineError::Load(source) => {
-                        return Err(anyhow::anyhow!(source)).with_context(|| {
-                            format!("Failed to load local timeline: {timeline_id}")
-                        })
+                        return Err(anyhow::anyhow!(source)
+                            .context(format!("Failed to load local timeline: {timeline_id}")))
                    }
                    LoadLocalTimelineError::ResumeDeletion(source) => {
                        // Make sure resumed deletion wont fail loading for entire tenant.
@@ -1731,8 +1641,6 @@ impl Tenant {
                    post_state = <&'static str>::from(&*current_state),
                    "activation attempt finished"
                );
-
-                TENANT_ACTIVATION.observe(elapsed.as_secs_f64());
            });
        }
    }
@@ -1773,7 +1681,7 @@ impl Tenant {
        // It's mesed up.
        // we just ignore the failure to stop

-        match self.set_stopping(shutdown_progress, false).await {
+        match self.set_stopping(shutdown_progress).await {
            Ok(()) => {}
            Err(SetStoppingError::Broken) => {
                // assume that this is acceptable
@@ -1813,25 +1721,18 @@ impl Tenant {
    /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state.
    ///
    /// This function is not cancel-safe!
-    ///
-    /// `allow_transition_from_loading` is needed for the special case of loading task deleting the tenant.
-    async fn set_stopping(
-        &self,
-        progress: completion::Barrier,
-        allow_transition_from_loading: bool,
-    ) -> Result<(), SetStoppingError> {
+    async fn set_stopping(&self, progress: completion::Barrier) -> Result<(), SetStoppingError> {
        let mut rx = self.state.subscribe();

        // cannot stop before we're done activating, so wait out until we're done activating
        rx.wait_for(|state| match state {
-            TenantState::Activating(_) | TenantState::Attaching => {
+            TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
                info!(
                    "waiting for {} to turn Active|Broken|Stopping",
                    <&'static str>::from(state)
                );
                false
            }
-            TenantState::Loading => allow_transition_from_loading,
            TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
        })
        .await
@@ -1840,16 +1741,9 @@ impl Tenant {
        // we now know we're done activating, let's see whether this task is the winner to transition into Stopping
        let mut err = None;
        let stopping = self.state.send_if_modified(|current_state| match current_state {
-            TenantState::Activating(_) | TenantState::Attaching => {
+            TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
                unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
            }
-            TenantState::Loading => {
-                if !allow_transition_from_loading {
-                    unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
-                };
-                *current_state = TenantState::Stopping { progress };
-                true
-            }
            TenantState::Active => {
                // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
                // are created after the transition to Stopping. That's harmless, as the Timelines
@@ -1918,10 +1812,6 @@ impl Tenant {
        .expect("cannot drop self.state while on a &self method");

        // we now know we're done activating, let's see whether this task is the winner to transition into Broken
-        self.set_broken_no_wait(reason)
-    }
-
-    pub(crate) fn set_broken_no_wait(&self, reason: String) {
        self.state.send_modify(|current_state| {
            match *current_state {
                TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
@@ -1987,28 +1877,22 @@ impl Tenant {
 /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
 /// perform a topological sort, so that the parent of each timeline comes
 /// before the children.
-/// E extracts the ancestor from T
-/// This allows for T to be different. It can be TimelineMetadata, can be Timeline itself, etc.
-fn tree_sort_timelines<T, E>(
-    timelines: HashMap<TimelineId, T>,
-    extractor: E,
-) -> anyhow::Result<Vec<(TimelineId, T)>>
-where
-    E: Fn(&T) -> Option<TimelineId>,
-{
+fn tree_sort_timelines(
+    timelines: HashMap<TimelineId, TimelineMetadata>,
+) -> anyhow::Result<Vec<(TimelineId, TimelineMetadata)>> {
    let mut result = Vec::with_capacity(timelines.len());

    let mut now = Vec::with_capacity(timelines.len());
    // (ancestor, children)
-    let mut later: HashMap<TimelineId, Vec<(TimelineId, T)>> =
+    let mut later: HashMap<TimelineId, Vec<(TimelineId, TimelineMetadata)>> =
        HashMap::with_capacity(timelines.len());

-    for (timeline_id, value) in timelines {
-        if let Some(ancestor_id) = extractor(&value) {
+    for (timeline_id, metadata) in timelines {
+        if let Some(ancestor_id) = metadata.ancestor_timeline() {
            let children = later.entry(ancestor_id).or_default();
-            children.push((timeline_id, value));
+            children.push((timeline_id, metadata));
        } else {
-            now.push((timeline_id, value));
+            now.push((timeline_id, metadata));
        }
    }

@@ -2177,7 +2061,7 @@ impl Tenant {
            remote_client,
            pg_version,
            initial_logical_size_can_start.cloned(),
-            initial_logical_size_attempt.cloned().flatten(),
+            initial_logical_size_attempt.cloned(),
            state,
        );

@@ -2261,7 +2145,6 @@ impl Tenant {
            cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
            cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
            eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
-            delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
        }
    }

@@ -2278,7 +2161,6 @@ impl Tenant {
        // FIXME If the config file is not found, assume that we're attaching
        // a detached tenant and config is passed via attach command.
        // https://github.com/neondatabase/neon/issues/1555
-        // OR: we're loading after incomplete deletion that managed to remove config.
        if !target_config_path.exists() {
            info!("tenant config not found in {target_config_display}");
            return Ok(TenantConfOpt::default());
@@ -3008,7 +2890,7 @@ impl Tenant {
            .set(size);
    }

-    pub fn cached_synthetic_size(&self) -> u64 {
+    pub fn get_cached_synthetic_size(&self) -> u64 {
        self.cached_synthetic_tenant_size.load(Ordering::Relaxed)
    }
 }
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -21,15 +21,15 @@ where
    R: BlockReader,
 {
    /// Read a blob into a new buffer.
-    pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
+    pub fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
        let mut buf = Vec::new();
-        self.read_blob_into_buf(offset, &mut buf).await?;
+        self.read_blob_into_buf(offset, &mut buf)?;
        Ok(buf)
    }
    /// Read blob into the given buffer. Any previous contents in the buffer
    /// are overwritten.
-    pub async fn read_blob_into_buf(
-        &self,
+    pub fn read_blob_into_buf(
+        &mut self,
        offset: u64,
        dstbuf: &mut Vec<u8>,
    ) -> Result<(), std::io::Error> {
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -2,7 +2,8 @@
 //! Low-level Block-oriented I/O functions
 //!

-use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
+use crate::page_cache;
+use crate::page_cache::{ReadBufResult, PAGE_SZ};
 use bytes::Bytes;
 use std::ops::{Deref, DerefMut};
 use std::os::unix::fs::FileExt;
@@ -14,12 +15,14 @@ use std::sync::atomic::AtomicU64;
 /// There are currently two implementations: EphemeralFile, and FileBlockReader
 /// below.
 pub trait BlockReader {
+    type BlockLease: Deref<Target = [u8; PAGE_SZ]> + 'static;
+
    ///
    /// Read a block. Returns a "lease" object that can be used to
    /// access to the contents of the page. (For the page cache, the
    /// lease object represents a lock on the buffer.)
    ///
-    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error>;
+    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error>;

    ///
    /// Create a new "cursor" for reading from this reader.
@@ -38,48 +41,13 @@ impl<B> BlockReader for &B
 where
    B: BlockReader,
 {
-    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+    type BlockLease = B::BlockLease;
+
+    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
        (*self).read_blk(blknum)
    }
 }

-/// A block accessible for reading
-///
-/// During builds with `#[cfg(test)]`, this is a proper enum
-/// with two variants to support testing code. During normal
-/// builds, it just has one variant and is thus a cheap newtype
-/// wrapper of [`PageReadGuard`]
-pub enum BlockLease {
-    PageReadGuard(PageReadGuard<'static>),
-    #[cfg(test)]
-    Rc(std::rc::Rc<[u8; PAGE_SZ]>),
-}
-
-impl From<PageReadGuard<'static>> for BlockLease {
-    fn from(value: PageReadGuard<'static>) -> Self {
-        BlockLease::PageReadGuard(value)
-    }
-}
-
-#[cfg(test)]
-impl From<std::rc::Rc<[u8; PAGE_SZ]>> for BlockLease {
-    fn from(value: std::rc::Rc<[u8; PAGE_SZ]>) -> Self {
-        BlockLease::Rc(value)
-    }
-}
-
-impl Deref for BlockLease {
-    type Target = [u8; PAGE_SZ];
-
-    fn deref(&self) -> &Self::Target {
-        match self {
-            BlockLease::PageReadGuard(v) => v.deref(),
-            #[cfg(test)]
-            BlockLease::Rc(v) => v.deref(),
-        }
-    }
-}
-
 ///
 /// A "cursor" for efficiently reading multiple pages from a BlockReader
 ///
@@ -112,17 +80,11 @@ where
        BlockCursor { reader }
    }

-    pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+    pub fn read_blk(&mut self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
        self.reader.read_blk(blknum)
    }
 }
 static NEXT_ID: AtomicU64 = AtomicU64::new(1);
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
-pub struct FileId(u64);
-
-fn next_file_id() -> FileId {
-    FileId(NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed))
-}

 /// An adapter for reading a (virtual) file using the page cache.
 ///
@@ -132,7 +94,7 @@ pub struct FileBlockReader<F> {
    pub file: F,

    /// Unique ID of this file, used as key in the page cache.
-    file_id: FileId,
+    file_id: u64,
 }

 impl<F> FileBlockReader<F>
@@ -140,7 +102,7 @@ where
    F: FileExt,
 {
    pub fn new(file: F) -> Self {
-        let file_id = next_file_id();
+        let file_id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);

        FileBlockReader { file_id, file }
    }
@@ -156,7 +118,9 @@ impl<F> BlockReader for FileBlockReader<F>
 where
    F: FileExt,
 {
-    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+    type BlockLease = page_cache::PageReadGuard<'static>;
+
+    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
        // Look up the right page
        let cache = page_cache::get();
        loop {
@@ -168,7 +132,7 @@ where
                        format!("Failed to read immutable buf: {e:#}"),
                    )
                })? {
-                ReadBufResult::Found(guard) => break Ok(guard.into()),
+                ReadBufResult::Found(guard) => break Ok(guard),
                ReadBufResult::NotFound(mut write_guard) => {
                    // Read the page from disk into the buffer
                    self.fill_buffer(write_guard.deref_mut(), blknum)?;
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -1,99 +1,110 @@
 use std::{
-    path::{Path, PathBuf},
+    ops::{Deref, DerefMut},
    sync::Arc,
 };

 use anyhow::Context;
-use pageserver_api::models::TenantState;
-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
+use pageserver_api::models::TimelineState;
 use tokio::sync::OwnedMutexGuard;
-use tracing::{error, info, instrument, warn, Instrument, Span};
-
+use tracing::{debug, error, info, instrument, warn, Instrument, Span};
 use utils::{
-    backoff, completion, crashsafe, fs_ext,
+    crashsafe, fs_ext,
    id::{TenantId, TimelineId},
 };

 use crate::{
    config::PageServerConf,
-    context::RequestContext,
    task_mgr::{self, TaskKind},
+    tenant::{remote_timeline_client, DeleteTimelineError},
    InitializationOrder,
 };

 use super::{
-    mgr::{GetTenantError, TenantsMap},
-    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
-    span,
-    timeline::delete::DeleteTimelineFlow,
-    tree_sort_timelines, DeleteTimelineError, Tenant,
+    metadata::TimelineMetadata,
+    remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
+    CreateTimelineCause, Tenant, Timeline,
 };

-const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;
+/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
+async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
+    // Stop the walreceiver first.
+    debug!("waiting for wal receiver to shutdown");
+    let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
+    if let Some(walreceiver) = maybe_started_walreceiver {
+        walreceiver.stop().await;
+    }
+    debug!("wal receiver shutdown confirmed");

-#[derive(Debug, thiserror::Error)]
-pub enum DeleteTenantError {
-    #[error("GetTenant {0}")]
-    Get(#[from] GetTenantError),
+    // Prevent new uploads from starting.
+    if let Some(remote_client) = timeline.remote_client.as_ref() {
+        let res = remote_client.stop();
+        match res {
+            Ok(()) => {}
+            Err(e) => match e {
+                remote_timeline_client::StopError::QueueUninitialized => {
+                    // This case shouldn't happen currently because the
+                    // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
+                    // That is, before we declare the Tenant as Active.
+                    // But we only allow calls to delete_timeline on Active tenants.
+                    return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
+                }
+            },
+        }
+    }

-    #[error("Invalid state {0}. Expected Active or Broken")]
-    InvalidState(TenantState),
-
-    #[error("Tenant deletion is already in progress")]
-    AlreadyInProgress,
-
-    #[error("Timeline {0}")]
-    Timeline(#[from] DeleteTimelineError),
-
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
-}
-
-type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
-
-fn remote_tenant_delete_mark_path(
-    conf: &PageServerConf,
-    tenant_id: &TenantId,
-) -> anyhow::Result<RemotePath> {
-    let tenant_remote_path = conf
-        .tenant_path(tenant_id)
-        .strip_prefix(&conf.workdir)
-        .context("Failed to strip workdir prefix")
-        .and_then(RemotePath::new)
-        .context("tenant path")?;
-    Ok(tenant_remote_path.join(Path::new("deleted")))
-}
-
-async fn create_remote_delete_mark(
-    conf: &PageServerConf,
-    remote_storage: &GenericRemoteStorage,
-    tenant_id: &TenantId,
-) -> Result<(), DeleteTenantError> {
-    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;
-
-    let data: &[u8] = &[];
-    backoff::retry(
-        || async {
-            remote_storage
-                .upload(data, 0, &remote_mark_path, None)
-                .await
-        },
-        |_e| false,
-        FAILED_UPLOAD_WARN_THRESHOLD,
-        FAILED_REMOTE_OP_RETRIES,
-        "mark_upload",
-    )
-    .await
-    .context("mark_upload")?;
+    // Stop & wait for the remaining timeline tasks, including upload tasks.
+    // NB: This and other delete_timeline calls do not run as a task_mgr task,
+    //     so, they are not affected by this shutdown_tasks() call.
+    info!("waiting for timeline tasks to shutdown");
+    task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await;

+    fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: timeline-delete-before-index-deleted-at"
+        ))?
+    });
    Ok(())
 }

-async fn create_local_delete_mark(
+/// Mark timeline as deleted in S3 so we won't pick it up next time
+/// during attach or pageserver restart.
+/// See comment in persist_index_part_with_deleted_flag.
+async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
+    if let Some(remote_client) = timeline.remote_client.as_ref() {
+        match remote_client.persist_index_part_with_deleted_flag().await {
+            // If we (now, or already) marked it successfully as deleted, we can proceed
+            Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
+            // Bail out otherwise
+            //
+            // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
+            // two tasks from performing the deletion at the same time. The first task
+            // that starts deletion should run it to completion.
+            Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
+            | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
+                return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
+            }
+        }
+    }
+    Ok(())
+}
+
+// We delete local files first, so if pageserver restarts after local files deletion then remote deletion is not continued.
+// This can be solved with inversion of these steps. But even if these steps are inverted then, when index_part.json
+// gets deleted there is no way to distinguish between "this timeline is good, we just didn't upload it to remote"
+// and "this timeline is deleted we should continue with removal of local state". So to avoid the ambiguity we use a mark file.
+// After the index part is deleted, the presence of this mark file indicates that deletion was intended.
+// So we can just remove the mark file.
+async fn create_delete_mark(
    conf: &PageServerConf,
-    tenant_id: &TenantId,
-) -> Result<(), DeleteTenantError> {
-    let marker_path = conf.tenant_deleted_mark_file_path(tenant_id);
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+) -> Result<(), DeleteTimelineError> {
+    fail::fail_point!("timeline-delete-before-delete-mark", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: timeline-delete-before-delete-mark"
+        ))?
+    });
+    let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id);

    // Note: we're ok to replace existing file.
    let _ = std::fs::OpenOptions::new()
@@ -103,259 +114,271 @@ async fn create_local_delete_mark(
        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;

    crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
-
    Ok(())
 }

-async fn schedule_ordered_timeline_deletions(
-    tenant: &Arc<Tenant>,
-) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
-    // Tenant is stopping at this point. We know it will be deleted.
-    // No new timelines should be created.
-    // Tree sort timelines to delete from leafs to the root.
-    // NOTE: by calling clone we release the mutex which creates a possibility for a race: pending deletion
-    // can complete and remove timeline from the map in between our call to clone
-    // and `DeleteTimelineFlow::run`, so `run` wont find timeline in `timelines` map.
-    // timelines.lock is currently synchronous so we cant hold it across await point.
-    // So just ignore NotFound error if we get it from `run`.
-    // Beware: in case it becomes async and we try to hold it here, `run` also locks it, which can create a deadlock.
-    let timelines = tenant.timelines.lock().unwrap().clone();
-    let sorted =
-        tree_sort_timelines(timelines, |t| t.get_ancestor_timeline_id()).context("tree sort")?;
+/// Grab the layer_removal_cs lock, and actually perform the deletion.
+///
+/// This lock prevents GC or compaction from running at the same time.
+/// The GC task doesn't register itself with the timeline it's operating on,
+/// so it might still be running even though we called `shutdown_tasks`.
+///
+/// Note that there are still other race conditions between
+/// GC, compaction and timeline deletion. See
+/// <https://github.com/neondatabase/neon/issues/2671>
+///
+/// No timeout here, GC & Compaction should be responsive to the
+/// `TimelineState::Stopping` change.
+async fn delete_local_layer_files(
+    conf: &PageServerConf,
+    tenant_id: TenantId,
+    timeline: &Timeline,
+) -> anyhow::Result<()> {
+    info!("waiting for layer_removal_cs.lock()");
+    let layer_removal_guard = timeline.layer_removal_cs.lock().await;
+    info!("got layer_removal_cs.lock(), deleting layer files");

-    let mut already_running_deletions = vec![];
+    // NB: storage_sync upload tasks that reference these layers have been cancelled
+    //     by the caller.

-    for (timeline_id, _) in sorted.into_iter().rev() {
-        if let Err(e) = DeleteTimelineFlow::run(tenant, timeline_id, true).await {
-            match e {
-                DeleteTimelineError::NotFound => {
-                    // Timeline deletion finished after call to clone above but before call
-                    // to `DeleteTimelineFlow::run` and removed timeline from the map.
-                    continue;
-                }
-                DeleteTimelineError::AlreadyInProgress(guard) => {
-                    already_running_deletions.push((guard, timeline_id));
-                    continue;
-                }
-                e => return Err(DeleteTenantError::Timeline(e)),
+    let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id);
+
+    fail::fail_point!("timeline-delete-before-rm", |_| {
+        Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
+    });
+
+    // NB: This need not be atomic because the deleted flag in the IndexPart
+    // will be observed during tenant/timeline load. The deletion will be resumed there.
+    //
+    // For configurations without remote storage, we guarantee crash-safety by persisting the delete mark file.
+    //
+    // Note that here we do not bail out on std::io::ErrorKind::NotFound.
+    // This can happen if we're called a second time, e.g.,
+    // because of a previous failure/cancellation at/after
+    // failpoint timeline-delete-after-rm.
+    //
+    // It can also happen if we race with tenant detach, because,
+    // it doesn't grab the layer_removal_cs lock.
+    //
+    // For now, log and continue.
+    // warn! level is technically not appropriate for the
+    // first case because we should expect retries to happen.
+    // But the error is so rare, it seems better to get attention if it happens.
+    //
+    // Note that metadata removal is skipped, this is not technically needed,
+    // but allows to reuse timeline loading code during resumed deletion.
+    // (we always expect that metadata is in place when timeline is being loaded)
+
+    #[cfg(feature = "testing")]
+    let mut counter = 0;
+
+    // Timeline directory may not exist if we failed to delete mark file and request was retried.
+    if !local_timeline_directory.exists() {
+        return Ok(());
+    }
+
+    let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id);
+
+    for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
+        #[cfg(feature = "testing")]
+        {
+            counter += 1;
+            if counter == 2 {
+                fail::fail_point!("timeline-delete-during-rm", |_| {
+                    Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))?
+                });
            }
        }
-    }

-    Ok(already_running_deletions)
-}
-
-async fn ensure_timelines_dir_empty(timelines_path: &Path) -> Result<(), DeleteTenantError> {
-    // Assert timelines dir is empty.
-    if !fs_ext::is_directory_empty(timelines_path).await? {
-        // Display first 10 items in directory
-        let list = &fs_ext::list_dir(timelines_path).await.context("list_dir")?[..10];
-        return Err(DeleteTenantError::Other(anyhow::anyhow!(
-            "Timelines directory is not empty after all timelines deletion: {list:?}"
-        )));
-    }
-
-    Ok(())
-}
-
-async fn remove_tenant_remote_delete_mark(
-    conf: &PageServerConf,
-    remote_storage: Option<&GenericRemoteStorage>,
-    tenant_id: &TenantId,
-) -> Result<(), DeleteTenantError> {
-    if let Some(remote_storage) = remote_storage {
-        let path = remote_tenant_delete_mark_path(conf, tenant_id)?;
-        backoff::retry(
-            || async { remote_storage.delete(&path).await },
-            |_e| false,
-            FAILED_UPLOAD_WARN_THRESHOLD,
-            FAILED_REMOTE_OP_RETRIES,
-            "remove_tenant_remote_delete_mark",
-        )
-        .await
-        .context("remove_tenant_remote_delete_mark")?;
-    }
-    Ok(())
-}
-
-// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
-async fn cleanup_remaining_fs_traces(
-    conf: &PageServerConf,
-    tenant_id: &TenantId,
-) -> Result<(), DeleteTenantError> {
-    let rm = |p: PathBuf, is_dir: bool| async move {
-        if is_dir {
-            tokio::fs::remove_dir(&p).await
-        } else {
-            tokio::fs::remove_file(&p).await
+        let entry = entry?;
+        if entry.path() == metadata_path {
+            debug!("found metadata, skipping");
+            continue;
        }
-        .or_else(fs_ext::ignore_not_found)
-        .with_context(|| {
-            let to_display = p.display();
-            format!("failed to delete {to_display}")
-        })
+
+        if entry.path() == local_timeline_directory {
+            // Keeping directory because metadata file is still there
+            debug!("found timeline dir itself, skipping");
+            continue;
+        }
+
+        let metadata = match entry.metadata() {
+            Ok(metadata) => metadata,
+            Err(e) => {
+                if crate::is_walkdir_io_not_found(&e) {
+                    warn!(
+                        timeline_dir=?local_timeline_directory,
+                        path=?entry.path().display(),
+                        "got not found err while removing timeline dir, proceeding anyway"
+                    );
+                    continue;
+                }
+                anyhow::bail!(e);
+            }
+        };
+
+        let r = if metadata.is_dir() {
+            // There shouldn't be any directories inside timeline dir as of current layout.
+            tokio::fs::remove_dir(entry.path()).await
+        } else {
+            tokio::fs::remove_file(entry.path()).await
+        };
+
+        if let Err(e) = r {
+            if e.kind() == std::io::ErrorKind::NotFound {
+                warn!(
+                    timeline_dir=?local_timeline_directory,
+                    path=?entry.path().display(),
+                    "got not found err while removing timeline dir, proceeding anyway"
+                );
+                continue;
+            }
+            anyhow::bail!(anyhow::anyhow!(
+                "Failed to remove: {}. Error: {e}",
+                entry.path().display()
+            ));
+        }
+    }
+
+    info!("finished deleting layer files, releasing layer_removal_cs.lock()");
+    drop(layer_removal_guard);
+
+    fail::fail_point!("timeline-delete-after-rm", |_| {
+        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
+    });
+
+    Ok(())
+}
+
+/// Removes remote layers and an index file after them.
+async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
+    if let Some(remote_client) = &timeline.remote_client {
+        remote_client.delete_all().await.context("delete_all")?
    };

-    rm(conf.tenant_config_path(tenant_id), false).await?;
+    Ok(())
+}

-    fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
+// This function removes remaining traces of a timeline on disk.
+// Namely: metadata file, timeline directory, delete mark.
+// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir.
+// delete mark should be present because it is the last step during deletion.
+// (nothing can fail after its deletion)
+async fn cleanup_remaining_timeline_fs_traces(
+    conf: &PageServerConf,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+) -> anyhow::Result<()> {
+    // Remove local metadata
+    tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id))
+        .await
+        .or_else(fs_ext::ignore_not_found)
+        .context("remove metadata")?;
+
+    fail::fail_point!("timeline-delete-after-rm-metadata", |_| {
        Err(anyhow::anyhow!(
-            "failpoint: tenant-delete-before-remove-timelines-dir"
+            "failpoint: timeline-delete-after-rm-metadata"
        ))?
    });

-    rm(conf.timelines_path(tenant_id), true).await?;
+    // Remove timeline dir
+    tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id))
+        .await
+        .or_else(fs_ext::ignore_not_found)
+        .context("timeline dir")?;

-    fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: tenant-delete-before-remove-deleted-mark"
-        ))?
+    fail::fail_point!("timeline-delete-after-rm-dir", |_| {
+        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
    });

-    // Make sure previous deletions are ordered before mark removal.
-    // Otherwise there is no guarantee that they reach the disk before mark deletion.
-    // So its possible for mark to reach disk first and for other deletions
-    // to be reordered later and thus missed if a crash occurs.
-    // Note that we dont need to sync after mark file is removed
-    // because we can tolerate the case when mark file reappears on startup.
-    let tenant_path = &conf.tenant_path(tenant_id);
-    if tenant_path.exists() {
-        crashsafe::fsync_async(&conf.tenant_path(tenant_id))
-            .await
-            .context("fsync_pre_mark_remove")?;
+    // Remove delete mark
+    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
+        .await
+        .context("remove delete mark")
+}
+
+/// It is important that this gets called when DeletionGuard is being held.
+/// For more context see comments in [`DeleteTimelineFlow::prepare`]
+async fn remove_timeline_from_tenant(
+    tenant: &Tenant,
+    timeline_id: TimelineId,
+    _: &DeletionGuard, // using it as a witness
+) -> anyhow::Result<()> {
+    // Remove the timeline from the map.
+    let mut timelines = tenant.timelines.lock().unwrap();
+    let children_exist = timelines
+        .iter()
+        .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
+    // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
+    // We already deleted the layer files, so it's probably best to panic.
+    // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
+    if children_exist {
+        panic!("Timeline grew children while we removed layer files");
    }

-    rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
+    timelines
+        .remove(&timeline_id)
+        .expect("timeline that we were deleting was concurrently removed from 'timelines' map");

-    fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: tenant-delete-before-remove-tenant-dir"
-        ))?
-    });
-
-    rm(conf.tenant_path(tenant_id), true).await?;
+    drop(timelines);

    Ok(())
 }

-pub(crate) async fn remote_delete_mark_exists(
-    conf: &PageServerConf,
-    tenant_id: &TenantId,
-    remote_storage: &GenericRemoteStorage,
-) -> anyhow::Result<bool> {
-    // If remote storage is there we rely on it
-    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id).context("path")?;
-
-    let result = backoff::retry(
-        || async { remote_storage.download(&remote_mark_path).await },
-        |e| matches!(e, DownloadError::NotFound),
-        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
-        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
-        "fetch_tenant_deletion_mark",
-    )
-    .await;
-
-    match result {
-        Ok(_) => Ok(true),
-        Err(DownloadError::NotFound) => Ok(false),
-        Err(e) => Err(anyhow::anyhow!(e)).context("remote_delete_mark_exists")?,
-    }
-}
-
-/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
+/// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures,
 /// and deletes its data from both disk and s3.
 /// The sequence of steps:
-/// 1. Upload remote deletion mark.
+/// 1. Set deleted_at in remote index part.
 /// 2. Create local mark file.
-/// 3. Shutdown tasks
-/// 4. Run ordered timeline deletions
-/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested
-/// 6. Remove remote mark
-/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
+/// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata)
+/// 4. Delete remote layers
+/// 5. Delete index part
+/// 6. Delete meta, timeline directory
+/// 7. Delete mark file
 /// It is resumable from any step in case a crash/restart occurs.
 /// There are three entrypoints to the process:
-/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
-/// 2. [`DeleteTenantFlow::resume`] is called during restarts when local or remote deletion marks are still there.
-/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
+/// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
+/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
+/// and we possibly need to continue deletion of remote files.
+/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
+/// index but still have local metadata, timeline directory and delete mark.
+/// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
 #[derive(Default)]
-pub enum DeleteTenantFlow {
+pub enum DeleteTimelineFlow {
    #[default]
    NotStarted,
    InProgress,
    Finished,
 }

-impl DeleteTenantFlow {
+impl DeleteTimelineFlow {
    // These steps are run in the context of management api request handler.
    // Long running steps are continued to run in the background.
    // NB: If this fails half-way through, and is retried, the retry will go through
    // all the same steps again. Make sure the code here is idempotent, and don't
    // error out if some of the shutdown tasks have already been completed!
-    // NOTE: static needed for background part.
-    // We assume that calling code sets up the span with tenant_id.
-    #[instrument(skip_all)]
-    pub(crate) async fn run(
-        conf: &'static PageServerConf,
-        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
-        tenant_id: TenantId,
-    ) -> Result<(), DeleteTenantError> {
-        span::debug_assert_current_span_has_tenant_id();
+    #[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))]
+    pub async fn run(
+        tenant: &Arc<Tenant>,
+        timeline_id: TimelineId,
+    ) -> Result<(), DeleteTimelineError> {
+        let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;

-        let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;
-
-        if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
-            tenant.set_broken(format!("{e:#}")).await;
-            return Err(e);
-        }
-
-        Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
-
-        Ok(())
-    }
-
-    // Helper function needed to be able to match once on returned error and transition tenant into broken state.
-    // This is needed because tenant.shutwodn is not idempotent. If tenant state is set to stopping another call to tenant.shutdown
-    // will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
-    // So the solution is to set tenant state to broken.
-    async fn run_inner(
-        guard: &mut OwnedMutexGuard<Self>,
-        conf: &'static PageServerConf,
-        remote_storage: Option<&GenericRemoteStorage>,
-        tenant: &Tenant,
-    ) -> Result<(), DeleteTenantError> {
        guard.mark_in_progress()?;

-        fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
+        stop_tasks(&timeline).await?;
+
+        set_deleted_in_remote_index(&timeline).await?;
+
+        create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?;
+
+        fail::fail_point!("timeline-delete-before-schedule", |_| {
            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-create-remote-mark"
+                "failpoint: timeline-delete-before-schedule"
            ))?
        });

-        // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend.
-        // Though sounds scary, different mark name?
-        // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
-        if let Some(remote_storage) = &remote_storage {
-            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id)
-                .await
-                .context("remote_mark")?
-        }
-
-        fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-create-local-mark"
-            ))?
-        });
-
-        create_local_delete_mark(conf, &tenant.tenant_id)
-            .await
-            .context("local delete mark")?;
-
-        fail::fail_point!("tenant-delete-before-background", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-background"
-            ))?
-        });
+        Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);

        Ok(())
    }
@@ -372,148 +395,140 @@ impl DeleteTenantFlow {
        Ok(())
    }

-    pub async fn should_resume_deletion(
-        conf: &'static PageServerConf,
-        remote_storage: Option<&GenericRemoteStorage>,
-        tenant: &Tenant,
-    ) -> Result<Option<DeletionGuard>, DeleteTenantError> {
-        let acquire = |t: &Tenant| {
-            Some(
-                Arc::clone(&t.delete_progress)
-                    .try_lock_owned()
-                    .expect("we're the only owner during init"),
-            )
-        };
-
-        let tenant_id = tenant.tenant_id;
-        // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
-        if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
-            return Ok(acquire(tenant));
-        }
-
-        let remote_storage = match remote_storage {
-            Some(remote_storage) => remote_storage,
-            None => return Ok(None),
-        };
-
-        if remote_delete_mark_exists(conf, &tenant_id, remote_storage).await? {
-            Ok(acquire(tenant))
-        } else {
-            Ok(None)
-        }
-    }
-
-    pub(crate) async fn resume(
-        guard: DeletionGuard,
-        tenant: &Arc<Tenant>,
+    /// Shortcut to create Timeline in stopping state and spawn deletion task.
+    pub async fn resume_deletion(
+        tenant: Arc<Tenant>,
+        timeline_id: TimelineId,
+        local_metadata: &TimelineMetadata,
+        remote_client: Option<RemoteTimelineClient>,
        init_order: Option<&InitializationOrder>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
-        ctx: &RequestContext,
-    ) -> Result<(), DeleteTenantError> {
-        let (_, progress) = completion::channel();
+    ) -> anyhow::Result<()> {
+        // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
+        // RemoteTimelineClient is the only functioning part.
+        let timeline = tenant
+            .create_timeline_struct(
+                timeline_id,
+                local_metadata,
+                None, // Ancestor is not needed for deletion.
+                remote_client,
+                init_order,
+                // Important. We don't pass ancestor above because it can be missing.
+                // Thus we need to skip the validation here.
+                CreateTimelineCause::Delete,
+            )
+            .context("create_timeline_struct")?;

-        tenant
-            .set_stopping(progress, true)
-            .await
-            .expect("cant be stopping or broken");
+        let mut guard = DeletionGuard(
+            Arc::clone(&timeline.delete_progress)
+                .try_lock_owned()
+                .expect("cannot happen because we're the only owner"),
+        );

-        // Do not consume valuable resources during the load phase, continue deletion once init phase is complete.
-        let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
-        if let Some(background) = background_jobs_can_start {
-            info!("waiting for backgound jobs barrier");
-            background.clone().wait().await;
-            info!("ready for backgound jobs barrier");
+        // We need to do this because when console retries delete request we shouldn't answer with 404
+        // because 404 means successful deletion.
+        {
+            let mut locked = tenant.timelines.lock().unwrap();
+            locked.insert(timeline_id, Arc::clone(&timeline));
        }

-        // Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e g remove timelines dir)
-        let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
-        if timelines_path.exists() {
-            tenant.load(init_order, ctx).await.context("load")?;
-        }
+        guard.mark_in_progress()?;

-        Self::background(
-            guard,
-            tenant.conf,
-            tenant.remote_storage.clone(),
-            tenants,
-            tenant,
-        )
-        .await
+        // Note that delete mark can be missing on resume
+        // because we create delete mark after we set deleted_at in the index part.
+        create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?;
+
+        Self::schedule_background(guard, tenant.conf, tenant, timeline);
+
+        Ok(())
    }

-    async fn prepare(
-        tenants: &tokio::sync::RwLock<TenantsMap>,
-        tenant_id: TenantId,
-    ) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
-        let m = tenants.read().await;
+    pub async fn cleanup_remaining_timeline_fs_traces(
+        tenant: &Tenant,
+        timeline_id: TimelineId,
+    ) -> anyhow::Result<()> {
+        cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await
+    }

-        let tenant = m
-            .get(&tenant_id)
-            .ok_or(GetTenantError::NotFound(tenant_id))?;
+    fn prepare(
+        tenant: &Tenant,
+        timeline_id: TimelineId,
+    ) -> Result<(Arc<Timeline>, DeletionGuard), DeleteTimelineError> {
+        // Note the interaction between this guard and deletion guard.
+        // Here we attempt to lock deletion guard when we're holding a lock on timelines.
+        // This is important because when you take into account `remove_timeline_from_tenant`
+        // we remove timeline from memory when we still hold the deletion guard.
+        // So here when timeline deletion is finished timeline won't be present in timelines map at all
+        // which makes the following sequence impossible:
+        // T1: get preempted right before the try_lock on `Timeline::delete_progress`
+        // T2: do a full deletion, acquire and drop `Timeline::delete_progress`
+        // T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
+        // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
+        let timelines = tenant.timelines.lock().unwrap();

-        // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
-        // so at least for now allow deletions only for active tenants. TODO recheck
-        // Broken and Stopping is needed for retries.
-        if !matches!(
-            tenant.current_state(),
-            TenantState::Active | TenantState::Broken { .. }
-        ) {
-            return Err(DeleteTenantError::InvalidState(tenant.current_state()));
+        let timeline = match timelines.get(&timeline_id) {
+            Some(t) => t,
+            None => return Err(DeleteTimelineError::NotFound),
+        };
+
+        // Ensure that there are no child timelines **attached to that pageserver**,
+        // because detach removes files, which will break child branches
+        let children: Vec<TimelineId> = timelines
+            .iter()
+            .filter_map(|(id, entry)| {
+                if entry.get_ancestor_timeline_id() == Some(timeline_id) {
+                    Some(*id)
+                } else {
+                    None
+                }
+            })
+            .collect();
+
+        if !children.is_empty() {
+            return Err(DeleteTimelineError::HasChildren(children));
        }

-        let guard = Arc::clone(&tenant.delete_progress)
-            .try_lock_owned()
-            .map_err(|_| DeleteTenantError::AlreadyInProgress)?;
+        // Note that using try_lock here is important to avoid a deadlock.
+        // Here we take lock on timelines and then the deletion guard.
+        // At the end of the operation we're holding the guard and need to lock timelines map
+        // to remove the timeline from it.
+        // Whenever two locks are taken in different orders, a deadlock can result.
+        let delete_lock_guard = DeletionGuard(
+            Arc::clone(&timeline.delete_progress)
+                .try_lock_owned()
+                .map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
+        );

-        fail::fail_point!("tenant-delete-before-shutdown", |_| {
-            Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
-        });
+        timeline.set_state(TimelineState::Stopping);

-        // make pageserver shutdown not to wait for our completion
-        let (_, progress) = completion::channel();
-
-        // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent.
-        // i e it is an error to do:
-        // tenant.set_stopping
-        // tenant.shutdown
-        // Its also bad that we're holding tenants.read here.
-        // TODO relax set_stopping to be idempotent?
-        if tenant.shutdown(progress, false).await.is_err() {
-            return Err(DeleteTenantError::Other(anyhow::anyhow!(
-                "tenant shutdown is already in progress"
-            )));
-        }
-
-        Ok((Arc::clone(tenant), guard))
+        Ok((Arc::clone(timeline), delete_lock_guard))
    }

    fn schedule_background(
-        guard: OwnedMutexGuard<Self>,
+        guard: DeletionGuard,
        conf: &'static PageServerConf,
-        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        tenant: Arc<Tenant>,
+        timeline: Arc<Timeline>,
    ) {
-        let tenant_id = tenant.tenant_id;
+        let tenant_id = timeline.tenant_id;
+        let timeline_id = timeline.timeline_id;

        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            TaskKind::TimelineDeletionWorker,
            Some(tenant_id),
-            None,
-            "tenant_delete",
+            Some(timeline_id),
+            "timeline_delete",
            false,
            async move {
-                if let Err(err) =
-                    Self::background(guard, conf, remote_storage, tenants, &tenant).await
-                {
+                if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
                    error!("Error: {err:#}");
-                    tenant.set_broken(format!("{err:#}")).await;
+                    timeline.set_broken(format!("{err:#}"))
                };
                Ok(())
            }
            .instrument({
-                let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id);
+                let span =
+                    tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
                span.follows_from(Span::current());
                span
            }),
@@ -521,64 +536,39 @@ impl DeleteTenantFlow {
    }

    async fn background(
-        mut guard: OwnedMutexGuard<Self>,
+        mut guard: DeletionGuard,
        conf: &PageServerConf,
-        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
-        tenant: &Arc<Tenant>,
-    ) -> Result<(), DeleteTenantError> {
-        // Tree sort timelines, schedule delete for them. Mention retries from the console side.
-        // Note that if deletion fails we dont mark timelines as broken,
-        // the whole tenant will become broken as by `Self::schedule_background` logic
-        let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
-            .await
-            .context("schedule_ordered_timeline_deletions")?;
+        tenant: &Tenant,
+        timeline: &Timeline,
+    ) -> Result<(), DeleteTimelineError> {
+        delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;

-        fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-polling-ongoing-deletions"
-            ))?
-        });
+        delete_remote_layers_and_index(timeline).await?;

-        // Wait for deletions that were already running at the moment when tenant deletion was requested.
-        // When we can lock deletion guard it means that corresponding timeline deletion finished.
-        for (guard, timeline_id) in already_running_timeline_deletions {
-            let flow = guard.lock().await;
-            if !flow.is_finished() {
-                return Err(DeleteTenantError::Other(anyhow::anyhow!(
-                    "already running timeline deletion failed: {timeline_id}"
-                )));
-            }
-        }
+        pausable_failpoint!("in_progress_delete");

-        let timelines_path = conf.timelines_path(&tenant.tenant_id);
-        // May not exist if we fail in cleanup_remaining_fs_traces after removing it
-        if timelines_path.exists() {
-            // sanity check to guard against layout changes
-            ensure_timelines_dir_empty(&timelines_path)
-                .await
-                .context("timelines dir not empty")?;
-        }
+        cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?;

-        remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?;
+        remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;

-        fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
-            ))?
-        });
-
-        cleanup_remaining_fs_traces(conf, &tenant.tenant_id)
-            .await
-            .context("cleanup_remaining_fs_traces")?;
-
-        let mut locked = tenants.write().await;
-        if locked.remove(&tenant.tenant_id).is_none() {
-            warn!("Tenant got removed from tenants map during deletion");
-        };
-
-        *guard = Self::Finished;
+        *guard.0 = Self::Finished;

        Ok(())
    }
 }
+
+struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
+
+impl Deref for DeletionGuard {
+    type Target = DeleteTimelineFlow;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl DerefMut for DeletionGuard {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -20,7 +20,6 @@
 //!
 use byteorder::{ReadBytesExt, BE};
 use bytes::{BufMut, Bytes, BytesMut};
-use either::Either;
 use hex;
 use std::{cmp::Ordering, io, result};
 use thiserror::Error;
@@ -231,15 +230,14 @@ where
    ///
    /// Read the value for given key. Returns the value, or None if it doesn't exist.
    ///
-    pub async fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
+    pub fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
        let mut result: Option<u64> = None;
        self.visit(search_key, VisitDirection::Forwards, |key, value| {
            if key == search_key {
                result = Some(value);
            }
            false
-        })
-        .await?;
+        })?;
        Ok(result)
    }

@@ -248,7 +246,7 @@ where
    /// will be called for every key >= 'search_key' (or <= 'search_key', if scanning
    /// backwards)
    ///
-    pub async fn visit<V>(
+    pub fn visit<V>(
        &self,
        search_key: &[u8; L],
        dir: VisitDirection,
@@ -257,77 +255,77 @@ where
    where
        V: FnMut(&[u8], u64) -> bool,
    {
-        let mut stack = Vec::new();
-        stack.push((self.root_blk, None));
-        while let Some((node_blknum, opt_iter)) = stack.pop() {
-            // Locate the node.
-            let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?;
+        self.search_recurse(self.root_blk, search_key, dir, &mut visitor)
+    }

-            let node = OnDiskNode::deparse(node_buf.as_ref())?;
-            let prefix_len = node.prefix_len as usize;
-            let suffix_len = node.suffix_len as usize;
+    fn search_recurse<V>(
+        &self,
+        node_blknum: u32,
+        search_key: &[u8; L],
+        dir: VisitDirection,
+        visitor: &mut V,
+    ) -> Result<bool>
+    where
+        V: FnMut(&[u8], u64) -> bool,
+    {
+        // Locate the node.
+        let blk = self.reader.read_blk(self.start_blk + node_blknum)?;

-            assert!(node.num_children > 0);
+        // Search all entries on this node
+        self.search_node(blk.as_ref(), search_key, dir, visitor)
+    }

-            let mut keybuf = Vec::new();
-            keybuf.extend(node.prefix);
-            keybuf.resize(prefix_len + suffix_len, 0);
+    fn search_node<V>(
+        &self,
+        node_buf: &[u8],
+        search_key: &[u8; L],
+        dir: VisitDirection,
+        visitor: &mut V,
+    ) -> Result<bool>
+    where
+        V: FnMut(&[u8], u64) -> bool,
+    {
+        let node = OnDiskNode::deparse(node_buf)?;
+        let prefix_len = node.prefix_len as usize;
+        let suffix_len = node.suffix_len as usize;

-            let mut iter = if let Some(iter) = opt_iter {
-                iter
-            } else if dir == VisitDirection::Forwards {
-                // Locate the first match
-                let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
-                    Ok(idx) => idx,
-                    Err(idx) => {
-                        if node.level == 0 {
-                            // Imagine that the node contains the following keys:
-                            //
-                            // 1
-                            // 3  <-- idx
-                            // 5
-                            //
-                            // If the search key is '2' and there is exact match,
-                            // the binary search would return the index of key
-                            // '3'. That's cool, '3' is the first key to return.
-                            idx
-                        } else {
-                            // This is an internal page, so each key represents a lower
-                            // bound for what's in the child page. If there is no exact
-                            // match, we have to return the *previous* entry.
-                            //
-                            // 1  <-- return this
-                            // 3  <-- idx
-                            // 5
-                            idx.saturating_sub(1)
-                        }
-                    }
-                };
-                Either::Left(idx..node.num_children.into())
-            } else {
-                let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
-                    Ok(idx) => {
-                        // Exact match. That's the first entry to return, and walk
-                        // backwards from there.
+        assert!(node.num_children > 0);
+
+        let mut keybuf = Vec::new();
+        keybuf.extend(node.prefix);
+        keybuf.resize(prefix_len + suffix_len, 0);
+
+        if dir == VisitDirection::Forwards {
+            // Locate the first match
+            let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
+                Ok(idx) => idx,
+                Err(idx) => {
+                    if node.level == 0 {
+                        // Imagine that the node contains the following keys:
+                        //
+                        // 1
+                        // 3  <-- idx
+                        // 5
+                        //
+                        // If the search key is '2' and there is no exact match,
+                        // the binary search would return the index of key
+                        // '3'. That's cool, '3' is the first key to return.
                        idx
+                    } else {
+                        // This is an internal page, so each key represents a lower
+                        // bound for what's in the child page. If there is no exact
+                        // match, we have to return the *previous* entry.
+                        //
+                        // 1  <-- return this
+                        // 3  <-- idx
+                        // 5
+                        idx.saturating_sub(1)
                    }
-                    Err(idx) => {
-                        // No exact match. The binary search returned the index of the
-                        // first key that's > search_key. Back off by one, and walk
-                        // backwards from there.
-                        if let Some(idx) = idx.checked_sub(1) {
-                            idx
-                        } else {
-                            return Ok(false);
-                        }
-                    }
-                };
-                Either::Right((0..=idx).rev())
+                }
            };
-
            // idx points to the first match now. Keep going from there
-            while let Some(idx) = iter.next() {
-                let key_off = idx * suffix_len;
+            let mut key_off = idx * suffix_len;
+            while idx < node.num_children as usize {
                let suffix = &node.keys[key_off..key_off + suffix_len];
                keybuf[prefix_len..].copy_from_slice(suffix);
                let value = node.value(idx);
@@ -338,8 +336,52 @@ where
                        return Ok(false);
                    }
                } else {
-                    stack.push((node_blknum, Some(iter)));
-                    stack.push((value.to_blknum(), None));
+                    #[allow(clippy::collapsible_if)]
+                    if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? {
+                        return Ok(false);
+                    }
+                }
+                idx += 1;
+                key_off += suffix_len;
+            }
+        } else {
+            let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
+                Ok(idx) => {
+                    // Exact match. That's the first entry to return, and walk
+                    // backwards from there. (The loop below starts from 'idx -
+                    // 1', so add one here to compensate.)
+                    idx + 1
+                }
+                Err(idx) => {
+                    // No exact match. The binary search returned the index of the
+                    // first key that's > search_key. Back off by one, and walk
+                    // backwards from there. (The loop below starts from idx - 1,
+                    // so we don't need to subtract one here)
+                    idx
+                }
+            };
+
+            // idx points to the first match + 1 now. Keep going from there.
+            let mut key_off = idx * suffix_len;
+            while idx > 0 {
+                idx -= 1;
+                key_off -= suffix_len;
+                let suffix = &node.keys[key_off..key_off + suffix_len];
+                keybuf[prefix_len..].copy_from_slice(suffix);
+                let value = node.value(idx);
+                #[allow(clippy::collapsible_if)]
+                if node.level == 0 {
+                    // leaf
+                    if !visitor(&keybuf, value.to_u64()) {
+                        return Ok(false);
+                    }
+                } else {
+                    #[allow(clippy::collapsible_if)]
+                    if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? {
+                        return Ok(false);
+                    }
+                }
+                if idx == 0 {
                    break;
                }
            }
@@ -348,42 +390,39 @@ where
    }

    #[allow(dead_code)]
-    pub async fn dump(&self) -> Result<()> {
-        let mut stack = Vec::new();
+    pub fn dump(&self) -> Result<()> {
+        self.dump_recurse(self.root_blk, &[], 0)
+    }

-        stack.push((self.root_blk, String::new(), 0, 0, 0));
+    fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> {
+        let blk = self.reader.read_blk(self.start_blk + blknum)?;
+        let buf: &[u8] = blk.as_ref();

-        while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
-            let blk = self.reader.read_blk(self.start_blk + blknum)?;
-            let buf: &[u8] = blk.as_ref();
-            let node = OnDiskNode::<L>::deparse(buf)?;
+        let node = OnDiskNode::<L>::deparse(buf)?;

-            if child_idx == 0 {
-                print!("{:indent$}", "", indent = depth * 2);
-                let path_prefix = stack
-                    .iter()
-                    .map(|(_blknum, path, ..)| path.as_str())
-                    .collect::<String>();
-                println!(
-                    "blk #{blknum}: path {path_prefix}{path}: prefix {}, suffix_len {}",
-                    hex::encode(node.prefix),
-                    node.suffix_len
-                );
-            }
+        print!("{:indent$}", "", indent = depth * 2);
+        println!(
+            "blk #{}: path {}: prefix {}, suffix_len {}",
+            blknum,
+            hex::encode(path),
+            hex::encode(node.prefix),
+            node.suffix_len
+        );

-            if child_idx + 1 < node.num_children {
-                let key_off = key_off + node.suffix_len as usize;
-                stack.push((blknum, path.clone(), depth, child_idx + 1, key_off));
-            }
+        let mut idx = 0;
+        let mut key_off = 0;
+        while idx < node.num_children {
            let key = &node.keys[key_off..key_off + node.suffix_len as usize];
-            let val = node.value(child_idx as usize);
-
+            let val = node.value(idx as usize);
            print!("{:indent$}", "", indent = depth * 2 + 2);
            println!("{}: {}", hex::encode(key), hex::encode(val.0));

            if node.level > 0 {
-                stack.push((val.to_blknum(), hex::encode(node.prefix), depth + 1, 0, 0));
+                let child_path = [path, node.prefix].concat();
+                self.dump_recurse(val.to_blknum(), &child_path, depth + 1)?;
            }
+            idx += 1;
+            key_off += node.suffix_len as usize;
        }
        Ok(())
    }
@@ -685,7 +724,6 @@ impl<const L: usize> BuildNode<L> {
 #[cfg(test)]
 mod tests {
    use super::*;
-    use crate::tenant::block_io::BlockLease;
    use rand::Rng;
    use std::collections::BTreeMap;
    use std::sync::atomic::{AtomicUsize, Ordering};
@@ -700,10 +738,12 @@ mod tests {
        }
    }
    impl BlockReader for TestDisk {
-        fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
+        type BlockLease = std::rc::Rc<[u8; PAGE_SZ]>;
+
+        fn read_blk(&self, blknum: u32) -> io::Result<Self::BlockLease> {
            let mut buf = [0u8; PAGE_SZ];
            buf.copy_from_slice(&self.blocks[blknum as usize]);
-            Ok(std::rc::Rc::new(buf).into())
+            Ok(std::rc::Rc::new(buf))
        }
    }
    impl BlockWriter for &mut TestDisk {
@@ -714,8 +754,8 @@ mod tests {
        }
    }

-    #[tokio::test]
-    async fn basic() -> Result<()> {
+    #[test]
+    fn basic() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);

@@ -735,16 +775,16 @@ mod tests {

        let reader = DiskBtreeReader::new(0, root_offset, disk);

-        reader.dump().await?;
+        reader.dump()?;

        // Test the `get` function on all the keys.
        for (key, val) in all_data.iter() {
-            assert_eq!(reader.get(key).await?, Some(*val));
+            assert_eq!(reader.get(key)?, Some(*val));
        }
        // And on some keys that don't exist
-        assert_eq!(reader.get(b"aaaaaa").await?, None);
-        assert_eq!(reader.get(b"zzzzzz").await?, None);
-        assert_eq!(reader.get(b"xaaabx").await?, None);
+        assert_eq!(reader.get(b"aaaaaa")?, None);
+        assert_eq!(reader.get(b"zzzzzz")?, None);
+        assert_eq!(reader.get(b"xaaabx")?, None);

        // Test search with `visit` function
        let search_key = b"xabaaa";
@@ -755,12 +795,10 @@ mod tests {
            .collect();

        let mut data = Vec::new();
-        reader
-            .visit(search_key, VisitDirection::Forwards, |key, value| {
-                data.push((key.to_vec(), value));
-                true
-            })
-            .await?;
+        reader.visit(search_key, VisitDirection::Forwards, |key, value| {
+            data.push((key.to_vec(), value));
+            true
+        })?;
        assert_eq!(data, expected);

        // Test a backwards scan
@@ -771,20 +809,16 @@ mod tests {
            .collect();
        expected.reverse();
        let mut data = Vec::new();
-        reader
-            .visit(search_key, VisitDirection::Backwards, |key, value| {
-                data.push((key.to_vec(), value));
-                true
-            })
-            .await?;
+        reader.visit(search_key, VisitDirection::Backwards, |key, value| {
+            data.push((key.to_vec(), value));
+            true
+        })?;
        assert_eq!(data, expected);

        // Backward scan where nothing matches
-        reader
-            .visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
-                panic!("found unexpected key {}: {}", hex::encode(key), value);
-            })
-            .await?;
+        reader.visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
+            panic!("found unexpected key {}: {}", hex::encode(key), value);
+        })?;

        // Full scan
        let expected: Vec<(Vec<u8>, u64)> = all_data
@@ -792,19 +826,17 @@ mod tests {
            .map(|(key, value)| (key.to_vec(), *value))
            .collect();
        let mut data = Vec::new();
-        reader
-            .visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
-                data.push((key.to_vec(), value));
-                true
-            })
-            .await?;
+        reader.visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
+            data.push((key.to_vec(), value));
+            true
+        })?;
        assert_eq!(data, expected);

        Ok(())
    }

-    #[tokio::test]
-    async fn lots_of_keys() -> Result<()> {
+    #[test]
+    fn lots_of_keys() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);

@@ -824,7 +856,7 @@ mod tests {

        let reader = DiskBtreeReader::new(0, root_offset, disk);

-        reader.dump().await?;
+        reader.dump()?;

        use std::sync::Mutex;

@@ -845,15 +877,13 @@ mod tests {
        for search_key_int in 0..(NUM_KEYS * 2 + 10) {
            let search_key = u64::to_be_bytes(search_key_int);
            assert_eq!(
-                reader.get(&search_key).await?,
+                reader.get(&search_key)?,
                all_data.get(&search_key_int).cloned()
            );

            // Test a forward scan starting with this key
            result.lock().unwrap().clear();
-            reader
-                .visit(&search_key, VisitDirection::Forwards, take_ten)
-                .await?;
+            reader.visit(&search_key, VisitDirection::Forwards, take_ten)?;
            let expected = all_data
                .range(search_key_int..)
                .take(10)
@@ -863,9 +893,7 @@ mod tests {

            // And a backwards scan
            result.lock().unwrap().clear();
-            reader
-                .visit(&search_key, VisitDirection::Backwards, take_ten)
-                .await?;
+            reader.visit(&search_key, VisitDirection::Backwards, take_ten)?;
            let expected = all_data
                .range(..=search_key_int)
                .rev()
@@ -879,9 +907,7 @@ mod tests {
        let search_key = u64::to_be_bytes(0);
        limit.store(usize::MAX, Ordering::Relaxed);
        result.lock().unwrap().clear();
-        reader
-            .visit(&search_key, VisitDirection::Forwards, take_ten)
-            .await?;
+        reader.visit(&search_key, VisitDirection::Forwards, take_ten)?;
        let expected = all_data
            .iter()
            .map(|(&key, &val)| (key, val))
@@ -892,9 +918,7 @@ mod tests {
        let search_key = u64::to_be_bytes(u64::MAX);
        limit.store(usize::MAX, Ordering::Relaxed);
        result.lock().unwrap().clear();
-        reader
-            .visit(&search_key, VisitDirection::Backwards, take_ten)
-            .await?;
+        reader.visit(&search_key, VisitDirection::Backwards, take_ten)?;
        let expected = all_data
            .iter()
            .rev()
@@ -905,8 +929,8 @@ mod tests {
        Ok(())
    }

-    #[tokio::test]
-    async fn random_data() -> Result<()> {
+    #[test]
+    fn random_data() -> Result<()> {
        // Generate random keys with exponential distribution, to
        // exercise the prefix compression
        const NUM_KEYS: usize = 100000;
@@ -933,23 +957,19 @@ mod tests {
        // Test get() operation on all the keys
        for (&key, &val) in all_data.iter() {
            let search_key = u128::to_be_bytes(key);
-            assert_eq!(reader.get(&search_key).await?, Some(val));
+            assert_eq!(reader.get(&search_key)?, Some(val));
        }

        // Test get() operations on random keys, most of which will not exist
        for _ in 0..100000 {
            let key_int = rand::thread_rng().gen::<u128>();
            let search_key = u128::to_be_bytes(key_int);
-            assert!(reader.get(&search_key).await? == all_data.get(&key_int).cloned());
+            assert!(reader.get(&search_key)? == all_data.get(&key_int).cloned());
        }

        // Test boundary cases
-        assert!(
-            reader.get(&u128::to_be_bytes(u128::MIN)).await? == all_data.get(&u128::MIN).cloned()
-        );
-        assert!(
-            reader.get(&u128::to_be_bytes(u128::MAX)).await? == all_data.get(&u128::MAX).cloned()
-        );
+        assert!(reader.get(&u128::to_be_bytes(u128::MIN))? == all_data.get(&u128::MIN).cloned());
+        assert!(reader.get(&u128::to_be_bytes(u128::MAX))? == all_data.get(&u128::MAX).cloned());

        Ok(())
    }
@@ -974,8 +994,8 @@ mod tests {
    ///
    /// This test contains a particular data set, see disk_btree_test_data.rs
    ///
-    #[tokio::test]
-    async fn particular_data() -> Result<()> {
+    #[test]
+    fn particular_data() -> Result<()> {
        // Build a tree from it
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
@@ -991,20 +1011,18 @@ mod tests {

        // Test get() operation on all the keys
        for (key, val) in disk_btree_test_data::TEST_DATA {
-            assert_eq!(reader.get(&key).await?, Some(val));
+            assert_eq!(reader.get(&key)?, Some(val));
        }

        // Test full scan
        let mut count = 0;
-        reader
-            .visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
-                count += 1;
-                true
-            })
-            .await?;
+        reader.visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
+            count += 1;
+            true
+        })?;
        assert_eq!(count, disk_btree_test_data::TEST_DATA.len());

-        reader.dump().await?;
+        reader.dump()?;

        Ok(())
    }
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -4,7 +4,7 @@
 use crate::config::PageServerConf;
 use crate::page_cache::{self, ReadBufResult, WriteBufResult, PAGE_SZ};
 use crate::tenant::blob_io::BlobWriter;
-use crate::tenant::block_io::{BlockLease, BlockReader};
+use crate::tenant::block_io::BlockReader;
 use crate::virtual_file::VirtualFile;
 use once_cell::sync::Lazy;
 use std::cmp::min;
@@ -12,39 +12,31 @@ use std::collections::HashMap;
 use std::fs::OpenOptions;
 use std::io::{self, ErrorKind};
 use std::ops::DerefMut;
-use std::os::unix::prelude::FileExt;
 use std::path::PathBuf;
 use std::sync::{Arc, RwLock};
 use tracing::*;
 use utils::id::{TenantId, TimelineId};

+use std::os::unix::fs::FileExt;
+
 ///
 /// This is the global cache of file descriptors (File objects).
 ///
 static EPHEMERAL_FILES: Lazy<RwLock<EphemeralFiles>> = Lazy::new(|| {
    RwLock::new(EphemeralFiles {
-        next_file_id: FileId(1),
+        next_file_id: 1,
        files: HashMap::new(),
    })
 });

-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub struct FileId(u64);
-
-impl std::fmt::Display for FileId {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.0)
-    }
-}
-
 pub struct EphemeralFiles {
-    next_file_id: FileId,
+    next_file_id: u64,

-    files: HashMap<FileId, Arc<VirtualFile>>,
+    files: HashMap<u64, Arc<VirtualFile>>,
 }

 pub struct EphemeralFile {
-    file_id: FileId,
+    file_id: u64,
    _tenant_id: TenantId,
    _timeline_id: TimelineId,
    file: Arc<VirtualFile>,
@@ -60,7 +52,7 @@ impl EphemeralFile {
    ) -> Result<EphemeralFile, io::Error> {
        let mut l = EPHEMERAL_FILES.write().unwrap();
        let file_id = l.next_file_id;
-        l.next_file_id = FileId(l.next_file_id.0 + 1);
+        l.next_file_id += 1;

        let filename = conf
            .timeline_path(&tenant_id, &timeline_id)
@@ -102,10 +94,7 @@ impl EphemeralFile {
        Ok(())
    }

-    fn get_buf_for_write(
-        &self,
-        blkno: u32,
-    ) -> Result<page_cache::PageWriteGuard<'static>, io::Error> {
+    fn get_buf_for_write(&self, blkno: u32) -> Result<page_cache::PageWriteGuard, io::Error> {
        // Look up the right page
        let cache = page_cache::get();
        let mut write_guard = match cache
@@ -138,79 +127,121 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
    }
 }

+impl FileExt for EphemeralFile {
+    fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, io::Error> {
+        // Look up the right page
+        let blkno = (offset / PAGE_SZ as u64) as u32;
+        let off = offset as usize % PAGE_SZ;
+        let len = min(PAGE_SZ - off, dstbuf.len());
+
+        let read_guard;
+        let mut write_guard;
+
+        let cache = page_cache::get();
+        let buf = match cache
+            .read_ephemeral_buf(self.file_id, blkno)
+            .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
+        {
+            ReadBufResult::Found(guard) => {
+                read_guard = guard;
+                read_guard.as_ref()
+            }
+            ReadBufResult::NotFound(guard) => {
+                // Read the page from disk into the buffer
+                write_guard = guard;
+                self.fill_buffer(write_guard.deref_mut(), blkno)?;
+                write_guard.mark_valid();
+
+                // And then fall through to read the requested slice from the
+                // buffer.
+                write_guard.as_ref()
+            }
+        };
+
+        dstbuf[0..len].copy_from_slice(&buf[off..(off + len)]);
+        Ok(len)
+    }
+
+    fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result<usize, io::Error> {
+        // Look up the right page
+        let blkno = (offset / PAGE_SZ as u64) as u32;
+        let off = offset as usize % PAGE_SZ;
+        let len = min(PAGE_SZ - off, srcbuf.len());
+
+        let mut write_guard;
+        let cache = page_cache::get();
+        let buf = match cache
+            .write_ephemeral_buf(self.file_id, blkno)
+            .map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
+        {
+            WriteBufResult::Found(guard) => {
+                write_guard = guard;
+                write_guard.deref_mut()
+            }
+            WriteBufResult::NotFound(guard) => {
+                // Read the page from disk into the buffer
+                // TODO: if we're overwriting the whole page, no need to read it in first
+                write_guard = guard;
+                self.fill_buffer(write_guard.deref_mut(), blkno)?;
+                write_guard.mark_valid();
+
+                // And then fall through to modify it.
+                write_guard.deref_mut()
+            }
+        };
+
+        buf[off..(off + len)].copy_from_slice(&srcbuf[0..len]);
+        write_guard.mark_dirty();
+        Ok(len)
+    }
+}
+
 impl BlobWriter for EphemeralFile {
    fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
-        struct Writer<'a> {
-            ephemeral_file: &'a mut EphemeralFile,
-            /// The block to which the next [`push_bytes`] will write.
-            blknum: u32,
-            /// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write.
-            off: usize,
-            /// Used by [`push_bytes`] to memoize the page cache write guard across calls to it.
-            memo_page_guard: MemoizedPageWriteGuard,
-        }
-        struct MemoizedPageWriteGuard {
-            guard: page_cache::PageWriteGuard<'static>,
-            /// The block number of the page in `guard`.
-            blknum: u32,
-        }
-        impl<'a> Writer<'a> {
-            fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
-                let blknum = (ephemeral_file.size / PAGE_SZ as u64) as u32;
-                Ok(Writer {
-                    blknum,
-                    off: (ephemeral_file.size % PAGE_SZ as u64) as usize,
-                    memo_page_guard: MemoizedPageWriteGuard {
-                        guard: ephemeral_file.get_buf_for_write(blknum)?,
-                        blknum,
-                    },
-                    ephemeral_file,
-                })
-            }
-            #[inline(always)]
-            fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
-                // `src_remaining` is the remaining bytes to be written
-                let mut src_remaining = src;
-                while !src_remaining.is_empty() {
-                    let page = if self.memo_page_guard.blknum == self.blknum {
-                        &mut self.memo_page_guard.guard
-                    } else {
-                        self.memo_page_guard.guard =
-                            self.ephemeral_file.get_buf_for_write(self.blknum)?;
-                        self.memo_page_guard.blknum = self.blknum;
-                        &mut self.memo_page_guard.guard
-                    };
-                    let dst_remaining = &mut page[self.off..];
-                    let n = min(dst_remaining.len(), src_remaining.len());
-                    dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
-                    self.off += n;
-                    src_remaining = &src_remaining[n..];
-                    if self.off == PAGE_SZ {
-                        // This block is done, move to next one.
-                        self.blknum += 1;
-                        self.off = 0;
-                    }
-                }
-                Ok(())
-            }
-        }
-
        let pos = self.size;
-        let mut writer = Writer::new(self)?;
+
+        let mut blknum = (self.size / PAGE_SZ as u64) as u32;
+        let mut off = (pos % PAGE_SZ as u64) as usize;
+
+        let mut buf = self.get_buf_for_write(blknum)?;

        // Write the length field
        if srcbuf.len() < 0x80 {
-            // short one-byte length header
-            let len_buf = [srcbuf.len() as u8];
-            writer.push_bytes(&len_buf)?;
+            buf[off] = srcbuf.len() as u8;
+            off += 1;
        } else {
            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
            len_buf[0] |= 0x80;
-            writer.push_bytes(&len_buf)?;
+            let thislen = PAGE_SZ - off;
+            if thislen < 4 {
+                // it needs to be split across pages
+                buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]);
+                blknum += 1;
+                buf = self.get_buf_for_write(blknum)?;
+                buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]);
+                off = 4 - thislen;
+            } else {
+                buf[off..off + 4].copy_from_slice(&len_buf);
+                off += 4;
+            }
        }

        // Write the payload
-        writer.push_bytes(srcbuf)?;
+        let mut buf_remain = srcbuf;
+        while !buf_remain.is_empty() {
+            let mut page_remain = PAGE_SZ - off;
+            if page_remain == 0 {
+                blknum += 1;
+                buf = self.get_buf_for_write(blknum)?;
+                off = 0;
+                page_remain = PAGE_SZ;
+            }
+            let this_blk_len = min(page_remain, buf_remain.len());
+            buf[off..(off + this_blk_len)].copy_from_slice(&buf_remain[..this_blk_len]);
+            off += this_blk_len;
+            buf_remain = &buf_remain[this_blk_len..];
+        }
+        drop(buf);

        if srcbuf.len() < 0x80 {
            self.size += 1;
@@ -235,22 +266,16 @@ impl Drop for EphemeralFile {
        // unlink the file
        let res = std::fs::remove_file(&self.file.path);
        if let Err(e) = res {
-            if e.kind() != std::io::ErrorKind::NotFound {
-                // just never log the not found errors, we cannot do anything for them; on detach
-                // the tenant directory is already gone.
-                //
-                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
-                error!(
-                    "could not remove ephemeral file '{}': {}",
-                    self.file.path.display(),
-                    e
-                );
-            }
+            warn!(
+                "could not remove ephemeral file '{}': {}",
+                self.file.path.display(),
+                e
+            );
        }
    }
 }

-pub fn writeback(file_id: FileId, blkno: u32, buf: &[u8]) -> Result<(), io::Error> {
+pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> {
    if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
        match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) {
            Ok(_) => Ok(()),
@@ -272,7 +297,9 @@ pub fn writeback(file_id: FileId, blkno: u32, buf: &[u8]) -> Result<(), io::Erro
 }

 impl BlockReader for EphemeralFile {
-    fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
+    type BlockLease = page_cache::PageReadGuard<'static>;
+
+    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, io::Error> {
        // Look up the right page
        let cache = page_cache::get();
        loop {
@@ -280,7 +307,7 @@ impl BlockReader for EphemeralFile {
                .read_ephemeral_buf(self.file_id, blknum)
                .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
            {
-                ReadBufResult::Found(guard) => return Ok(guard.into()),
+                ReadBufResult::Found(guard) => return Ok(guard),
                ReadBufResult::NotFound(mut write_guard) => {
                    // Read the page from disk into the buffer
                    self.fill_buffer(write_guard.deref_mut(), blknum)?;
@@ -303,7 +330,7 @@ mod tests {
    use super::*;
    use crate::tenant::blob_io::BlobWriter;
    use crate::tenant::block_io::BlockCursor;
-    use rand::{thread_rng, RngCore};
+    use rand::{seq::SliceRandom, thread_rng, RngCore};
    use std::fs;
    use std::str::FromStr;

@@ -324,26 +351,61 @@ mod tests {
        Ok((conf, tenant_id, timeline_id))
    }

-    #[tokio::test]
-    async fn test_ephemeral_blobs() -> Result<(), io::Error> {
+    // Helper function to slurp contents of a file, starting at the current position,
+    // into a string
+    fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result<String, io::Error> {
+        let mut buf = Vec::new();
+        buf.resize(len, 0u8);
+
+        efile.read_exact_at(&mut buf, offset)?;
+
+        Ok(String::from_utf8_lossy(&buf)
+            .trim_end_matches('\0')
+            .to_string())
+    }
+
+    #[test]
+    fn test_ephemeral_files() -> Result<(), io::Error> {
+        let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?;
+
+        let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?;
+
+        file_a.write_all_at(b"foo", 0)?;
+        assert_eq!("foo", read_string(&file_a, 0, 20)?);
+
+        file_a.write_all_at(b"bar", 3)?;
+        assert_eq!("foobar", read_string(&file_a, 0, 20)?);
+
+        // Open a lot of files, enough to cause some page evictions.
+        let mut efiles = Vec::new();
+        for fileno in 0..100 {
+            let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?;
+            efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?;
+            assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?);
+            efiles.push((fileno, efile));
+        }
+
+        // Check that all the files can still be read from. Use them in random order for
+        // good measure.
+        efiles.as_mut_slice().shuffle(&mut thread_rng());
+        for (fileno, efile) in efiles.iter_mut() {
+            assert_eq!(format!("file {}", fileno), read_string(efile, 0, 10)?);
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_ephemeral_blobs() -> Result<(), io::Error> {
        let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;

        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;

        let pos_foo = file.write_blob(b"foo")?;
-        assert_eq!(
-            b"foo",
-            file.block_cursor().read_blob(pos_foo).await?.as_slice()
-        );
+        assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
        let pos_bar = file.write_blob(b"bar")?;
-        assert_eq!(
-            b"foo",
-            file.block_cursor().read_blob(pos_foo).await?.as_slice()
-        );
-        assert_eq!(
-            b"bar",
-            file.block_cursor().read_blob(pos_bar).await?.as_slice()
-        );
+        assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
+        assert_eq!(b"bar", file.block_cursor().read_blob(pos_bar)?.as_slice());

        let mut blobs = Vec::new();
        for i in 0..10000 {
@@ -358,9 +420,9 @@ mod tests {
            blobs.push((pos, data));
        }

-        let cursor = BlockCursor::new(&file);
+        let mut cursor = BlockCursor::new(&file);
        for (pos, expected) in blobs {
-            let actual = cursor.read_blob(pos).await?;
+            let actual = cursor.read_blob(pos)?;
            assert_eq!(actual, expected);
        }

@@ -369,7 +431,7 @@ mod tests {
        large_data.resize(20000, 0);
        thread_rng().fill_bytes(&mut large_data);
        let pos_large = file.write_blob(&large_data)?;
-        let result = file.block_cursor().read_blob(pos_large).await?;
+        let result = file.block_cursor().read_blob(pos_large)?;
        assert_eq!(result, large_data);

        Ok(())
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -121,7 +121,7 @@ impl BatchedUpdates<'_> {
    ///
    /// This should be called when the corresponding file on disk has been deleted.
    ///
-    pub fn remove_historic(&mut self, layer_desc: &PersistentLayerDesc) {
+    pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc) {
        self.layer_map.remove_historic_noflush(layer_desc)
    }

@@ -253,11 +253,11 @@ impl LayerMap {
    ///
    /// Helper function for BatchedUpdates::remove_historic
    ///
-    pub fn remove_historic_noflush(&mut self, layer_desc: &PersistentLayerDesc) {
+    pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
        self.historic
-            .remove(historic_layer_coverage::LayerKey::from(layer_desc));
+            .remove(historic_layer_coverage::LayerKey::from(&layer_desc));
        let layer_key = layer_desc.key();
-        if Self::is_l0(layer_desc) {
+        if Self::is_l0(&layer_desc) {
            let len_before = self.l0_delta_layers.len();
            let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
            l0_delta_layers.retain(|other| other.key() != layer_key);
@@ -766,7 +766,8 @@ mod tests {
                expected_in_counts
            );

-            map.batch_update().remove_historic(downloaded.layer_desc());
+            map.batch_update()
+                .remove_historic(downloaded.layer_desc().clone());
            assert_eq!(count_layer_in(&map, downloaded.layer_desc()), (0, 0));
        }

--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -20,19 +20,17 @@ use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};

 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};

-use super::delete::{remote_delete_mark_exists, DeleteTenantError};
-use super::timeline::delete::DeleteTimelineFlow;
+use super::delete::DeleteTimelineFlow;

 /// The tenants known to the pageserver.
 /// The enum variants are used to distinguish the different states that the pageserver can be in.
-pub(crate) enum TenantsMap {
+enum TenantsMap {
    /// [`init_tenant_mgr`] is not done yet.
    Initializing,
    /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
@@ -44,13 +42,13 @@ pub(crate) enum TenantsMap {
 }

 impl TenantsMap {
-    pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
+    fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id),
        }
    }
-    pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option<Arc<Tenant>> {
+    fn remove(&mut self, tenant_id: &TenantId) -> Option<Arc<Tenant>> {
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id),
@@ -99,9 +97,7 @@ pub async fn init_tenant_mgr(
                        );
                    }
                } else {
-                    // This case happens if we:
-                    // * crash during attach before creating the attach marker file
-                    // * crash during tenant delete before removing tenant directory
+                    // This case happens if we crash during attach before creating the attach marker file
                    let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
                        format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
                    })?;
@@ -128,7 +124,6 @@ pub async fn init_tenant_mgr(
                        broker_client.clone(),
                        remote_storage.clone(),
                        Some(init_order.clone()),
-                        &TENANTS,
                        &ctx,
                    ) {
                        Ok(tenant) => {
@@ -159,13 +154,12 @@ pub async fn init_tenant_mgr(
    Ok(())
 }

-pub(crate) fn schedule_local_tenant_processing(
+pub fn schedule_local_tenant_processing(
    conf: &'static PageServerConf,
    tenant_path: &Path,
    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
    init_order: Option<InitializationOrder>,
-    tenants: &'static tokio::sync::RwLock<TenantsMap>,
    ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
    anyhow::ensure!(
@@ -225,7 +219,6 @@ pub(crate) fn schedule_local_tenant_processing(
            broker_client,
            remote_storage,
            init_order,
-            tenants,
            ctx,
        )
    };
@@ -273,77 +266,71 @@ async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
        }
    };

-    let started_at = std::time::Instant::now();
    let mut join_set = JoinSet::new();
    for (tenant_id, tenant) in tenants_to_shut_down {
        join_set.spawn(
            async move {
-                let freeze_and_flush = true;
+                // ordering shouldn't matter for this, either we store true right away or never
+                let ordering = std::sync::atomic::Ordering::Relaxed;
+                let joined_other = std::sync::atomic::AtomicBool::new(false);

-                let res = {
-                    let (_guard, shutdown_progress) = completion::channel();
-                    tenant.shutdown(shutdown_progress, freeze_and_flush).await
+                let mut shutdown = std::pin::pin!(async {
+                    let freeze_and_flush = true;
+
+                    let res = {
+                        let (_guard, shutdown_progress) = completion::channel();
+                        tenant.shutdown(shutdown_progress, freeze_and_flush).await
+                    };
+
+                    if let Err(other_progress) = res {
+                        // join the another shutdown in progress
+                        joined_other.store(true, ordering);
+                        other_progress.wait().await;
+                    }
+                });
+
+                // in practice we might not have a lot time to go, since systemd is going to
+                // SIGKILL us at 10s, but we can try. delete tenant might take a while, so put out
+                // a warning.
+                let warning = std::time::Duration::from_secs(5);
+                let mut warning = std::pin::pin!(tokio::time::sleep(warning));
+
+                tokio::select! {
+                    _ = &mut shutdown => {},
+                    _ = &mut warning => {
+                        let joined_other = joined_other.load(ordering);
+                        warn!(%joined_other, "waiting for the shutdown to complete");
+                        shutdown.await;
+                    }
                };

-                if let Err(other_progress) = res {
-                    // join the another shutdown in progress
-                    other_progress.wait().await;
-                }
-
-                // we cannot afford per tenant logging here, because if s3 is degraded, we are
-                // going to log too many lines
-
                debug!("tenant successfully stopped");
            }
            .instrument(info_span!("shutdown", %tenant_id)),
        );
    }

-    let total = join_set.len();
    let mut panicked = 0;
-    let mut buffering = true;
-    const BUFFER_FOR: std::time::Duration = std::time::Duration::from_millis(500);
-    let mut buffered = std::pin::pin!(tokio::time::sleep(BUFFER_FOR));

-    while !join_set.is_empty() {
-        tokio::select! {
-            Some(joined) = join_set.join_next() => {
-                match joined {
-                    Ok(()) => {}
-                    Err(join_error) if join_error.is_cancelled() => {
-                        unreachable!("we are not cancelling any of the futures");
-                    }
-                    Err(join_error) if join_error.is_panic() => {
-                        // cannot really do anything, as this panic is likely a bug
-                        panicked += 1;
-                    }
-                    Err(join_error) => {
-                        warn!("unknown kind of JoinError: {join_error}");
-                    }
-                }
-                if !buffering {
-                    // buffer so that every 500ms since the first update (or starting) we'll log
-                    // how far away we are; this is because we will get SIGKILL'd at 10s, and we
-                    // are not able to log *then*.
-                    buffering = true;
-                    buffered.as_mut().reset(tokio::time::Instant::now() + BUFFER_FOR);
-                }
-            },
-            _ = &mut buffered, if buffering => {
-                buffering = false;
-                info!(remaining = join_set.len(), total, elapsed_ms = started_at.elapsed().as_millis(), "waiting for tenants to shutdown");
+    while let Some(res) = join_set.join_next().await {
+        match res {
+            Ok(()) => {}
+            Err(join_error) if join_error.is_cancelled() => {
+                unreachable!("we are not cancelling any of the futures");
+            }
+            Err(join_error) if join_error.is_panic() => {
+                // cannot really do anything, as this panic is likely a bug
+                panicked += 1;
+            }
+            Err(join_error) => {
+                warn!("unknown kind of JoinError: {join_error}");
            }
        }
    }

    if panicked > 0 {
-        warn!(
-            panicked,
-            total, "observed panicks while shutting down tenants"
-        );
+        warn!(panicked, "observed panicks while shutting down tenants");
    }
-
-    // caller will log how long we took
 }

 pub async fn create_tenant(
@@ -363,7 +350,7 @@ pub async fn create_tenant(
        //       See https://github.com/neondatabase/neon/issues/4233

        let created_tenant =
-            schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, &TENANTS, ctx)?;
+            schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

@@ -424,14 +411,6 @@ pub async fn get_tenant(
    }
 }

-pub async fn delete_tenant(
-    conf: &'static PageServerConf,
-    remote_storage: Option<GenericRemoteStorage>,
-    tenant_id: TenantId,
-) -> Result<(), DeleteTenantError> {
-    DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant_id).await
-}
-
 #[derive(Debug, thiserror::Error)]
 pub enum DeleteTimelineError {
    #[error("Tenant {0}")]
@@ -447,7 +426,7 @@ pub async fn delete_timeline(
    _ctx: &RequestContext,
 ) -> Result<(), DeleteTimelineError> {
    let tenant = get_tenant(tenant_id, true).await?;
-    DeleteTimelineFlow::run(&tenant, timeline_id, false).await?;
+    DeleteTimelineFlow::run(&tenant, timeline_id).await?;
    Ok(())
 }

@@ -522,7 +501,7 @@ pub async fn load_tenant(
                .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
        }

-        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, &TENANTS, ctx)
+        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx)
            .with_context(|| {
                format!("Failed to schedule tenant processing in path {tenant_path:?}")
            })?;
@@ -591,12 +570,6 @@ pub async fn attach_tenant(
    remote_storage: GenericRemoteStorage,
    ctx: &RequestContext,
 ) -> Result<(), TenantMapInsertError> {
-    // Temporary solution, proper one would be to resume deletion, but that needs more plumbing around Tenant::load/Tenant::attach
-    // Corresponding issue https://github.com/neondatabase/neon/issues/5006
-    if remote_delete_mark_exists(conf, &tenant_id, &remote_storage).await? {
-        return Err(anyhow::anyhow!("Tenant is marked as deleted on remote storage").into());
-    }
-
    tenant_map_insert(tenant_id, || {
        let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach)?;
        // TODO: tenant directory remains on disk if we bail out from here on.
@@ -609,7 +582,7 @@ pub async fn attach_tenant(
            .context("check for attach marker file existence")?;
        anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");

-        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, &TENANTS, ctx)?;
+        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -211,9 +211,6 @@ use chrono::{NaiveDateTime, Utc};
 // re-export these
 pub use download::{is_temp_download_file, list_remote_timelines};
 use scopeguard::ScopeGuard;
-use utils::backoff::{
-    self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
-};

 use std::collections::{HashMap, VecDeque};
 use std::path::Path;
@@ -222,6 +219,7 @@ use std::sync::{Arc, Mutex};

 use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
 use std::ops::DerefMut;
+use tokio::runtime::Runtime;
 use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;
@@ -243,6 +241,7 @@ use crate::{
    tenant::upload_queue::{
        UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
    },
+    {exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS},
 };

 use utils::id::{TenantId, TimelineId};
@@ -257,12 +256,12 @@ use super::upload_queue::SetDeletedFlagProgress;
 // But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
 // level instead, as repeated failures can mean a more serious problem. If it
 // fails more than FAILED_DOWNLOAD_RETRIES times, we give up
-pub(crate) const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
-pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10;
+const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
+const FAILED_DOWNLOAD_RETRIES: u32 = 10;

 // Similarly log failed uploads and deletions at WARN level, after this many
 // retries. Uploads and deletions are retried forever, though.
-pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
+const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;

 pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
@@ -310,7 +309,7 @@ pub enum PersistIndexPartWithDeletedFlagError {
 pub struct RemoteTimelineClient {
    conf: &'static PageServerConf,

-    runtime: tokio::runtime::Handle,
+    runtime: &'static Runtime,

    tenant_id: TenantId,
    timeline_id: TimelineId,
@@ -337,7 +336,7 @@ impl RemoteTimelineClient {
    ) -> RemoteTimelineClient {
        RemoteTimelineClient {
            conf,
-            runtime: BACKGROUND_RUNTIME.handle().to_owned(),
+            runtime: &BACKGROUND_RUNTIME,
            tenant_id,
            timeline_id,
            storage_impl: remote_storage,
@@ -753,24 +752,12 @@ impl RemoteTimelineClient {

        pausable_failpoint!("persist_deleted_index_part");

-        backoff::retry(
-            || async {
-                upload::upload_index_part(
-                    self.conf,
-                    &self.storage_impl,
-                    &self.tenant_id,
-                    &self.timeline_id,
-                    &index_part_with_deleted_at,
-                )
-                .await
-            },
-            |_e| false,
-            1,
-            // have just a couple of attempts
-            // when executed as part of timeline deletion this happens in context of api call
-            // when executed as part of tenant deletion this happens in the background
-            2,
-            "persist_index_part_with_deleted_flag",
+        upload::upload_index_part(
+            self.conf,
+            &self.storage_impl,
+            &self.tenant_id,
+            &self.timeline_id,
+            &index_part_with_deleted_at,
        )
        .await?;

@@ -847,19 +834,10 @@ impl RemoteTimelineClient {
        let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
        let timeline_storage_path = self.conf.remote_path(&timeline_path)?;

-        let remaining = backoff::retry(
-            || async {
-                self.storage_impl
-                    .list_files(Some(&timeline_storage_path))
-                    .await
-            },
-            |_e| false,
-            FAILED_DOWNLOAD_WARN_THRESHOLD,
-            FAILED_REMOTE_OP_RETRIES,
-            "list_prefixes",
-        )
-        .await
-        .context("list prefixes")?;
+        let remaining = self
+            .storage_impl
+            .list_prefixes(Some(&timeline_storage_path))
+            .await?;

        let remaining: Vec<RemotePath> = remaining
            .into_iter()
@@ -874,15 +852,7 @@ impl RemoteTimelineClient {
            .collect();

        if !remaining.is_empty() {
-            backoff::retry(
-                || async { self.storage_impl.delete_objects(&remaining).await },
-                |_e| false,
-                FAILED_UPLOAD_WARN_THRESHOLD,
-                FAILED_REMOTE_OP_RETRIES,
-                "delete_objects",
-            )
-            .await
-            .context("delete_objects")?;
+            self.storage_impl.delete_objects(&remaining).await?;
        }

        fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -894,16 +864,7 @@ impl RemoteTimelineClient {
        let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));

        debug!("deleting index part");
-
-        backoff::retry(
-            || async { self.storage_impl.delete(&index_file_path).await },
-            |_e| false,
-            FAILED_UPLOAD_WARN_THRESHOLD,
-            FAILED_REMOTE_OP_RETRIES,
-            "delete_index",
-        )
-        .await
-        .context("delete_index")?;
+        self.storage_impl.delete(&index_file_path).await?;

        fail::fail_point!("timeline-delete-after-index-delete", |_| {
            Err(anyhow::anyhow!(
@@ -993,7 +954,7 @@ impl RemoteTimelineClient {
            let tenant_id = self.tenant_id;
            let timeline_id = self.timeline_id;
            task_mgr::spawn(
-                &self.runtime,
+                self.runtime.handle(),
                TaskKind::RemoteUploadTask,
                Some(self.tenant_id),
                Some(self.timeline_id),
@@ -1346,7 +1307,7 @@ mod tests {
        context::RequestContext,
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
-            Tenant, Timeline,
+            Tenant,
        },
        DEFAULT_PG_VERSION,
    };
@@ -1355,6 +1316,7 @@ mod tests {
        collections::HashSet,
        path::{Path, PathBuf},
    };
+    use tokio::runtime::EnterGuard;
    use utils::lsn::Lsn;

    pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
@@ -1404,25 +1366,35 @@ mod tests {
    }

    struct TestSetup {
+        runtime: &'static tokio::runtime::Runtime,
+        entered_runtime: EnterGuard<'static>,
        harness: TenantHarness,
        tenant: Arc<Tenant>,
-        timeline: Arc<Timeline>,
        tenant_ctx: RequestContext,
        remote_fs_dir: PathBuf,
        client: Arc<RemoteTimelineClient>,
    }

    impl TestSetup {
-        async fn new(test_name: &str) -> anyhow::Result<Self> {
+        fn new(test_name: &str) -> anyhow::Result<Self> {
            // Use a current-thread runtime in the test
+            let runtime = Box::leak(Box::new(
+                tokio::runtime::Builder::new_current_thread()
+                    .enable_all()
+                    .build()?,
+            ));
+            let entered_runtime = runtime.enter();
+
            let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
            let harness = TenantHarness::create(test_name)?;
-            let (tenant, ctx) = harness.load().await;
-
+            let (tenant, ctx) = runtime.block_on(harness.load());
            // create an empty timeline directory
-            let timeline = tenant
-                .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
-                .await?;
+            let _ = runtime.block_on(tenant.create_test_timeline(
+                TIMELINE_ID,
+                Lsn(8),
+                DEFAULT_PG_VERSION,
+                &ctx,
+            ))?;

            let remote_fs_dir = harness.conf.workdir.join("remote_fs");
            std::fs::create_dir_all(remote_fs_dir)?;
@@ -1444,7 +1416,7 @@ mod tests {

            let client = Arc::new(RemoteTimelineClient {
                conf: harness.conf,
-                runtime: tokio::runtime::Handle::current(),
+                runtime,
                tenant_id: harness.tenant_id,
                timeline_id: TIMELINE_ID,
                storage_impl: storage,
@@ -1456,9 +1428,10 @@ mod tests {
            });

            Ok(Self {
+                runtime,
+                entered_runtime,
                harness,
                tenant,
-                timeline,
                tenant_ctx: ctx,
                remote_fs_dir,
                client,
@@ -1467,8 +1440,8 @@ mod tests {
    }

    // Test scheduling
-    #[tokio::test]
-    async fn upload_scheduling() {
+    #[test]
+    fn upload_scheduling() -> anyhow::Result<()> {
        // Test outline:
        //
        // Schedule upload of a bunch of layers. Check that they are started immediately, not queued
@@ -1484,26 +1457,25 @@ mod tests {
        // Schedule index upload. Check that it's queued

        let TestSetup {
+            runtime,
+            entered_runtime: _entered_runtime,
            harness,
            tenant: _tenant,
-            timeline: _timeline,
            tenant_ctx: _tenant_ctx,
            remote_fs_dir,
            client,
-        } = TestSetup::new("upload_scheduling").await.unwrap();
+        } = TestSetup::new("upload_scheduling").unwrap();

        let timeline_path = harness.timeline_path(&TIMELINE_ID);

        println!("workdir: {}", harness.conf.workdir.display());

        let remote_timeline_dir =
-            remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
+            remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir)?);
        println!("remote_timeline_dir: {}", remote_timeline_dir.display());

        let metadata = dummy_metadata(Lsn(0x10));
-        client
-            .init_upload_queue_for_empty_remote(&metadata)
-            .unwrap();
+        client.init_upload_queue_for_empty_remote(&metadata)?;

        // Create a couple of dummy files,  schedule upload for them
        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
@@ -1512,32 +1484,26 @@ mod tests {
        let content_1 = dummy_contents("foo");
        let content_2 = dummy_contents("bar");
        let content_3 = dummy_contents("baz");
+        std::fs::write(
+            timeline_path.join(layer_file_name_1.file_name()),
+            &content_1,
+        )?;
+        std::fs::write(
+            timeline_path.join(layer_file_name_2.file_name()),
+            &content_2,
+        )?;
+        std::fs::write(timeline_path.join(layer_file_name_3.file_name()), content_3)?;

-        for (filename, content) in [
-            (&layer_file_name_1, &content_1),
-            (&layer_file_name_2, &content_2),
-            (&layer_file_name_3, &content_3),
-        ] {
-            std::fs::write(timeline_path.join(filename.file_name()), content).unwrap();
-        }
-
-        client
-            .schedule_layer_file_upload(
-                &layer_file_name_1,
-                &LayerFileMetadata::new(content_1.len() as u64),
-            )
-            .unwrap();
-        client
-            .schedule_layer_file_upload(
-                &layer_file_name_2,
-                &LayerFileMetadata::new(content_2.len() as u64),
-            )
-            .unwrap();
+        client.schedule_layer_file_upload(
+            &layer_file_name_1,
+            &LayerFileMetadata::new(content_1.len() as u64),
+        )?;
+        client.schedule_layer_file_upload(
+            &layer_file_name_2,
+            &LayerFileMetadata::new(content_2.len() as u64),
+        )?;

        // Check that they are started immediately, not queued
-        //
-        // this works because we running within block_on, so any futures are now queued up until
-        // our next await point.
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1551,9 +1517,7 @@ mod tests {

        // Schedule upload of index. Check that it is queued
        let metadata = dummy_metadata(Lsn(0x20));
-        client
-            .schedule_index_upload_for_metadata_update(&metadata)
-            .unwrap();
+        client.schedule_index_upload_for_metadata_update(&metadata)?;
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1562,7 +1526,7 @@ mod tests {
        }

        // Wait for the uploads to finish
-        client.wait_completion().await.unwrap();
+        runtime.block_on(client.wait_completion())?;
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1572,7 +1536,7 @@ mod tests {
        }

        // Download back the index.json, and check that the list of files is correct
-        let index_part = match client.download_index_file().await.unwrap() {
+        let index_part = match runtime.block_on(client.download_index_file())? {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };
@@ -1584,19 +1548,17 @@ mod tests {
                &layer_file_name_2.file_name(),
            ],
        );
-        let downloaded_metadata = index_part.parse_metadata().unwrap();
+        let downloaded_metadata = index_part.parse_metadata()?;
        assert_eq!(downloaded_metadata, metadata);

        // Schedule upload and then a deletion. Check that the deletion is queued
-        client
-            .schedule_layer_file_upload(
-                &layer_file_name_3,
-                &LayerFileMetadata::new(content_3.len() as u64),
-            )
-            .unwrap();
-        client
-            .schedule_layer_file_deletion(&[layer_file_name_1.clone()])
-            .unwrap();
+        let content_baz = dummy_contents("baz");
+        std::fs::write(timeline_path.join("baz"), &content_baz)?;
+        client.schedule_layer_file_upload(
+            &layer_file_name_3,
+            &LayerFileMetadata::new(content_baz.len() as u64),
+        )?;
+        client.schedule_layer_file_deletion(&[layer_file_name_1.clone()])?;
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1618,7 +1580,7 @@ mod tests {
        );

        // Finish them
-        client.wait_completion().await.unwrap();
+        runtime.block_on(client.wait_completion())?;

        assert_remote_files(
            &[
@@ -1628,24 +1590,23 @@ mod tests {
            ],
            &remote_timeline_dir,
        );
+
+        Ok(())
    }

-    #[tokio::test]
-    async fn bytes_unfinished_gauge_for_layer_file_uploads() {
+    #[test]
+    fn bytes_unfinished_gauge_for_layer_file_uploads() -> anyhow::Result<()> {
        // Setup

        let TestSetup {
+            runtime,
            harness,
-            tenant: _tenant,
-            timeline: _timeline,
            client,
            ..
-        } = TestSetup::new("metrics").await.unwrap();
+        } = TestSetup::new("metrics")?;

        let metadata = dummy_metadata(Lsn(0x10));
-        client
-            .init_upload_queue_for_empty_remote(&metadata)
-            .unwrap();
+        client.init_upload_queue_for_empty_remote(&metadata)?;

        let timeline_path = harness.timeline_path(&TIMELINE_ID);

@@ -1654,8 +1615,7 @@ mod tests {
        std::fs::write(
            timeline_path.join(layer_file_name_1.file_name()),
            &content_1,
-        )
-        .unwrap();
+        )?;

        #[derive(Debug, PartialEq)]
        struct BytesStartedFinished {
@@ -1681,16 +1641,14 @@ mod tests {

        let init = get_bytes_started_stopped();

-        client
-            .schedule_layer_file_upload(
-                &layer_file_name_1,
-                &LayerFileMetadata::new(content_1.len() as u64),
-            )
-            .unwrap();
+        client.schedule_layer_file_upload(
+            &layer_file_name_1,
+            &LayerFileMetadata::new(content_1.len() as u64),
+        )?;

        let pre = get_bytes_started_stopped();

-        client.wait_completion().await.unwrap();
+        runtime.block_on(client.wait_completion())?;

        let post = get_bytes_started_stopped();

@@ -1718,5 +1676,7 @@ mod tests {
                finished: Some(content_1.len())
            }
        );
+
+        Ok(())
    }
 }
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -11,17 +11,23 @@ use std::time::Duration;
 use anyhow::{anyhow, Context};
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
-use utils::{backoff, crashsafe};
+
+use tracing::{info, warn};

 use crate::config::PageServerConf;
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
+use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};

 use super::index::{IndexPart, LayerFileMetadata};
-use super::{FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};
+use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
+
+async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
+    fs::File::open(path).await?.sync_all().await
+}

 static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);

@@ -146,7 +152,7 @@ pub async fn download_layer_file<'a>(
        })
        .map_err(DownloadError::Other)?;

-    crashsafe::fsync_async(&local_path)
+    fsync_path(&local_path)
        .await
        .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
        .map_err(DownloadError::Other)?;
@@ -262,6 +268,7 @@ pub(super) async fn download_index_part(
    Ok(index_part)
 }

+///
 /// Helper function to handle retries for a download operation.
 ///
 /// Remote operations can fail due to rate limits (IAM, S3), spurious network
@@ -269,17 +276,47 @@ pub(super) async fn download_index_part(
 /// with backoff.
 ///
 /// (See similar logic for uploads in `perform_upload_task`)
-async fn download_retry<T, O, F>(op: O, description: &str) -> Result<T, DownloadError>
+async fn download_retry<T, O, F>(mut op: O, description: &str) -> Result<T, DownloadError>
 where
    O: FnMut() -> F,
    F: Future<Output = Result<T, DownloadError>>,
 {
-    backoff::retry(
-        op,
-        |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
-        FAILED_DOWNLOAD_WARN_THRESHOLD,
-        FAILED_REMOTE_OP_RETRIES,
-        description,
-    )
-    .await
+    let mut attempts = 0;
+    loop {
+        let result = op().await;
+        match result {
+            Ok(_) => {
+                if attempts > 0 {
+                    info!("{description} succeeded after {attempts} retries");
+                }
+                return result;
+            }
+
+            // These are "permanent" errors that should not be retried.
+            Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => {
+                return result;
+            }
+            // Assume that any other failure might be transient, and the operation might
+            // succeed if we just keep trying.
+            Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => {
+                info!("{description} failed, will retry (attempt {attempts}): {err:#}");
+            }
+            Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => {
+                warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
+            }
+            Err(DownloadError::Other(ref err)) => {
+                // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
+                warn!("{description} still failed after {attempts} retries, giving up: {err:?}");
+                return result;
+            }
+        }
+        // sleep and retry
+        exponential_backoff(
+            attempts,
+            DEFAULT_BASE_BACKOFF_SECONDS,
+            DEFAULT_MAX_BACKOFF_SECONDS,
+        )
+        .await;
+        attempts += 1;
+    }
 }
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -223,45 +223,6 @@ mod tests {
        assert_eq!(part, expected);
    }

-    #[test]
-    fn v2_indexpart_is_parsed_with_deleted_at() {
-        let example = r#"{
-            "version":2,
-            "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
-            "missing_layers":["This shouldn't fail deserialization"],
-            "layer_metadata":{
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
-                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
-            },
-            "disk_consistent_lsn":"0/16960E8",
-            "metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
-            "deleted_at": "2023-07-31T09:00:00.123"
-        }"#;
-
-        let expected = IndexPart {
-            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
-            version: 2,
-            timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
-            layer_metadata: HashMap::from([
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
-                    file_size: 25600000,
-                }),
-                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
-                    // serde_json should always parse this but this might be a double with jq for
-                    // example.
-                    file_size: 9007199254741001,
-                })
-            ]),
-            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
-            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
-                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
-        };
-
-        let part = serde_json::from_str::<IndexPart>(example).unwrap();
-        assert_eq!(part, expected);
-    }
-
    #[test]
    fn empty_layers_are_parsed() {
        let empty_layers_json = r#"{
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -8,8 +8,8 @@ mod layer_desc;
 mod remote_layer;

 use crate::config::PageServerConf;
-use crate::context::{AccessStatsBehavior, RequestContext};
-use crate::repository::Key;
+use crate::context::RequestContext;
+use crate::repository::{Key, Value};
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Result;
@@ -34,7 +34,7 @@ use utils::{
    lsn::Lsn,
 };

-pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
+pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
 pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
@@ -241,14 +241,10 @@ impl LayerAccessStats {
        });
    }

-    fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) {
-        if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
-            return;
-        }
-
+    fn record_access(&self, access_kind: LayerAccessKind, task_kind: TaskKind) {
        let this_access = LayerAccessStatFullDetails {
            when: SystemTime::now(),
-            task_kind: ctx.task_kind(),
+            task_kind,
            access_kind,
        };

@@ -256,7 +252,7 @@ impl LayerAccessStats {
        locked.iter_mut().for_each(|inner| {
            inner.first_access.get_or_insert(this_access);
            inner.count_by_access_kind[access_kind] += 1;
-            inner.task_kind_flag |= ctx.task_kind();
+            inner.task_kind_flag |= task_kind;
            inner.last_accesses.write(this_access);
        })
    }
@@ -385,6 +381,12 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
    async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
 }

+/// Returned by [`PersistentLayer::iter`]
+pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
+
+/// Returned by [`PersistentLayer::key_iter`]
+pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
+
 /// Get a layer descriptor from a layer.
 pub trait AsLayerDesc {
    /// Get the layer descriptor.
@@ -405,6 +407,16 @@ pub trait AsLayerDesc {
 /// An image layer is a snapshot of all the data in a key-range, at a single
 /// LSN.
 pub trait PersistentLayer: Layer + AsLayerDesc {
+    /// Identify the tenant this layer belongs to
+    fn get_tenant_id(&self) -> TenantId {
+        self.layer_desc().tenant_id
+    }
+
+    /// Identify the timeline this layer belongs to
+    fn get_timeline_id(&self) -> TimelineId {
+        self.layer_desc().timeline_id
+    }
+
    /// File name used for this layer, both in the pageserver's local filesystem
    /// state as well as in the remote storage.
    fn filename(&self) -> LayerFileName {
@@ -415,6 +427,15 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
    // `None` for `RemoteLayer`.
    fn local_path(&self) -> Option<PathBuf>;

+    /// Iterate through all keys and values stored in the layer
+    fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>>;
+
+    /// Iterate through all keys stored in the layer. Returns key, lsn and value size.
+    /// Used only for compaction, so only DeltaLayer implements it; the default panics.
+    fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
+        panic!("Not implemented")
+    }
+
    /// Permanently remove this layer from disk.
    fn delete_resident_layer_file(&self) -> Result<()>;

@@ -430,6 +451,14 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
        false
    }

+    /// Returns the layer file size recorded in the layer descriptor.
+    ///
+    /// Should not change over the lifetime of the layer object because
+    /// current_physical_size is computed as the sum of this value.
+    fn file_size(&self) -> u64 {
+        self.layer_desc().file_size
+    }
+
    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;

    fn access_stats(&self) -> &LayerAccessStats;
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -29,10 +29,10 @@
 //!
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
-use crate::page_cache::PAGE_SZ;
+use crate::page_cache::{PageReadGuard, PAGE_SZ};
 use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
-use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
+use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
    PersistentLayer, ValueReconstructResult, ValueReconstructState,
@@ -41,6 +41,7 @@ use crate::virtual_file::VirtualFile;
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
+use once_cell::sync::OnceCell;
 use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
@@ -51,8 +52,6 @@ use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
-use tokio::runtime::Handle;
-use tokio::sync::OnceCell;
 use tracing::*;

 use utils::{
@@ -62,8 +61,8 @@ use utils::{
 };

 use super::{
-    AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf,
-    PersistentLayerDesc,
+    AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
+    LayerKeyIter, PathOrConf, PersistentLayerDesc,
 };

 ///
@@ -91,30 +90,14 @@ pub struct Summary {

 impl From<&DeltaLayer> for Summary {
    fn from(layer: &DeltaLayer) -> Self {
-        Self::expected(
-            layer.desc.tenant_id,
-            layer.desc.timeline_id,
-            layer.desc.key_range.clone(),
-            layer.desc.lsn_range.clone(),
-        )
-    }
-}
-
-impl Summary {
-    pub(super) fn expected(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        keys: Range<Key>,
-        lsns: Range<Lsn>,
-    ) -> Self {
        Self {
            magic: DELTA_FILE_MAGIC,
            format_version: STORAGE_FORMAT_VERSION,

-            tenant_id,
-            timeline_id,
-            key_range: keys,
-            lsn_range: lsns,
+            tenant_id: layer.desc.tenant_id,
+            timeline_id: layer.desc.timeline_id,
+            key_range: layer.desc.key_range.clone(),
+            lsn_range: layer.desc.lsn_range.clone(),

            index_start_blk: 0,
            index_root_blk: 0,
@@ -125,10 +108,12 @@ impl Summary {
 // Flag indicating that this version initialize the page
 const WILL_INIT: u64 = 1;

+///
 /// Struct representing reference to BLOB in layers. Reference contains BLOB
 /// offset, and for WAL records it also contains `will_init` flag. The flag
 /// helps to determine the range of records that needs to be applied, without
 /// reading/deserializing records themselves.
+///
 #[derive(Debug, Serialize, Deserialize, Copy, Clone)]
 pub struct BlobRef(pub u64);

@@ -153,8 +138,10 @@ impl BlobRef {
 pub const DELTA_KEY_SIZE: usize = KEY_SIZE + 8;
 struct DeltaKey([u8; DELTA_KEY_SIZE]);

+///
 /// This is the key of the B-tree index stored in the delta layer. It consists
 /// of the serialized representation of a Key and LSN.
+///
 impl DeltaKey {
    fn from_slice(buf: &[u8]) -> Self {
        let mut bytes: [u8; DELTA_KEY_SIZE] = [0u8; DELTA_KEY_SIZE];
@@ -202,7 +189,7 @@ pub struct DeltaLayer {

    access_stats: LayerAccessStats,

-    inner: OnceCell<Arc<DeltaLayerInner>>,
+    inner: OnceCell<DeltaLayerInner>,
 }

 impl std::fmt::Debug for DeltaLayer {
@@ -227,12 +214,6 @@ pub struct DeltaLayerInner {
    file: FileBlockReader<VirtualFile>,
 }

-impl AsRef<DeltaLayerInner> for DeltaLayerInner {
-    fn as_ref(&self) -> &DeltaLayerInner {
-        self
-    }
-}
-
 impl std::fmt::Debug for DeltaLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("DeltaLayerInner")
@@ -261,7 +242,7 @@ impl Layer for DeltaLayer {
            return Ok(());
        }

-        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
+        let inner = self.load(LayerAccessKind::Dump, ctx)?;

        println!(
            "index_start_blk: {}, root {}",
@@ -275,14 +256,13 @@ impl Layer for DeltaLayer {
            file,
        );

-        tree_reader.dump().await?;
+        tree_reader.dump()?;

-        let cursor = file.block_cursor();
+        let mut cursor = file.block_cursor();

        // A subroutine to dump a single blob
-        let dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
-            // TODO this is not ideal, but on the other hand we are in dumping code...
-            let buf = Handle::current().block_on(cursor.read_blob(blob_ref.pos()))?;
+        let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
+            let buf = cursor.read_blob(blob_ref.pos())?;
            let val = Value::des(&buf)?;
            let desc = match val {
                Value::Image(img) => {
@@ -301,24 +281,22 @@ impl Layer for DeltaLayer {
            Ok(desc)
        };

-        tree_reader
-            .visit(
-                &[0u8; DELTA_KEY_SIZE],
-                VisitDirection::Forwards,
-                |delta_key, val| {
-                    let blob_ref = BlobRef(val);
-                    let key = DeltaKey::extract_key_from_buf(delta_key);
-                    let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
+        tree_reader.visit(
+            &[0u8; DELTA_KEY_SIZE],
+            VisitDirection::Forwards,
+            |delta_key, val| {
+                let blob_ref = BlobRef(val);
+                let key = DeltaKey::extract_key_from_buf(delta_key);
+                let lsn = DeltaKey::extract_lsn_from_buf(delta_key);

-                    let desc = match dump_blob(blob_ref) {
-                        Ok(desc) => desc,
-                        Err(err) => format!("ERROR: {}", err),
-                    };
-                    println!("  key {} at {}: {}", key, lsn, desc);
-                    true
-                },
-            )
-            .await?;
+                let desc = match dump_blob(blob_ref) {
+                    Ok(desc) => desc,
+                    Err(err) => format!("ERROR: {}", err),
+                };
+                println!("  key {} at {}: {}", key, lsn, desc);
+                true
+            },
+        )?;

        Ok(())
    }
@@ -331,15 +309,82 @@ impl Layer for DeltaLayer {
        ctx: &RequestContext,
    ) -> anyhow::Result<ValueReconstructResult> {
        ensure!(lsn_range.start >= self.desc.lsn_range.start);
+        let mut need_image = true;

        ensure!(self.desc.key_range.contains(&key));

-        let inner = self
-            .load(LayerAccessKind::GetValueReconstructData, ctx)
-            .await?;
-        inner
-            .get_value_reconstruct_data(key, lsn_range, reconstruct_state)
-            .await
+        {
+            // Load the layer metadata (the file is opened on first access and then cached)
+            let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
+
+            // Scan the page versions backwards, starting from `lsn`.
+            let file = &inner.file;
+            let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+                inner.index_start_blk,
+                inner.index_root_blk,
+                file,
+            );
+            let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));
+
+            let mut offsets: Vec<(Lsn, u64)> = Vec::new();
+
+            tree_reader.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
+                let blob_ref = BlobRef(value);
+                if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
+                    return false;
+                }
+                let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
+                if entry_lsn < lsn_range.start {
+                    return false;
+                }
+                offsets.push((entry_lsn, blob_ref.pos()));
+
+                !blob_ref.will_init()
+            })?;
+
+            // Ok, 'offsets' now contains the offsets of all the entries we need to read
+            let mut cursor = file.block_cursor();
+            let mut buf = Vec::new();
+            for (entry_lsn, pos) in offsets {
+                cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
+                    format!(
+                        "Failed to read blob from virtual file {}",
+                        file.file.path.display()
+                    )
+                })?;
+                let val = Value::des(&buf).with_context(|| {
+                    format!(
+                        "Failed to deserialize file blob from virtual file {}",
+                        file.file.path.display()
+                    )
+                })?;
+                match val {
+                    Value::Image(img) => {
+                        reconstruct_state.img = Some((entry_lsn, img));
+                        need_image = false;
+                        break;
+                    }
+                    Value::WalRecord(rec) => {
+                        let will_init = rec.will_init();
+                        reconstruct_state.records.push((entry_lsn, rec));
+                        if will_init {
+                            // This WAL record initializes the page, so no need to go further back
+                            need_image = false;
+                            break;
+                        }
+                    }
+                }
+            }
+            // end of the scope borrowing `inner`; the loaded layer stays cached
+        }
+
+        // If an older page image is needed to reconstruct the page, let the
+        // caller know.
+        if need_image {
+            Ok(ValueReconstructResult::Continue)
+        } else {
+            Ok(ValueReconstructResult::Complete)
+        }
    }

    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
@@ -379,6 +424,23 @@ impl PersistentLayer for DeltaLayer {
        Some(self.path())
    }

+    fn iter(&self, ctx: &RequestContext) -> Result<LayerIter<'_>> {
+        let inner = self
+            .load(LayerAccessKind::KeyIter, ctx)
+            .context("load delta layer")?;
+        Ok(match DeltaValueIter::new(inner) {
+            Ok(iter) => Box::new(iter),
+            Err(err) => Box::new(std::iter::once(Err(err))),
+        })
+    }
+
+    fn key_iter(&self, ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
+        let inner = self.load(LayerAccessKind::KeyIter, ctx)?;
+        Ok(Box::new(
+            DeltaKeyIter::new(inner).context("Layer index is corrupted")?,
+        ))
+    }
+
    fn delete_resident_layer_file(&self) -> Result<()> {
        // delete underlying file
        fs::remove_file(self.path())?;
@@ -448,43 +510,55 @@ impl DeltaLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    async fn load(
-        &self,
-        access_kind: LayerAccessKind,
-        ctx: &RequestContext,
-    ) -> Result<&Arc<DeltaLayerInner>> {
-        self.access_stats.record_access(access_kind, ctx);
+    fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> {
+        self.access_stats
+            .record_access(access_kind, ctx.task_kind());
        // Quick exit if already loaded
        self.inner
            .get_or_try_init(|| self.load_inner())
-            .await
            .with_context(|| format!("Failed to load delta layer {}", self.path().display()))
    }

-    async fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
+    fn load_inner(&self) -> Result<DeltaLayerInner> {
        let path = self.path();

-        let summary = match &self.path_or_conf {
-            PathOrConf::Conf(_) => Some(Summary::from(self)),
-            PathOrConf::Path(_) => None,
-        };
+        let file = VirtualFile::open(&path)
+            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
+        let file = FileBlockReader::new(file);

-        let loaded = DeltaLayerInner::load(&path, summary)?;
+        let summary_blk = file.read_blk(0)?;
+        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;

-        if let PathOrConf::Path(ref path) = self.path_or_conf {
-            // not production code
+        match &self.path_or_conf {
+            PathOrConf::Conf(_) => {
+                let mut expected_summary = Summary::from(self);
+                expected_summary.index_start_blk = actual_summary.index_start_blk;
+                expected_summary.index_root_blk = actual_summary.index_root_blk;
+                if actual_summary != expected_summary {
+                    bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
+                }
+            }
+            PathOrConf::Path(path) => {
+                let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
+                let expected_filename = self.filename().file_name();

-            let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
-            let expected_filename = self.filename().file_name();
-
-            if actual_filename != expected_filename {
-                println!("warning: filename does not match what is expected from in-file summary");
-                println!("actual: {:?}", actual_filename);
-                println!("expected: {:?}", expected_filename);
+                if actual_filename != expected_filename {
+                    println!(
+                        "warning: filename does not match what is expected from in-file summary"
+                    );
+                    println!("actual: {:?}", actual_filename);
+                    println!("expected: {:?}", expected_filename);
+                }
            }
        }

-        Ok(Arc::new(loaded))
+        debug!("loaded from {}", &path.display());
+
+        Ok(DeltaLayerInner {
+            file,
+            index_start_blk: actual_summary.index_start_blk,
+            index_root_blk: actual_summary.index_root_blk,
+        })
    }

    /// Create a DeltaLayer struct representing an existing file on disk.
@@ -506,7 +580,7 @@ impl DeltaLayer {
                file_size,
            ),
            access_stats,
-            inner: OnceCell::new(),
+            inner: once_cell::sync::OnceCell::new(),
        }
    }

@@ -533,7 +607,7 @@ impl DeltaLayer {
                metadata.len(),
            ),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: OnceCell::new(),
+            inner: once_cell::sync::OnceCell::new(),
        })
    }

@@ -549,23 +623,6 @@ impl DeltaLayer {
            &self.layer_name(),
        )
    }
-    /// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
-    ///
-    /// The value can be obtained via the [`ValueRef::load`] function.
-    pub(crate) async fn load_keys(
-        &self,
-        ctx: &RequestContext,
-    ) -> Result<Vec<DeltaEntry<Ref<&'_ DeltaLayerInner>>>> {
-        let inner = self
-            .load(LayerAccessKind::KeyIter, ctx)
-            .await
-            .context("load delta layer keys")?;
-
-        let inner = Ref(&**inner);
-        DeltaLayerInner::load_keys(&inner)
-            .await
-            .context("Layer index is corrupted")
-    }
 }

 /// A builder object for constructing a new delta layer.
@@ -701,17 +758,6 @@ impl DeltaLayerWriterInner {
            .metadata()
            .context("get file metadata to determine size")?;

-        // 5GB limit for objects without multipart upload (which we don't want to use)
-        // Make it a little bit below to account for differing GB units
-        // https://docs.aws.amazon.com/AmazonS3/latest/userguide/upload-objects.html
-        const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
-        ensure!(
-            metadata.len() <= S3_UPLOAD_LIMIT,
-            "Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
-            file.path.display(),
-            metadata.len()
-        );
-
        // Note: Because we opened the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
@@ -725,7 +771,7 @@ impl DeltaLayerWriterInner {
                metadata.len(),
            ),
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
-            inner: OnceCell::new(),
+            inner: once_cell::sync::OnceCell::new(),
        };

        // fsync the file
@@ -847,213 +893,168 @@ impl Drop for DeltaLayerWriter {
    }
 }

-impl DeltaLayerInner {
-    pub(super) fn load(path: &std::path::Path, summary: Option<Summary>) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path)
-            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
-        let file = FileBlockReader::new(file);
+///
+/// Iterator over all key-value pairs stored in a delta layer
+///
+/// FIXME: This creates a Vector to hold the offsets of all key value pairs.
+/// That takes up quite a lot of memory. Should do this in a more streaming
+/// fashion.
+///
+struct DeltaValueIter<'a> {
+    all_offsets: Vec<(DeltaKey, BlobRef)>,
+    next_idx: usize,
+    reader: BlockCursor<Adapter<'a>>,
+}

-        let summary_blk = file.read_blk(0)?;
-        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+struct Adapter<'a>(&'a DeltaLayerInner);

-        if let Some(mut expected_summary) = summary {
-            // production code path
-            expected_summary.index_start_blk = actual_summary.index_start_blk;
-            expected_summary.index_root_blk = actual_summary.index_root_blk;
-            if actual_summary != expected_summary {
-                bail!(
-                    "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
-                    actual_summary,
-                    expected_summary
-                );
-            }
-        }
+impl<'a> BlockReader for Adapter<'a> {
+    type BlockLease = PageReadGuard<'static>;

-        Ok(DeltaLayerInner {
-            file,
-            index_start_blk: actual_summary.index_start_blk,
-            index_root_blk: actual_summary.index_root_blk,
-        })
+    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
+        self.0.file.read_blk(blknum)
    }
+}

-    pub(super) async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        lsn_range: Range<Lsn>,
-        reconstruct_state: &mut ValueReconstructState,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        let mut need_image = true;
-        // Scan the page versions backwards, starting from `lsn`.
-        let file = &self.file;
+impl<'a> Iterator for DeltaValueIter<'a> {
+    type Item = Result<(Key, Lsn, Value)>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.next_res().transpose()
+    }
+}
+
+impl<'a> DeltaValueIter<'a> {
+    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
+        let file = &inner.file;
        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            self.index_start_blk,
-            self.index_root_blk,
+            inner.index_start_blk,
+            inner.index_root_blk,
            file,
        );
-        let search_key = DeltaKey::from_key_lsn(&key, Lsn(lsn_range.end.0 - 1));

-        let mut offsets: Vec<(Lsn, u64)> = Vec::new();
+        let mut all_offsets: Vec<(DeltaKey, BlobRef)> = Vec::new();
+        tree_reader.visit(
+            &[0u8; DELTA_KEY_SIZE],
+            VisitDirection::Forwards,
+            |key, value| {
+                all_offsets.push((DeltaKey::from_slice(key), BlobRef(value)));
+                true
+            },
+        )?;

-        tree_reader
-            .visit(&search_key.0, VisitDirection::Backwards, |key, value| {
-                let blob_ref = BlobRef(value);
-                if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
-                    return false;
-                }
-                let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
-                if entry_lsn < lsn_range.start {
-                    return false;
-                }
-                offsets.push((entry_lsn, blob_ref.pos()));
+        let iter = DeltaValueIter {
+            all_offsets,
+            next_idx: 0,
+            reader: BlockCursor::new(Adapter(inner)),
+        };

-                !blob_ref.will_init()
-            })
-            .await?;
+        Ok(iter)
+    }

-        // Ok, 'offsets' now contains the offsets of all the entries we need to read
-        let cursor = file.block_cursor();
-        let mut buf = Vec::new();
-        for (entry_lsn, pos) in offsets {
-            cursor
-                .read_blob_into_buf(pos, &mut buf)
-                .await
-                .with_context(|| {
-                    format!(
-                        "Failed to read blob from virtual file {}",
-                        file.file.path.display()
-                    )
-                })?;
-            let val = Value::des(&buf).with_context(|| {
-                format!(
-                    "Failed to deserialize file blob from virtual file {}",
-                    file.file.path.display()
-                )
-            })?;
-            match val {
-                Value::Image(img) => {
-                    reconstruct_state.img = Some((entry_lsn, img));
-                    need_image = false;
-                    break;
-                }
-                Value::WalRecord(rec) => {
-                    let will_init = rec.will_init();
-                    reconstruct_state.records.push((entry_lsn, rec));
-                    if will_init {
-                        // This WAL record initializes the page, so no need to go further back
-                        need_image = false;
-                        break;
-                    }
-                }
-            }
-        }
+    fn next_res(&mut self) -> Result<Option<(Key, Lsn, Value)>> {
+        if self.next_idx < self.all_offsets.len() {
+            let (delta_key, blob_ref) = &self.all_offsets[self.next_idx];

-        // If an older page image is needed to reconstruct the page, let the
-        // caller know.
-        if need_image {
-            Ok(ValueReconstructResult::Continue)
+            let key = delta_key.key();
+            let lsn = delta_key.lsn();
+
+            let buf = self.reader.read_blob(blob_ref.pos())?;
+            let val = Value::des(&buf)?;
+            self.next_idx += 1;
+            Ok(Some((key, lsn, val)))
        } else {
-            Ok(ValueReconstructResult::Complete)
+            Ok(None)
        }
    }
-
-    pub(super) async fn load_keys<T: AsRef<DeltaLayerInner> + Clone>(
-        this: &T,
-    ) -> Result<Vec<DeltaEntry<T>>> {
-        let dl = this.as_ref();
-        let file = &dl.file;
-
-        let tree_reader =
-            DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);
-
-        let mut all_keys: Vec<DeltaEntry<T>> = Vec::new();
-
-        tree_reader
-            .visit(
-                &[0u8; DELTA_KEY_SIZE],
-                VisitDirection::Forwards,
-                |key, value| {
-                    let delta_key = DeltaKey::from_slice(key);
-                    let val_ref = ValueRef {
-                        blob_ref: BlobRef(value),
-                        reader: BlockCursor::new(Adapter(this.clone())),
-                    };
-                    let pos = BlobRef(value).pos();
-                    if let Some(last) = all_keys.last_mut() {
-                        // subtract offset of the current and last entries to get the size
-                        // of the value associated with this (key, lsn) tuple
-                        let first_pos = last.size;
-                        last.size = pos - first_pos;
-                    }
-                    let entry = DeltaEntry {
-                        key: delta_key.key(),
-                        lsn: delta_key.lsn(),
-                        size: pos,
-                        val: val_ref,
-                    };
-                    all_keys.push(entry);
-                    true
-                },
-            )
-            .await?;
-        if let Some(last) = all_keys.last_mut() {
-            // Last key occupies all space till end of value storage,
-            // which corresponds to beginning of the index
-            last.size = dl.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
-        }
-        Ok(all_keys)
-    }
 }
-
-/// Cloneable borrow wrapper to make borrows behave like smart pointers.
 ///
-/// Shared references are trivially copyable. This wrapper avoids (confusion) to otherwise attempt
-/// cloning DeltaLayerInner.
-pub(crate) struct Ref<T>(T);
+/// Iterator over all keys stored in a delta layer
+///
+/// FIXME: This creates a Vector to hold all keys.
+/// That takes up quite a lot of memory. Should do this in a more streaming
+/// fashion.
+///
+struct DeltaKeyIter {
+    all_keys: Vec<(DeltaKey, u64)>,
+    next_idx: usize,
+}

-impl<'a, T> AsRef<T> for Ref<&'a T> {
-    fn as_ref(&self) -> &T {
-        self.0
+impl Iterator for DeltaKeyIter {
+    type Item = (Key, Lsn, u64);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.next_idx < self.all_keys.len() {
+            let (delta_key, size) = &self.all_keys[self.next_idx];
+
+            let key = delta_key.key();
+            let lsn = delta_key.lsn();
+
+            self.next_idx += 1;
+            Some((key, lsn, *size))
+        } else {
+            None
+        }
    }
 }

-impl<'a, T> Clone for Ref<&'a T> {
-    fn clone(&self) -> Self {
-        *self
+impl<'a> DeltaKeyIter {
+    fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
+        let file = &inner.file;
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            inner.index_start_blk,
+            inner.index_root_blk,
+            file,
+        );
+
+        let mut all_keys: Vec<(DeltaKey, u64)> = Vec::new();
+        tree_reader.visit(
+            &[0u8; DELTA_KEY_SIZE],
+            VisitDirection::Forwards,
+            |key, value| {
+                let delta_key = DeltaKey::from_slice(key);
+                let pos = BlobRef(value).pos();
+                if let Some(last) = all_keys.last_mut() {
+                    if last.0.key() == delta_key.key() {
+                        return true;
+                    } else {
+                        // subtract offset of new key BLOB and first blob of this key
+                        // to get total size of values associated with this key
+                        let first_pos = last.1;
+                        last.1 = pos - first_pos;
+                    }
+                }
+                all_keys.push((delta_key, pos));
+                true
+            },
+        )?;
+        if let Some(last) = all_keys.last_mut() {
+            // Last key occupies all space till end of layer
+            last.1 = std::fs::metadata(&file.file.path)?.len() - last.1;
+        }
+        let iter = DeltaKeyIter {
+            all_keys,
+            next_idx: 0,
+        };
+
+        Ok(iter)
    }
 }

-impl<'a, T> Copy for Ref<&'a T> {}
+#[cfg(test)]
+mod test {
+    use super::DeltaKeyIter;
+    use super::DeltaLayer;
+    use super::DeltaValueIter;

-/// A set of data associated with a delta layer key and its value
-pub struct DeltaEntry<T: AsRef<DeltaLayerInner>> {
-    pub key: Key,
-    pub lsn: Lsn,
-    /// Size of the stored value
-    pub size: u64,
-    /// Reference to the on-disk value
-    pub val: ValueRef<T>,
-}
-
-/// Reference to an on-disk value
-pub struct ValueRef<T: AsRef<DeltaLayerInner>> {
-    blob_ref: BlobRef,
-    reader: BlockCursor<Adapter<T>>,
-}
-
-impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
-    /// Loads the value from disk
-    pub async fn load(&self) -> Result<Value> {
-        // theoretically we *could* record an access time for each, but it does not really matter
-        let buf = self.reader.read_blob(self.blob_ref.pos()).await?;
-        let val = Value::des(&buf)?;
-        Ok(val)
-    }
-}
-
-struct Adapter<T: AsRef<DeltaLayerInner>>(T);
-
-impl<T: AsRef<DeltaLayerInner>> BlockReader for Adapter<T> {
-    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
-        self.0.as_ref().file.read_blk(blknum)
+    // We will soon need the iters to be `Send` in the compaction code.
+    // Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
+    // Cf https://github.com/neondatabase/neon/issues/4471
+    #[test]
+    fn is_send() {
+        fn assert_send<T: Send>() {}
+        assert_send::<DeltaLayer>();
+        assert_send::<DeltaValueIter>();
+        assert_send::<DeltaKeyIter>();
    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -38,6 +38,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
 use hex;
+use once_cell::sync::OnceCell;
 use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
@@ -47,7 +48,6 @@ use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::path::{Path, PathBuf};
-use tokio::sync::OnceCell;
 use tracing::*;

 use utils::{
@@ -57,7 +57,9 @@ use utils::{
 };

 use super::filename::ImageFileName;
-use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc};
+use super::{
+    AsLayerDesc, Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc,
+};

 ///
 /// Header stored in the beginning of the file
@@ -66,7 +68,7 @@ use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLay
 /// the 'index' starts at the block indicated by 'index_start_blk'
 ///
 #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
-pub(super) struct Summary {
+struct Summary {
    /// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
    magic: u16,
    format_version: u16,
@@ -85,29 +87,13 @@ pub(super) struct Summary {

 impl From<&ImageLayer> for Summary {
    fn from(layer: &ImageLayer) -> Self {
-        Self::expected(
-            layer.desc.tenant_id,
-            layer.desc.timeline_id,
-            layer.desc.key_range.clone(),
-            layer.lsn,
-        )
-    }
-}
-
-impl Summary {
-    pub(super) fn expected(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        key_range: Range<Key>,
-        lsn: Lsn,
-    ) -> Self {
        Self {
            magic: IMAGE_FILE_MAGIC,
            format_version: STORAGE_FORMAT_VERSION,
-            tenant_id,
-            timeline_id,
-            key_range,
-            lsn,
+            tenant_id: layer.desc.tenant_id,
+            timeline_id: layer.desc.timeline_id,
+            key_range: layer.desc.key_range.clone(),
+            lsn: layer.lsn,

            index_start_blk: 0,
            index_root_blk: 0,
@@ -152,8 +138,6 @@ pub struct ImageLayerInner {
    index_start_blk: u32,
    index_root_blk: u32,

-    lsn: Lsn,
-
    /// Reader object for reading blocks from the file.
    file: FileBlockReader<VirtualFile>,
 }
@@ -186,19 +170,17 @@ impl Layer for ImageLayer {
            return Ok(());
        }

-        let inner = self.load(LayerAccessKind::Dump, ctx).await?;
+        let inner = self.load(LayerAccessKind::Dump, ctx)?;
        let file = &inner.file;
        let tree_reader =
            DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);

-        tree_reader.dump().await?;
+        tree_reader.dump()?;

-        tree_reader
-            .visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
-                println!("key: {} offset {}", hex::encode(key), value);
-                true
-            })
-            .await?;
+        tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
+            println!("key: {} offset {}", hex::encode(key), value);
+            true
+        })?;

        Ok(())
    }
@@ -215,14 +197,28 @@ impl Layer for ImageLayer {
        assert!(lsn_range.start >= self.lsn);
        assert!(lsn_range.end >= self.lsn);

-        let inner = self
-            .load(LayerAccessKind::GetValueReconstructData, ctx)
-            .await?;
-        inner
-            .get_value_reconstruct_data(key, reconstruct_state)
-            .await
-            // FIXME: makes no sense to dump paths
-            .with_context(|| format!("read {}", self.path().display()))
+        let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
+
+        let file = &inner.file;
+        let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
+
+        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
+        key.write_to_byte_slice(&mut keybuf);
+        if let Some(offset) = tree_reader.get(&keybuf)? {
+            let blob = file.block_cursor().read_blob(offset).with_context(|| {
+                format!(
+                    "failed to read value from data file {} at offset {}",
+                    self.path().display(),
+                    offset
+                )
+            })?;
+            let value = Bytes::from(blob);
+
+            reconstruct_state.img = Some((self.lsn, value));
+            Ok(ValueReconstructResult::Complete)
+        } else {
+            Ok(ValueReconstructResult::Missing)
+        }
    }

    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
@@ -259,6 +255,10 @@ impl PersistentLayer for ImageLayer {
        Some(self.path())
    }

+    fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
+        unimplemented!();
+    }
+
    fn delete_resident_layer_file(&self) -> Result<()> {
        // delete underlying file
        fs::remove_file(self.path())?;
@@ -318,41 +318,58 @@ impl ImageLayer {
    /// Open the underlying file and read the metadata into memory, if it's
    /// not loaded already.
    ///
-    async fn load(
-        &self,
-        access_kind: LayerAccessKind,
-        ctx: &RequestContext,
-    ) -> Result<&ImageLayerInner> {
-        self.access_stats.record_access(access_kind, ctx);
-        self.inner
-            .get_or_try_init(|| self.load_inner())
-            .await
-            .with_context(|| format!("Failed to load image layer {}", self.path().display()))
+    fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&ImageLayerInner> {
+        self.access_stats
+            .record_access(access_kind, ctx.task_kind());
+        loop {
+            if let Some(inner) = self.inner.get() {
+                return Ok(inner);
+            }
+            self.inner
+                .get_or_try_init(|| self.load_inner())
+                .with_context(|| format!("Failed to load image layer {}", self.path().display()))?;
+        }
    }

-    async fn load_inner(&self) -> Result<ImageLayerInner> {
+    fn load_inner(&self) -> Result<ImageLayerInner> {
        let path = self.path();

-        let expected_summary = match &self.path_or_conf {
-            PathOrConf::Conf(_) => Some(Summary::from(self)),
-            PathOrConf::Path(_) => None,
-        };
+        // Open the file and read the summary from its first block.
+        let file = VirtualFile::open(&path)
+            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
+        let file = FileBlockReader::new(file);
+        let summary_blk = file.read_blk(0)?;
+        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;

-        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary)?;
+        match &self.path_or_conf {
+            PathOrConf::Conf(_) => {
+                let mut expected_summary = Summary::from(self);
+                expected_summary.index_start_blk = actual_summary.index_start_blk;
+                expected_summary.index_root_blk = actual_summary.index_root_blk;

-        if let PathOrConf::Path(ref path) = self.path_or_conf {
-            // not production code
-            let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
-            let expected_filename = self.filename().file_name();
+                if actual_summary != expected_summary {
+                    bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary);
+                }
+            }
+            PathOrConf::Path(path) => {
+                let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
+                let expected_filename = self.filename().file_name();

-            if actual_filename != expected_filename {
-                println!("warning: filename does not match what is expected from in-file summary");
-                println!("actual: {:?}", actual_filename);
-                println!("expected: {:?}", expected_filename);
+                if actual_filename != expected_filename {
+                    println!(
+                        "warning: filename does not match what is expected from in-file summary"
+                    );
+                    println!("actual: {:?}", actual_filename);
+                    println!("expected: {:?}", expected_filename);
+                }
            }
        }

-        Ok(loaded)
+        Ok(ImageLayerInner {
+            index_start_blk: actual_summary.index_start_blk,
+            index_root_blk: actual_summary.index_root_blk,
+            file,
+        })
    }

    /// Create an ImageLayer struct representing an existing file on disk
@@ -422,66 +439,6 @@ impl ImageLayer {
    }
 }

-impl ImageLayerInner {
-    pub(super) fn load(
-        path: &std::path::Path,
-        lsn: Lsn,
-        summary: Option<Summary>,
-    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path)
-            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
-        let file = FileBlockReader::new(file);
-        let summary_blk = file.read_blk(0)?;
-        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
-
-        if let Some(mut expected_summary) = summary {
-            // production code path
-            expected_summary.index_start_blk = actual_summary.index_start_blk;
-            expected_summary.index_root_blk = actual_summary.index_root_blk;
-
-            if actual_summary != expected_summary {
-                bail!(
-                    "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
-                    actual_summary,
-                    expected_summary
-                );
-            }
-        }
-
-        Ok(ImageLayerInner {
-            index_start_blk: actual_summary.index_start_blk,
-            index_root_blk: actual_summary.index_root_blk,
-            lsn,
-            file,
-        })
-    }
-
-    pub(super) async fn get_value_reconstruct_data(
-        &self,
-        key: Key,
-        reconstruct_state: &mut ValueReconstructState,
-    ) -> anyhow::Result<ValueReconstructResult> {
-        let file = &self.file;
-        let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
-
-        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
-        key.write_to_byte_slice(&mut keybuf);
-        if let Some(offset) = tree_reader.get(&keybuf).await? {
-            let blob = file
-                .block_cursor()
-                .read_blob(offset)
-                .await
-                .with_context(|| format!("failed to read value from offset {}", offset))?;
-            let value = Bytes::from(blob);
-
-            reconstruct_state.img = Some((self.lsn, value));
-            Ok(ValueReconstructResult::Complete)
-        } else {
-            Ok(ValueReconstructResult::Missing)
-        }
-    }
-}
-
 /// A builder object for constructing a new image layer.
 ///
 /// Usage:
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -16,7 +16,6 @@ use anyhow::{ensure, Result};
 use pageserver_api::models::InMemoryLayerInfo;
 use std::cell::RefCell;
 use std::collections::HashMap;
-use std::sync::OnceLock;
 use tracing::*;
 use utils::{
    bin_ser::BeSer,
@@ -28,7 +27,7 @@ use utils::{
 // while being able to use std::fmt::Write's methods
 use std::fmt::Write as _;
 use std::ops::Range;
-use tokio::sync::RwLock;
+use std::sync::RwLock;

 use super::{DeltaLayer, DeltaLayerWriter, Layer};

@@ -43,16 +42,14 @@ pub struct InMemoryLayer {
    tenant_id: TenantId,
    timeline_id: TimelineId,

+    ///
    /// This layer contains all the changes from 'start_lsn'. The
    /// start is inclusive.
+    ///
    start_lsn: Lsn,

-    /// Frozen layers have an exclusive end LSN.
-    /// Writes are only allowed when this is `None`.
-    end_lsn: OnceLock<Lsn>,
-
-    /// The above fields never change, except for `end_lsn`, which is only set once.
-    /// All other changing parts are in `inner`, and protected by a mutex.
+    /// The above fields never change. The parts that do change are in 'inner',
+    /// and protected by an `RwLock`.
    inner: RwLock<InMemoryLayerInner>,
 }

@@ -60,16 +57,21 @@ impl std::fmt::Debug for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("InMemoryLayer")
            .field("start_lsn", &self.start_lsn)
-            .field("end_lsn", &self.end_lsn)
            .field("inner", &self.inner)
            .finish()
    }
 }

 pub struct InMemoryLayerInner {
+    /// Frozen layers have an exclusive end LSN.
+    /// Writes are only allowed when this is None
+    end_lsn: Option<Lsn>,
+
+    ///
    /// All versions of all pages in the layer are kept here.  Indexed
    /// by block number and LSN. The value is an offset into the
    /// ephemeral file where the page version is stored.
+    ///
    index: HashMap<Key, VecMap<Lsn, u64>>,

    /// The values are stored in a serialized format in this file.
@@ -80,7 +82,15 @@ pub struct InMemoryLayerInner {

 impl std::fmt::Debug for InMemoryLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("InMemoryLayerInner").finish()
+        f.debug_struct("InMemoryLayerInner")
+            .field("end_lsn", &self.end_lsn)
+            .finish()
+    }
+}
+
+impl InMemoryLayerInner {
+    fn assert_writeable(&self) {
+        assert!(self.end_lsn.is_none());
    }
 }

@@ -91,21 +101,13 @@ impl InMemoryLayer {

    pub fn info(&self) -> InMemoryLayerInfo {
        let lsn_start = self.start_lsn;
+        let lsn_end = self.inner.read().unwrap().end_lsn;

-        if let Some(&lsn_end) = self.end_lsn.get() {
-            InMemoryLayerInfo::Frozen { lsn_start, lsn_end }
-        } else {
-            InMemoryLayerInfo::Open { lsn_start }
+        match lsn_end {
+            Some(lsn_end) => InMemoryLayerInfo::Frozen { lsn_start, lsn_end },
+            None => InMemoryLayerInfo::Open { lsn_start },
        }
    }
-
-    fn assert_writable(&self) {
-        assert!(self.end_lsn.get().is_none());
-    }
-
-    fn end_lsn_or_max(&self) -> Lsn {
-        self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
-    }
 }

 #[async_trait::async_trait]
@@ -115,7 +117,14 @@ impl Layer for InMemoryLayer {
    }

    fn get_lsn_range(&self) -> Range<Lsn> {
-        self.start_lsn..self.end_lsn_or_max()
+        let inner = self.inner.read().unwrap();
+
+        let end_lsn = if let Some(end_lsn) = inner.end_lsn {
+            end_lsn
+        } else {
+            Lsn(u64::MAX)
+        };
+        self.start_lsn..end_lsn
    }

    fn is_incremental(&self) -> bool {
@@ -125,9 +134,13 @@ impl Layer for InMemoryLayer {

    /// debugging function to print out the contents of the layer
    async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
-        let inner = self.inner.read().await;
+        let inner = self.inner.read().unwrap();

-        let end_str = self.end_lsn_or_max();
+        let end_str = inner
+            .end_lsn
+            .as_ref()
+            .map(Lsn::to_string)
+            .unwrap_or_default();

        println!(
            "----- in-memory layer for tli {} LSNs {}-{} ----",
@@ -138,12 +151,12 @@ impl Layer for InMemoryLayer {
            return Ok(());
        }

-        let cursor = inner.file.block_cursor();
+        let mut cursor = inner.file.block_cursor();
        let mut buf = Vec::new();
        for (key, vec_map) in inner.index.iter() {
            for (lsn, pos) in vec_map.as_slice() {
                let mut desc = String::new();
-                cursor.read_blob_into_buf(*pos, &mut buf).await?;
+                cursor.read_blob_into_buf(*pos, &mut buf)?;
                let val = Value::des(&buf);
                match val {
                    Ok(Value::Image(img)) => {
@@ -181,15 +194,15 @@ impl Layer for InMemoryLayer {
        ensure!(lsn_range.start >= self.start_lsn);
        let mut need_image = true;

-        let inner = self.inner.read().await;
+        let inner = self.inner.read().unwrap();

-        let reader = inner.file.block_cursor();
+        let mut reader = inner.file.block_cursor();

        // Scan the page versions backwards, starting from `lsn`.
        if let Some(vec_map) = inner.index.get(&key) {
            let slice = vec_map.slice_range(lsn_range);
            for (entry_lsn, pos) in slice.iter().rev() {
-                let buf = reader.read_blob(*pos).await?;
+                let buf = reader.read_blob(*pos)?;
                let value = Value::des(&buf)?;
                match value {
                    Value::Image(img) => {
@@ -223,7 +236,9 @@ impl Layer for InMemoryLayer {

 impl std::fmt::Display for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let end_lsn = self.end_lsn_or_max();
+        let inner = self.inner.read().unwrap();
+
+        let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
        write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
    }
 }
@@ -232,8 +247,8 @@ impl InMemoryLayer {
    ///
    /// Get layer size on the disk
    ///
-    pub async fn size(&self) -> Result<u64> {
-        let inner = self.inner.read().await;
+    pub fn size(&self) -> Result<u64> {
+        let inner = self.inner.read().unwrap();
        Ok(inner.file.size)
    }

@@ -255,8 +270,8 @@ impl InMemoryLayer {
            timeline_id,
            tenant_id,
            start_lsn,
-            end_lsn: OnceLock::new(),
            inner: RwLock::new(InMemoryLayerInner {
+                end_lsn: None,
                index: HashMap::new(),
                file,
            }),
@@ -267,10 +282,10 @@ impl InMemoryLayer {

    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
-    pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
+    pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
-        let mut inner = self.inner.write().await;
-        self.assert_writable();
+        let mut inner = self.inner.write().unwrap();
+        inner.assert_writeable();

        let off = {
            SER_BUFFER.with(|x| -> Result<_> {
@@ -301,11 +316,11 @@ impl InMemoryLayer {
    /// Make the layer non-writeable. Only call once.
    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is exclusive
-    pub async fn freeze(&self, end_lsn: Lsn) {
-        let inner = self.inner.write().await;
+    pub fn freeze(&self, end_lsn: Lsn) {
+        let mut inner = self.inner.write().unwrap();

        assert!(self.start_lsn < end_lsn);
-        self.end_lsn.set(end_lsn).expect("end_lsn set only once");
+        inner.end_lsn = Some(end_lsn);

        for vec_map in inner.index.values() {
            for (lsn, _pos) in vec_map.as_slice() {
@@ -317,7 +332,7 @@ impl InMemoryLayer {
    /// Write this frozen in-memory layer to disk.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
-    pub async fn write_to_disk(&self) -> Result<DeltaLayer> {
+    pub fn write_to_disk(&self) -> Result<DeltaLayer> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -327,21 +342,19 @@ impl InMemoryLayer {
        // lock, it will see that it's not writeable anymore and retry, but it
        // would have to wait until we release it. That race condition is very
        // rare though, so we just accept the potential latency hit for now.
-        let inner = self.inner.read().await;
-
-        let end_lsn = *self.end_lsn.get().unwrap();
+        let inner = self.inner.read().unwrap();

        let mut delta_layer_writer = DeltaLayerWriter::new(
            self.conf,
            self.timeline_id,
            self.tenant_id,
            Key::MIN,
-            self.start_lsn..end_lsn,
+            self.start_lsn..inner.end_lsn.unwrap(),
        )?;

        let mut buf = Vec::new();

-        let cursor = inner.file.block_cursor();
+        let mut cursor = inner.file.block_cursor();

        let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
        keys.sort_by_key(|k| k.0);
@@ -350,7 +363,7 @@ impl InMemoryLayer {
            let key = **key;
            // Write all page versions
            for (lsn, pos) in vec_map.as_slice() {
-                cursor.read_blob_into_buf(*pos, &mut buf).await?;
+                cursor.read_blob_into_buf(*pos, &mut buf)?;
                let will_init = Value::des(&buf)?.will_init();
                delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?;
            }
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -20,8 +20,8 @@ use utils::{

 use super::filename::{DeltaFileName, ImageFileName};
 use super::{
-    AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset,
-    LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
+    AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter,
+    LayerKeyIter, LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
 };

 /// RemoteLayer is a not yet downloaded [`ImageLayer`] or
@@ -129,6 +129,14 @@ impl PersistentLayer for RemoteLayer {
        None
    }

+    fn iter(&self, _ctx: &RequestContext) -> Result<LayerIter<'_>> {
+        bail!("cannot iterate a remote layer");
+    }
+
+    fn key_iter(&self, _ctx: &RequestContext) -> Result<LayerKeyIter<'_>> {
+        bail!("cannot iterate a remote layer");
+    }
+
    fn delete_resident_layer_file(&self) -> Result<()> {
        bail!("remote layer has no layer file");
    }
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -73,13 +73,17 @@ pub fn start_background_loops(
 ///
 async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    let wait_duration = Duration::from_secs(2);
+    info!("starting");
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
        let mut first = true;
        loop {
+            trace!("waking up");
+
            tokio::select! {
                _ = cancel.cancelled() => {
+                    info!("received cancellation request");
                    return;
                },
                tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
@@ -122,12 +126,15 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                .await
                .is_ok()
            {
+                info!("received cancellation request during idling");
                break;
            }
        }
    }
    .await;
    TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
+
+    trace!("compaction loop stopped.");
 }

 ///
@@ -135,6 +142,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
 ///
 async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    let wait_duration = Duration::from_secs(2);
+    info!("starting");
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        // GC might require downloading, to find the cutoff LSN that corresponds to the
@@ -143,8 +151,11 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
        let mut first = true;
        loop {
+            trace!("waking up");
+
            tokio::select! {
                _ = cancel.cancelled() => {
+                    info!("received cancellation request");
                    return;
                },
                tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result {
@@ -189,12 +200,14 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                .await
                .is_ok()
            {
+                info!("received cancellation request during idling");
                break;
            }
        }
    }
    .await;
    TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
+    trace!("GC loop stopped.");
 }

 async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
@@ -219,6 +232,7 @@ async fn wait_for_active_tenant(tenant: &Arc<Tenant>) -> ControlFlow<()> {
                    }
                }
                Err(_sender_dropped_error) => {
+                    info!("Tenant dropped the state updates sender, quitting waiting for tenant and the task loop");
                    return ControlFlow::Break(());
                }
            }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,4 +1,3 @@
-pub mod delete;
 mod eviction_task;
 pub mod layer_manager;
 mod logical_size;
@@ -19,7 +18,6 @@ use pageserver_api::models::{
 use remote_storage::GenericRemoteStorage;
 use serde_with::serde_as;
 use storage_broker::BrokerClientChannel;
-use tokio::runtime::Handle;
 use tokio::sync::{oneshot, watch, TryAcquireError};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -35,11 +33,8 @@ use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};

-use crate::context::{
-    AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
-};
+use crate::context::{DownloadBehavior, RequestContext};
 use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
-use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::{
    DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer,
    LayerAccessStats, LayerFileName, RemoteLayer,
@@ -84,7 +79,6 @@ use crate::METADATA_FILE_NAME;
 use crate::ZERO_PAGE;
 use crate::{is_temporary, task_mgr};

-use self::delete::DeleteTimelineFlow;
 pub(super) use self::eviction_task::EvictionTaskTenantState;
 use self::eviction_task::EvictionTaskTimelineState;
 use self::layer_manager::LayerManager;
@@ -92,6 +86,7 @@ use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

 use super::config::TenantConf;
+use super::delete::DeleteTimelineFlow;
 use super::remote_timeline_client::index::IndexPart;
 use super::remote_timeline_client::RemoteTimelineClient;
 use super::storage_layer::{
@@ -298,10 +293,6 @@ pub struct Timeline {
    /// Completion shared between all timelines loaded during startup; used to delay heavier
    /// background tasks until some logical sizes have been calculated.
    initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
-
-    /// Load or creation time information about the disk_consistent_lsn and when the loading
-    /// happened. Used for consumption metrics.
-    pub(crate) loaded_at: (Lsn, SystemTime),
 }

 pub struct WalReceiverInfo {
@@ -532,7 +523,7 @@ impl Timeline {
        size
    }

-    pub fn resident_physical_size(&self) -> u64 {
+    pub fn get_resident_physical_size(&self) -> u64 {
        self.metrics.resident_physical_size_gauge.get()
    }

@@ -701,9 +692,6 @@ impl Timeline {
                Err(CompactionError::DownloadRequired(rls)) => {
                    anyhow::bail!("Compaction requires downloading multiple times (last was {} layers), possibly battling against eviction", rls.len())
                }
-                Err(CompactionError::ShuttingDown) => {
-                    return Ok(());
-                }
                Err(CompactionError::Other(e)) => {
                    return Err(e);
                }
@@ -785,8 +773,7 @@ impl Timeline {
        let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
        // Is the timeline being deleted?
        if self.is_stopping() {
-            trace!("Dropping out of compaction on timeline shutdown");
-            return Err(CompactionError::ShuttingDown);
+            return Err(anyhow::anyhow!("timeline is Stopping").into());
        }

        let target_file_size = self.get_checkpoint_distance();
@@ -802,15 +789,10 @@ impl Timeline {
            .await
        {
            Ok((partitioning, lsn)) => {
-                // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
-                let image_ctx = RequestContextBuilder::extend(ctx)
-                    .access_stats_behavior(AccessStatsBehavior::Skip)
-                    .build();
-
                // 2. Create new image layers for partitions that have been modified
                // "enough".
                let layer_paths_to_upload = self
-                    .create_image_layers(&partitioning, lsn, false, &image_ctx)
+                    .create_image_layers(&partitioning, lsn, false, ctx)
                    .await
                    .map_err(anyhow::Error::from)?;
                if let Some(remote_client) = &self.remote_client {
@@ -883,7 +865,7 @@ impl Timeline {
            let Some(open_layer) = layers.open_layer.as_ref() else {
                return Ok(());
            };
-            open_layer.size().await?
+            open_layer.size()?
        };
        let last_freeze_at = self.last_freeze_at.load();
        let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
@@ -927,7 +909,7 @@ impl Timeline {
    pub fn set_state(&self, new_state: TimelineState) {
        match (self.current_state(), new_state) {
            (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
-                info!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
+                warn!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
            }
            (st, TimelineState::Loading) => {
                error!("ignoring transition from {st:?} into Loading state");
@@ -1168,7 +1150,7 @@ impl Timeline {
            return Err(EvictionError::CannotEvictRemoteLayer);
        }

-        let layer_file_size = local_layer.layer_desc().file_size;
+        let layer_file_size = local_layer.file_size();

        let local_layer_mtime = local_layer
            .local_path()
@@ -1421,8 +1403,6 @@ impl Timeline {
                last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0),
                last_freeze_ts: RwLock::new(Instant::now()),

-                loaded_at: (disk_consistent_lsn, SystemTime::now()),
-
                ancestor_timeline: ancestor,
                ancestor_lsn: metadata.ancestor_lsn(),

@@ -1598,6 +1578,7 @@ impl Timeline {
    ///
    pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
        let mut guard = self.layers.write().await;
+        let mut num_layers = 0;

        let timer = self.metrics.load_layer_map_histo.start_timer();

@@ -1615,12 +1596,12 @@ impl Timeline {
            let fname = direntry.file_name();
            let fname = fname.to_string_lossy();

-            if let Some(filename) = ImageFileName::parse_str(&fname) {
+            if let Some(imgfilename) = ImageFileName::parse_str(&fname) {
                // create an ImageLayer struct for each image file.
-                if filename.lsn > disk_consistent_lsn {
-                    info!(
+                if imgfilename.lsn > disk_consistent_lsn {
+                    warn!(
                        "found future image layer {} on timeline {} disk_consistent_lsn is {}",
-                        filename, self.timeline_id, disk_consistent_lsn
+                        imgfilename, self.timeline_id, disk_consistent_lsn
                    );

                    rename_to_backup(&direntry_path)?;
@@ -1628,31 +1609,31 @@ impl Timeline {
                }

                let file_size = direntry_path.metadata()?.len();
-                let stats =
-                    LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);

                let layer = ImageLayer::new(
                    self.conf,
                    self.timeline_id,
                    self.tenant_id,
-                    &filename,
+                    &imgfilename,
                    file_size,
-                    stats,
+                    LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident),
                );

+                trace!("found layer {}", layer.path().display());
                total_physical_size += file_size;
                loaded_layers.push(Arc::new(layer));
-            } else if let Some(filename) = DeltaFileName::parse_str(&fname) {
+                num_layers += 1;
+            } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
                // Create a DeltaLayer struct for each delta file.
                // The end-LSN is exclusive, while disk_consistent_lsn is
                // inclusive. For example, if disk_consistent_lsn is 100, it is
                // OK for a delta layer to have end LSN 101, but if the end LSN
                // is 102, then it might not have been fully flushed to disk
                // before crash.
-                if filename.lsn_range.end > disk_consistent_lsn + 1 {
-                    info!(
+                if deltafilename.lsn_range.end > disk_consistent_lsn + 1 {
+                    warn!(
                        "found future delta layer {} on timeline {} disk_consistent_lsn is {}",
-                        filename, self.timeline_id, disk_consistent_lsn
+                        deltafilename, self.timeline_id, disk_consistent_lsn
                    );

                    rename_to_backup(&direntry_path)?;
@@ -1660,20 +1641,20 @@ impl Timeline {
                }

                let file_size = direntry_path.metadata()?.len();
-                let stats =
-                    LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident);

                let layer = DeltaLayer::new(
                    self.conf,
                    self.timeline_id,
                    self.tenant_id,
-                    &filename,
+                    &deltafilename,
                    file_size,
-                    stats,
+                    LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Resident),
                );

+                trace!("found layer {}", layer.path().display());
                total_physical_size += file_size;
                loaded_layers.push(Arc::new(layer));
+                num_layers += 1;
            } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
                // ignore these
            } else if remote_timeline_client::is_temp_download_file(&direntry_path) {
@@ -1698,7 +1679,6 @@ impl Timeline {
            }
        }

-        let num_layers = loaded_layers.len();
        guard.initialize_local_layers(loaded_layers, Lsn(disk_consistent_lsn.0) + 1);

        info!(
@@ -1793,21 +1773,19 @@ impl Timeline {
            match remote_layer_name {
                LayerFileName::Image(imgfilename) => {
                    if imgfilename.lsn > up_to_date_disk_consistent_lsn {
-                        info!(
+                        warn!(
                        "found future image layer {} on timeline {} remote_consistent_lsn is {}",
                        imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn
                    );
                        continue;
                    }
-                    let stats =
-                        LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);

                    let remote_layer = RemoteLayer::new_img(
                        self.tenant_id,
                        self.timeline_id,
                        imgfilename,
                        &remote_layer_metadata,
-                        stats,
+                        LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted),
                    );
                    let remote_layer = Arc::new(remote_layer);
                    added_remote_layers.push(remote_layer);
@@ -1820,21 +1798,18 @@ impl Timeline {
                    // is 102, then it might not have been fully flushed to disk
                    // before crash.
                    if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 {
-                        info!(
+                        warn!(
                            "found future delta layer {} on timeline {} remote_consistent_lsn is {}",
                            deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
                        );
                        continue;
                    }
-                    let stats =
-                        LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted);
-
                    let remote_layer = RemoteLayer::new_delta(
                        self.tenant_id,
                        self.timeline_id,
                        deltafilename,
                        &remote_layer_metadata,
-                        stats,
+                        LayerAccessStats::for_loading_layer(&guard, LayerResidenceStatus::Evicted),
                    );
                    let remote_layer = Arc::new(remote_layer);
                    added_remote_layers.push(remote_layer);
@@ -2282,16 +2257,15 @@ trait TraversalLayerExt {

 impl TraversalLayerExt for Arc<dyn PersistentLayer> {
    fn traversal_id(&self) -> TraversalId {
-        let timeline_id = self.layer_desc().timeline_id;
        match self.local_path() {
            Some(local_path) => {
-                debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", timeline_id)),
+                debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", self.get_timeline_id())),
                    "need timeline ID to uniquely identify the layer when traversal crosses ancestor boundary",
                );
                format!("{}", local_path.display())
            }
            None => {
-                format!("remote {}/{self}", timeline_id)
+                format!("remote {}/{self}", self.get_timeline_id())
            }
        }
    }
@@ -2655,7 +2629,7 @@ impl Timeline {
    async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> {
        //info!("PUT: key {} at {}", key, lsn);
        let layer = self.get_layer_for_write(lsn).await?;
-        layer.put_value(key, lsn, val).await?;
+        layer.put_value(key, lsn, val)?;
        Ok(())
    }

@@ -2681,9 +2655,7 @@ impl Timeline {
            Some(self.write_lock.lock().await)
        };
        let mut guard = self.layers.write().await;
-        guard
-            .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at)
-            .await;
+        guard.try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at);
    }

    /// Layer flusher task's main loop.
@@ -2829,10 +2801,7 @@ impl Timeline {
                // We will remove frozen layer and add delta layer in one atomic operation later.
                let layer = self.create_delta_layer(&frozen_layer).await?;
                (
-                    HashMap::from([(
-                        layer.filename(),
-                        LayerFileMetadata::new(layer.layer_desc().file_size),
-                    )]),
+                    HashMap::from([(layer.filename(), LayerFileMetadata::new(layer.file_size()))]),
                    Some(layer),
                )
            };
@@ -2852,7 +2821,7 @@ impl Timeline {
                );

                // update metrics
-                let sz = l.layer_desc().file_size;
+                let sz = l.file_size();
                self.metrics.resident_physical_size_gauge.add(sz);
                self.metrics.num_persistent_files_created.inc_by(1);
                self.metrics.persistent_bytes_written.inc_by(sz);
@@ -2965,11 +2934,7 @@ impl Timeline {
            let frozen_layer = Arc::clone(frozen_layer);
            move || {
                // Write it out
-                // Keep this inside `spawn_blocking` and `Handle::current`
-                // as long as the write path is still sync and the read impl
-                // is still not fully async. Otherwise executor threads would
-                // be blocked.
-                let new_delta = Handle::current().block_on(frozen_layer.write_to_disk())?;
+                let new_delta = frozen_layer.write_to_disk()?;
                let new_delta_path = new_delta.path();

                // Sync it to disk.
@@ -3263,8 +3228,6 @@ enum CompactionError {
    /// This should not happen repeatedly, but will be retried once by top-level
    /// `Timeline::compact`.
    DownloadRequired(Vec<Arc<RemoteLayer>>),
-    /// The timeline or pageserver is shutting down
-    ShuttingDown,
    /// Compaction cannot be done right now; page reconstruction and so on.
    Other(anyhow::Error),
 }
@@ -3313,10 +3276,10 @@ struct CompactLevel0Phase1StatsBuilder {
    timeline_id: Option<TimelineId>,
    read_lock_acquisition_micros: DurationRecorder,
    read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
-    read_lock_held_key_sort_micros: DurationRecorder,
    read_lock_held_prerequisites_micros: DurationRecorder,
    read_lock_held_compute_holes_micros: DurationRecorder,
    read_lock_drop_micros: DurationRecorder,
+    prepare_iterators_micros: DurationRecorder,
    write_layer_files_micros: DurationRecorder,
    level0_deltas_count: Option<usize>,
    new_deltas_count: Option<usize>,
@@ -3333,10 +3296,10 @@ struct CompactLevel0Phase1Stats {
    timeline_id: TimelineId,
    read_lock_acquisition_micros: RecordedDuration,
    read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
-    read_lock_held_key_sort_micros: RecordedDuration,
    read_lock_held_prerequisites_micros: RecordedDuration,
    read_lock_held_compute_holes_micros: RecordedDuration,
    read_lock_drop_micros: RecordedDuration,
+    prepare_iterators_micros: RecordedDuration,
    write_layer_files_micros: RecordedDuration,
    level0_deltas_count: usize,
    new_deltas_count: usize,
@@ -3363,10 +3326,6 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
                .read_lock_held_spawn_blocking_startup_micros
                .into_recorded()
                .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?,
-            read_lock_held_key_sort_micros: value
-                .read_lock_held_key_sort_micros
-                .into_recorded()
-                .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?,
            read_lock_held_prerequisites_micros: value
                .read_lock_held_prerequisites_micros
                .into_recorded()
@@ -3379,6 +3338,10 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
                .read_lock_drop_micros
                .into_recorded()
                .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?,
+            prepare_iterators_micros: value
+                .prepare_iterators_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("prepare_iterators_micros not set"))?,
            write_layer_files_micros: value
                .write_layer_files_micros
                .into_recorded()
@@ -3475,14 +3438,14 @@ impl Timeline {
        // "gaps" in the sequence of level 0 files should only happen in case
        // of a crash, partial download from cloud storage, or something like
        // that, so it's not a big deal in practice.
-        level0_deltas.sort_by_key(|l| l.layer_desc().lsn_range.start);
+        level0_deltas.sort_by_key(|l| l.get_lsn_range().start);
        let mut level0_deltas_iter = level0_deltas.iter();

        let first_level0_delta = level0_deltas_iter.next().unwrap();
-        let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
+        let mut prev_lsn_end = first_level0_delta.get_lsn_range().end;
        let mut deltas_to_compact = vec![Arc::clone(first_level0_delta)];
        for l in level0_deltas_iter {
-            let lsn_range = &l.layer_desc().lsn_range;
+            let lsn_range = l.get_lsn_range();

            if lsn_range.start != prev_lsn_end {
                break;
@@ -3491,13 +3454,8 @@ impl Timeline {
            prev_lsn_end = lsn_range.end;
        }
        let lsn_range = Range {
-            start: deltas_to_compact
-                .first()
-                .unwrap()
-                .layer_desc()
-                .lsn_range
-                .start,
-            end: deltas_to_compact.last().unwrap().layer_desc().lsn_range.end,
+            start: deltas_to_compact.first().unwrap().get_lsn_range().start,
+            end: deltas_to_compact.last().unwrap().get_lsn_range().end,
        };

        let remotes = deltas_to_compact
@@ -3547,26 +3505,10 @@ impl Timeline {
        // min-heap (reserve space for one more element added before eviction)
        let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
        let mut prev: Option<Key> = None;
-
-        let mut all_keys = Vec::new();
-
-        let downcast_deltas: Vec<_> = deltas_to_compact
-            .iter()
-            .map(|l| l.clone().downcast_delta_layer().expect("delta layer"))
-            .collect();
-        for dl in downcast_deltas.iter() {
-            // TODO: replace this with an await once we fully go async
-            all_keys.extend(Handle::current().block_on(DeltaLayer::load_keys(dl, ctx))?);
-        }
-
-        // The current stdlib sorting implementation is designed in a way where it is
-        // particularly fast where the slice is made up of sorted sub-ranges.
-        all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
-
-        stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now();
-
-        for DeltaEntry { key: next_key, .. } in all_keys.iter() {
-            let next_key = *next_key;
+        for (next_key, _next_lsn, _size) in itertools::process_results(
+            deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
+            |iter_iter| iter_iter.kmerge_by(|a, b| a.0 < b.0),
+        )? {
            if let Some(prev_key) = prev {
                // just first fast filter
                if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
@@ -3589,7 +3531,8 @@ impl Timeline {
            }
            prev = Some(next_key.next());
        }
-        stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
+        stats.read_lock_held_compute_holes_micros =
+            stats.read_lock_held_prerequisites_micros.till_now();
        drop_rlock(guard);
        stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
        let mut holes = heap.into_vec();
@@ -3598,26 +3541,36 @@ impl Timeline {

        // This iterator walks through all key-value pairs from all the layers
        // we're compacting, in key, LSN order.
-        let all_values_iter = all_keys.iter();
+        let all_values_iter = itertools::process_results(
+            deltas_to_compact.iter().map(|l| l.iter(ctx)),
+            |iter_iter| {
+                iter_iter.kmerge_by(|a, b| {
+                    if let Ok((a_key, a_lsn, _)) = a {
+                        if let Ok((b_key, b_lsn, _)) = b {
+                            (a_key, a_lsn) < (b_key, b_lsn)
+                        } else {
+                            false
+                        }
+                    } else {
+                        true
+                    }
+                })
+            },
+        )?;

        // This iterator walks through all keys and is needed to calculate size used by each key
-        let mut all_keys_iter = all_keys
-            .iter()
-            .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size))
-            .coalesce(|mut prev, cur| {
-                // Coalesce keys that belong to the same key pair.
-                // This ensures that compaction doesn't put them
-                // into different layer files.
-                // Still limit this by the target file size,
-                // so that we keep the size of the files in
-                // check.
-                if prev.0 == cur.0 && prev.2 < target_file_size {
-                    prev.2 += cur.2;
-                    Ok(prev)
-                } else {
-                    Err((prev, cur))
-                }
-            });
+        let mut all_keys_iter = itertools::process_results(
+            deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
+            |iter_iter| {
+                iter_iter.kmerge_by(|a, b| {
+                    let (a_key, a_lsn, _) = a;
+                    let (b_key, b_lsn, _) = b;
+                    (a_key, a_lsn) < (b_key, b_lsn)
+                })
+            },
+        )?;
+
+        stats.prepare_iterators_micros = stats.read_lock_drop_micros.till_now();

        // Merge the contents of all the input delta layers into a new set
        // of delta layers, based on the current partitioning.
@@ -3669,127 +3622,104 @@ impl Timeline {
        let mut key_values_total_size = 0u64;
        let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
        let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
-
-        // TODO remove this block_on wrapper once we fully go async
-        Handle::current().block_on(async {
-            for &DeltaEntry {
-                key, lsn, ref val, ..
-            } in all_values_iter
-            {
-                let value = val.load().await?;
-                let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
-                // We need to check key boundaries once we reach next key or end of layer with the same key
-                if !same_key || lsn == dup_end_lsn {
-                    let mut next_key_size = 0u64;
-                    let is_dup_layer = dup_end_lsn.is_valid();
-                    dup_start_lsn = Lsn::INVALID;
-                    if !same_key {
-                        dup_end_lsn = Lsn::INVALID;
-                    }
-                    // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
-                    for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
-                        next_key_size = next_size;
-                        if key != next_key {
-                            if dup_end_lsn.is_valid() {
-                                // We are writting segment with duplicates:
-                                // place all remaining values of this key in separate segment
-                                dup_start_lsn = dup_end_lsn; // new segments starts where old stops
-                                dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
-                            }
-                            break;
-                        }
-                        key_values_total_size += next_size;
-                        // Check if it is time to split segment: if total keys size is larger than target file size.
-                        // We need to avoid generation of empty segments if next_size > target_file_size.
-                        if key_values_total_size > target_file_size && lsn != next_lsn {
-                            // Split key between multiple layers: such layer can contain only single key
-                            dup_start_lsn = if dup_end_lsn.is_valid() {
-                                dup_end_lsn // new segment with duplicates starts where old one stops
-                            } else {
-                                lsn // start with the first LSN for this key
-                            };
-                            dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
-                            break;
-                        }
-                    }
-                    // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
-                    if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
-                        dup_start_lsn = dup_end_lsn;
-                        dup_end_lsn = lsn_range.end;
-                    }
-                    if writer.is_some() {
-                        let written_size = writer.as_mut().unwrap().size();
-                        let contains_hole =
-                            next_hole < holes.len() && key >= holes[next_hole].key_range.end;
-                        // check if key cause layer overflow or contains hole...
-                        if is_dup_layer
-                            || dup_end_lsn.is_valid()
-                            || written_size + key_values_total_size > target_file_size
-                            || contains_hole
-                        {
-                            // ... if so, flush previous layer and prepare to write new one
-                            new_layers.push(Arc::new(
-                                writer.take().unwrap().finish(prev_key.unwrap().next())?,
-                            ));
-                            writer = None;
-
-                            if contains_hole {
-                                // skip hole
-                                next_hole += 1;
-                            }
-                        }
-                    }
-                    // Remember size of key value because at next iteration we will access next item
-                    key_values_total_size = next_key_size;
+        for x in all_values_iter {
+            let (key, lsn, value) = x?;
+            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
+            // We need to check key boundaries once we reach next key or end of layer with the same key
+            if !same_key || lsn == dup_end_lsn {
+                let mut next_key_size = 0u64;
+                let is_dup_layer = dup_end_lsn.is_valid();
+                dup_start_lsn = Lsn::INVALID;
+                if !same_key {
+                    dup_end_lsn = Lsn::INVALID;
                }
-                if writer.is_none() {
-                    // Create writer if not initiaized yet
-                    writer = Some(DeltaLayerWriter::new(
-                        self.conf,
-                        self.timeline_id,
-                        self.tenant_id,
-                        key,
+                // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
+                for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
+                    next_key_size = next_size;
+                    if key != next_key {
                        if dup_end_lsn.is_valid() {
-                            // this is a layer containing slice of values of the same key
-                            debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
-                            dup_start_lsn..dup_end_lsn
+                            // We are writing a segment with duplicates:
+                            // place all remaining values of this key in a separate segment
+                            dup_start_lsn = dup_end_lsn; // new segments starts where old stops
+                            dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
+                        }
+                        break;
+                    }
+                    key_values_total_size += next_size;
+                    // Check if it is time to split segment: if total keys size is larger than target file size.
+                    // We need to avoid generation of empty segments if next_size > target_file_size.
+                    if key_values_total_size > target_file_size && lsn != next_lsn {
+                        // Split key between multiple layers: such layer can contain only single key
+                        dup_start_lsn = if dup_end_lsn.is_valid() {
+                            dup_end_lsn // new segment with duplicates starts where old one stops
                        } else {
-                            debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
-                            lsn_range.clone()
-                        },
-                    )?);
+                            lsn // start with the first LSN for this key
+                        };
+                        dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
+                        break;
+                    }
                }
+                // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
+                if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
+                    dup_start_lsn = dup_end_lsn;
+                    dup_end_lsn = lsn_range.end;
+                }
+                if writer.is_some() {
+                    let written_size = writer.as_mut().unwrap().size();
+                    let contains_hole =
+                        next_hole < holes.len() && key >= holes[next_hole].key_range.end;
+                    // check if key causes layer overflow or contains hole...
+                    if is_dup_layer
+                        || dup_end_lsn.is_valid()
+                        || written_size + key_values_total_size > target_file_size
+                        || contains_hole
+                    {
+                        // ... if so, flush previous layer and prepare to write new one
+                        new_layers.push(Arc::new(
+                            writer.take().unwrap().finish(prev_key.unwrap().next())?,
+                        ));
+                        writer = None;

-                fail_point!("delta-layer-writer-fail-before-finish", |_| {
-                    Result::<_>::Err(anyhow::anyhow!(
-                        "failpoint delta-layer-writer-fail-before-finish"
-                    ))
-                });
-
-                writer.as_mut().unwrap().put_value(key, lsn, value)?;
-                prev_key = Some(key);
+                        if contains_hole {
+                            // skip hole
+                            next_hole += 1;
+                        }
+                    }
+                }
+                // Remember size of key value because at next iteration we will access next item
+                key_values_total_size = next_key_size;
            }
-            Ok(())
-        })?;
+            if writer.is_none() {
+                // Create writer if not initialized yet
+                writer = Some(DeltaLayerWriter::new(
+                    self.conf,
+                    self.timeline_id,
+                    self.tenant_id,
+                    key,
+                    if dup_end_lsn.is_valid() {
+                        // this is a layer containing slice of values of the same key
+                        debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
+                        dup_start_lsn..dup_end_lsn
+                    } else {
+                        debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
+                        lsn_range.clone()
+                    },
+                )?);
+            }
+
+            fail_point!("delta-layer-writer-fail-before-finish", |_| {
+                Err(anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into())
+            });
+
+            writer.as_mut().unwrap().put_value(key, lsn, value)?;
+            prev_key = Some(key);
+        }
        if let Some(writer) = writer {
            new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
        }

        // Sync layers
        if !new_layers.is_empty() {
-            // Print a warning if the created layer is larger than double the target size
-            // Add two pages for potential overhead. This should in theory be already
-            // accounted for in the target calculation, but for very small targets,
-            // we still might easily hit the limit otherwise.
-            let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2;
-            for layer in new_layers.iter() {
-                if layer.desc.file_size > warn_limit {
-                    warn!(
-                        %layer,
-                        "created delta file of size {} larger than double of target of {target_file_size}", layer.desc.file_size
-                    );
-                }
-            }
            let mut layer_paths: Vec<PathBuf> = new_layers.iter().map(|l| l.path()).collect();

            // Fsync all the layer files and directory using multiple threads to
@@ -3802,10 +3732,12 @@ impl Timeline {
            layer_paths.pop().unwrap();
        }

-        stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
+        stats.write_layer_files_micros = stats.prepare_iterators_micros.till_now();
        stats.new_deltas_count = Some(new_layers.len());
        stats.new_deltas_size = Some(new_layers.iter().map(|l| l.desc.file_size).sum());

+        drop(all_keys_iter); // So that deltas_to_compact is no longer borrowed
+
        match TryInto::<CompactLevel0Phase1Stats>::try_into(stats)
            .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string"))
        {
@@ -4703,7 +4635,7 @@ impl std::fmt::Debug for LocalLayerInfoForDiskUsageEviction {

 impl LocalLayerInfoForDiskUsageEviction {
    pub fn file_size(&self) -> u64 {
-        self.layer.layer_desc().file_size
+        self.layer.file_size()
    }
 }

--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -1,594 +0,0 @@
-use std::{
-    ops::{Deref, DerefMut},
-    sync::Arc,
-};
-
-use anyhow::Context;
-use pageserver_api::models::TimelineState;
-use tokio::sync::OwnedMutexGuard;
-use tracing::{debug, error, info, instrument, warn, Instrument, Span};
-use utils::{
-    crashsafe, fs_ext,
-    id::{TenantId, TimelineId},
-};
-
-use crate::{
-    config::PageServerConf,
-    task_mgr::{self, TaskKind},
-    tenant::{
-        metadata::TimelineMetadata,
-        remote_timeline_client::{
-            self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
-        },
-        CreateTimelineCause, DeleteTimelineError, Tenant,
-    },
-    InitializationOrder,
-};
-
-use super::Timeline;
-
-/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
-async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
-    // Stop the walreceiver first.
-    debug!("waiting for wal receiver to shutdown");
-    let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
-    if let Some(walreceiver) = maybe_started_walreceiver {
-        walreceiver.stop().await;
-    }
-    debug!("wal receiver shutdown confirmed");
-
-    // Prevent new uploads from starting.
-    if let Some(remote_client) = timeline.remote_client.as_ref() {
-        let res = remote_client.stop();
-        match res {
-            Ok(()) => {}
-            Err(e) => match e {
-                remote_timeline_client::StopError::QueueUninitialized => {
-                    // This case shouldn't happen currently because the
-                    // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
-                    // That is, before we declare the Tenant as Active.
-                    // But we only allow calls to delete_timeline on Active tenants.
-                    return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
-                }
-            },
-        }
-    }
-
-    // Stop & wait for the remaining timeline tasks, including upload tasks.
-    // NB: This and other delete_timeline calls do not run as a task_mgr task,
-    //     so, they are not affected by this shutdown_tasks() call.
-    info!("waiting for timeline tasks to shutdown");
-    task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await;
-
-    fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: timeline-delete-before-index-deleted-at"
-        ))?
-    });
-    Ok(())
-}
-
-/// Mark timeline as deleted in S3 so we won't pick it up next time
-/// during attach or pageserver restart.
-/// See comment in persist_index_part_with_deleted_flag.
-async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
-    if let Some(remote_client) = timeline.remote_client.as_ref() {
-        match remote_client.persist_index_part_with_deleted_flag().await {
-            // If we (now, or already) marked it successfully as deleted, we can proceed
-            Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
-            // Bail out otherwise
-            //
-            // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
-            // two tasks from performing the deletion at the same time. The first task
-            // that starts deletion should run it to completion.
-            Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
-            | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
-                return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
-            }
-        }
-    }
-    Ok(())
-}
-
-// We delete local files first, so if pageserver restarts after local files deletion then remote deletion is not continued.
-// This can be solved with inversion of these steps. But even if these steps are inverted then, when index_part.json
-// gets deleted there is no way to distinguish between "this timeline is good, we just didnt upload it to remote"
-// and "this timeline is deleted we should continue with removal of local state". So to avoid the ambiguity we use a mark file.
-// After index part is deleted presence of this mark file indentifies that it was a deletion intention.
-// So we can just remove the mark file.
-async fn create_delete_mark(
-    conf: &PageServerConf,
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-) -> Result<(), DeleteTimelineError> {
-    fail::fail_point!("timeline-delete-before-delete-mark", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: timeline-delete-before-delete-mark"
-        ))?
-    });
-    let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id);
-
-    // Note: we're ok to replace existing file.
-    let _ = std::fs::OpenOptions::new()
-        .write(true)
-        .create(true)
-        .open(&marker_path)
-        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
-
-    crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
-    Ok(())
-}
-
-/// Grab the layer_removal_cs lock, and actually perform the deletion.
-///
-/// This lock prevents prevents GC or compaction from running at the same time.
-/// The GC task doesn't register itself with the timeline it's operating on,
-/// so it might still be running even though we called `shutdown_tasks`.
-///
-/// Note that there are still other race conditions between
-/// GC, compaction and timeline deletion. See
-/// <https://github.com/neondatabase/neon/issues/2671>
-///
-/// No timeout here, GC & Compaction should be responsive to the
-/// `TimelineState::Stopping` change.
-async fn delete_local_layer_files(
-    conf: &PageServerConf,
-    tenant_id: TenantId,
-    timeline: &Timeline,
-) -> anyhow::Result<()> {
-    info!("waiting for layer_removal_cs.lock()");
-    let layer_removal_guard = timeline.layer_removal_cs.lock().await;
-    info!("got layer_removal_cs.lock(), deleting layer files");
-
-    // NB: storage_sync upload tasks that reference these layers have been cancelled
-    //     by the caller.
-
-    let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id);
-
-    fail::fail_point!("timeline-delete-before-rm", |_| {
-        Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
-    });
-
-    // NB: This need not be atomic because the deleted flag in the IndexPart
-    // will be observed during tenant/timeline load. The deletion will be resumed there.
-    //
-    // For configurations without remote storage, we guarantee crash-safety by persising delete mark file.
-    //
-    // Note that here we do not bail out on std::io::ErrorKind::NotFound.
-    // This can happen if we're called a second time, e.g.,
-    // because of a previous failure/cancellation at/after
-    // failpoint timeline-delete-after-rm.
-    //
-    // It can also happen if we race with tenant detach, because,
-    // it doesn't grab the layer_removal_cs lock.
-    //
-    // For now, log and continue.
-    // warn! level is technically not appropriate for the
-    // first case because we should expect retries to happen.
-    // But the error is so rare, it seems better to get attention if it happens.
-    //
-    // Note that metadata removal is skipped, this is not technically needed,
-    // but allows to reuse timeline loading code during resumed deletion.
-    // (we always expect that metadata is in place when timeline is being loaded)
-
-    #[cfg(feature = "testing")]
-    let mut counter = 0;
-
-    // Timeline directory may not exist if we failed to delete mark file and request was retried.
-    if !local_timeline_directory.exists() {
-        return Ok(());
-    }
-
-    let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id);
-
-    for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
-        #[cfg(feature = "testing")]
-        {
-            counter += 1;
-            if counter == 2 {
-                fail::fail_point!("timeline-delete-during-rm", |_| {
-                    Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))?
-                });
-            }
-        }
-
-        let entry = entry?;
-        if entry.path() == metadata_path {
-            debug!("found metadata, skipping");
-            continue;
-        }
-
-        if entry.path() == local_timeline_directory {
-            // Keeping directory because metedata file is still there
-            debug!("found timeline dir itself, skipping");
-            continue;
-        }
-
-        let metadata = match entry.metadata() {
-            Ok(metadata) => metadata,
-            Err(e) => {
-                if crate::is_walkdir_io_not_found(&e) {
-                    warn!(
-                        timeline_dir=?local_timeline_directory,
-                        path=?entry.path().display(),
-                        "got not found err while removing timeline dir, proceeding anyway"
-                    );
-                    continue;
-                }
-                anyhow::bail!(e);
-            }
-        };
-
-        if metadata.is_dir() {
-            warn!(path=%entry.path().display(), "unexpected directory under timeline dir");
-            tokio::fs::remove_dir(entry.path()).await
-        } else {
-            tokio::fs::remove_file(entry.path()).await
-        }
-        .with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
-    }
-
-    info!("finished deleting layer files, releasing layer_removal_cs.lock()");
-    drop(layer_removal_guard);
-
-    fail::fail_point!("timeline-delete-after-rm", |_| {
-        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
-    });
-
-    Ok(())
-}
-
-/// Removes remote layers and an index file after them.
-async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
-    if let Some(remote_client) = &timeline.remote_client {
-        remote_client.delete_all().await.context("delete_all")?
-    };
-
-    Ok(())
-}
-
-// This function removs remaining traces of a timeline on disk.
-// Namely: metadata file, timeline directory, delete mark.
-// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir.
-// delete mark should be present because it is the last step during deletion.
-// (nothing can fail after its deletion)
-async fn cleanup_remaining_timeline_fs_traces(
-    conf: &PageServerConf,
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-) -> anyhow::Result<()> {
-    // Remove local metadata
-    tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id))
-        .await
-        .or_else(fs_ext::ignore_not_found)
-        .context("remove metadata")?;
-
-    fail::fail_point!("timeline-delete-after-rm-metadata", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: timeline-delete-after-rm-metadata"
-        ))?
-    });
-
-    // Remove timeline dir
-    tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id))
-        .await
-        .or_else(fs_ext::ignore_not_found)
-        .context("timeline dir")?;
-
-    fail::fail_point!("timeline-delete-after-rm-dir", |_| {
-        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
-    });
-
-    // Make sure previous deletions are ordered before mark removal.
-    // Otherwise there is no guarantee that they reach the disk before mark deletion.
-    // So its possible for mark to reach disk first and for other deletions
-    // to be reordered later and thus missed if a crash occurs.
-    // Note that we dont need to sync after mark file is removed
-    // because we can tolerate the case when mark file reappears on startup.
-    let timeline_path = conf.timelines_path(&tenant_id);
-    crashsafe::fsync_async(timeline_path)
-        .await
-        .context("fsync_pre_mark_remove")?;
-
-    // Remove delete mark
-    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
-        .await
-        .context("remove delete mark")
-}
-
-/// It is important that this gets called when DeletionGuard is being held.
-/// For more context see comments in [`DeleteTimelineFlow::prepare`]
-async fn remove_timeline_from_tenant(
-    tenant: &Tenant,
-    timeline_id: TimelineId,
-    _: &DeletionGuard, // using it as a witness
-) -> anyhow::Result<()> {
-    // Remove the timeline from the map.
-    let mut timelines = tenant.timelines.lock().unwrap();
-    let children_exist = timelines
-        .iter()
-        .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
-    // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
-    // We already deleted the layer files, so it's probably best to panic.
-    // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
-    if children_exist {
-        panic!("Timeline grew children while we removed layer files");
-    }
-
-    timelines
-        .remove(&timeline_id)
-        .expect("timeline that we were deleting was concurrently removed from 'timelines' map");
-
-    drop(timelines);
-
-    Ok(())
-}
-
-/// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures,
-/// and deletes its data from both disk and s3.
-/// The sequence of steps:
-/// 1. Set deleted_at in remote index part.
-/// 2. Create local mark file.
-/// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata)
-/// 4. Delete remote layers
-/// 5. Delete index part
-/// 6. Delete meta, timeline directory
-/// 7. Delete mark file
-/// It is resumable from any step in case a crash/restart occurs.
-/// There are three entrypoints to the process:
-/// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
-/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
-/// and we possibly neeed to continue deletion of remote files.
-/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
-/// index but still have local metadata, timeline directory and delete mark.
-/// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
-#[derive(Default)]
-pub enum DeleteTimelineFlow {
-    #[default]
-    NotStarted,
-    InProgress,
-    Finished,
-}
-
-impl DeleteTimelineFlow {
-    // These steps are run in the context of management api request handler.
-    // Long running steps are continued to run in the background.
-    // NB: If this fails half-way through, and is retried, the retry will go through
-    // all the same steps again. Make sure the code here is idempotent, and don't
-    // error out if some of the shutdown tasks have already been completed!
-    #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_id))]
-    pub async fn run(
-        tenant: &Arc<Tenant>,
-        timeline_id: TimelineId,
-        inplace: bool,
-    ) -> Result<(), DeleteTimelineError> {
-        let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
-
-        guard.mark_in_progress()?;
-
-        stop_tasks(&timeline).await?;
-
-        set_deleted_in_remote_index(&timeline).await?;
-
-        create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?;
-
-        fail::fail_point!("timeline-delete-before-schedule", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: timeline-delete-before-schedule"
-            ))?
-        });
-
-        if inplace {
-            Self::background(guard, tenant.conf, tenant, &timeline).await?
-        } else {
-            Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
-        }
-
-        Ok(())
-    }
-
-    fn mark_in_progress(&mut self) -> anyhow::Result<()> {
-        match self {
-            Self::Finished => anyhow::bail!("Bug. Is in finished state"),
-            Self::InProgress { .. } => { /* We're in a retry */ }
-            Self::NotStarted => { /* Fresh start */ }
-        }
-
-        *self = Self::InProgress;
-
-        Ok(())
-    }
-
-    /// Shortcut to create Timeline in stopping state and spawn deletion task.
-    /// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
-    #[instrument(skip_all, fields(%timeline_id))]
-    pub async fn resume_deletion(
-        tenant: Arc<Tenant>,
-        timeline_id: TimelineId,
-        local_metadata: &TimelineMetadata,
-        remote_client: Option<RemoteTimelineClient>,
-        init_order: Option<&InitializationOrder>,
-    ) -> anyhow::Result<()> {
-        // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
-        // RemoteTimelineClient is the only functioning part.
-        let timeline = tenant
-            .create_timeline_struct(
-                timeline_id,
-                local_metadata,
-                None, // Ancestor is not needed for deletion.
-                remote_client,
-                init_order,
-                // Important. We dont pass ancestor above because it can be missing.
-                // Thus we need to skip the validation here.
-                CreateTimelineCause::Delete,
-            )
-            .context("create_timeline_struct")?;
-
-        let mut guard = DeletionGuard(
-            Arc::clone(&timeline.delete_progress)
-                .try_lock_owned()
-                .expect("cannot happen because we're the only owner"),
-        );
-
-        // We meed to do this because when console retries delete request we shouldnt answer with 404
-        // because 404 means successful deletion.
-        {
-            let mut locked = tenant.timelines.lock().unwrap();
-            locked.insert(timeline_id, Arc::clone(&timeline));
-        }
-
-        guard.mark_in_progress()?;
-
-        // Note that delete mark can be missing on resume
-        // because we create delete mark after we set deleted_at in the index part.
-        create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?;
-
-        Self::schedule_background(guard, tenant.conf, tenant, timeline);
-
-        Ok(())
-    }
-
-    #[instrument(skip_all, fields(%timeline_id))]
-    pub async fn cleanup_remaining_timeline_fs_traces(
-        tenant: &Tenant,
-        timeline_id: TimelineId,
-    ) -> anyhow::Result<()> {
-        let r =
-            cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await;
-        info!("Done");
-        r
-    }
-
-    fn prepare(
-        tenant: &Tenant,
-        timeline_id: TimelineId,
-    ) -> Result<(Arc<Timeline>, DeletionGuard), DeleteTimelineError> {
-        // Note the interaction between this guard and deletion guard.
-        // Here we attempt to lock deletion guard when we're holding a lock on timelines.
-        // This is important because when you take into account `remove_timeline_from_tenant`
-        // we remove timeline from memory when we still hold the deletion guard.
-        // So here when timeline deletion is finished timeline wont be present in timelines map at all
-        // which makes the following sequence impossible:
-        // T1: get preempted right before the try_lock on `Timeline::delete_progress`
-        // T2: do a full deletion, acquire and drop `Timeline::delete_progress`
-        // T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
-        // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
-        let timelines = tenant.timelines.lock().unwrap();
-
-        let timeline = match timelines.get(&timeline_id) {
-            Some(t) => t,
-            None => return Err(DeleteTimelineError::NotFound),
-        };
-
-        // Ensure that there are no child timelines **attached to that pageserver**,
-        // because detach removes files, which will break child branches
-        let children: Vec<TimelineId> = timelines
-            .iter()
-            .filter_map(|(id, entry)| {
-                if entry.get_ancestor_timeline_id() == Some(timeline_id) {
-                    Some(*id)
-                } else {
-                    None
-                }
-            })
-            .collect();
-
-        if !children.is_empty() {
-            return Err(DeleteTimelineError::HasChildren(children));
-        }
-
-        // Note that using try_lock here is important to avoid a deadlock.
-        // Here we take lock on timelines and then the deletion guard.
-        // At the end of the operation we're holding the guard and need to lock timelines map
-        // to remove the timeline from it.
-        // Always if you have two locks that are taken in different order this can result in a deadlock.
-
-        let delete_progress = Arc::clone(&timeline.delete_progress);
-        let delete_lock_guard = match delete_progress.try_lock_owned() {
-            Ok(guard) => DeletionGuard(guard),
-            Err(_) => {
-                // Unfortunately if lock fails arc is consumed.
-                return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
-                    &timeline.delete_progress,
-                )));
-            }
-        };
-
-        timeline.set_state(TimelineState::Stopping);
-
-        Ok((Arc::clone(timeline), delete_lock_guard))
-    }
-
-    fn schedule_background(
-        guard: DeletionGuard,
-        conf: &'static PageServerConf,
-        tenant: Arc<Tenant>,
-        timeline: Arc<Timeline>,
-    ) {
-        let tenant_id = timeline.tenant_id;
-        let timeline_id = timeline.timeline_id;
-
-        task_mgr::spawn(
-            task_mgr::BACKGROUND_RUNTIME.handle(),
-            TaskKind::TimelineDeletionWorker,
-            Some(tenant_id),
-            Some(timeline_id),
-            "timeline_delete",
-            false,
-            async move {
-                if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
-                    error!("Error: {err:#}");
-                    timeline.set_broken(format!("{err:#}"))
-                };
-                Ok(())
-            }
-            .instrument({
-                let span =
-                    tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
-                span.follows_from(Span::current());
-                span
-            }),
-        );
-    }
-
-    async fn background(
-        mut guard: DeletionGuard,
-        conf: &PageServerConf,
-        tenant: &Tenant,
-        timeline: &Timeline,
-    ) -> Result<(), DeleteTimelineError> {
-        delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;
-
-        delete_remote_layers_and_index(timeline).await?;
-
-        pausable_failpoint!("in_progress_delete");
-
-        cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?;
-
-        remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
-
-        *guard = Self::Finished;
-
-        Ok(())
-    }
-
-    pub(crate) fn is_finished(&self) -> bool {
-        matches!(self, Self::Finished)
-    }
-}
-
-struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
-
-impl Deref for DeletionGuard {
-    type Target = DeleteTimelineFlow;
-
-    fn deref(&self) -> &Self::Target {
-        &self.0
-    }
-}
-
-impl DerefMut for DeletionGuard {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        &mut self.0
-    }
-}
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -78,6 +78,9 @@ impl Timeline {

    #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
    async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
+        scopeguard::defer! {
+            info!("eviction task finishing");
+        }
        use crate::tenant::tasks::random_init_delay;
        {
            let policy = self.get_eviction_policy();
@@ -305,13 +308,8 @@ impl Timeline {
        ctx: &RequestContext,
    ) -> ControlFlow<()> {
        let mut state = self.eviction_task_timeline_state.lock().await;
-
-        // Only do the imitate_layer accesses approximately as often as the threshold.  A little
-        // more frequently, to avoid this period racing with the threshold/period-th eviction iteration.
-        let inter_imitate_period = p.threshold.checked_sub(p.period).unwrap_or(p.threshold);
-
        match state.last_layer_access_imitation {
-            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
+            Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
            _ => {
                self.imitate_timeline_cached_layer_accesses(cancel, ctx)
                    .await;
@@ -334,7 +332,7 @@ impl Timeline {
        };
        let mut state = tenant.eviction_task_tenant_state.lock().await;
        match state.last_layer_access_imitation {
-            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
+            Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
            _ => {
                self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel)
                    .await;
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -120,9 +120,10 @@ impl LayerManager {

        ensure!(
            lsn > last_record_lsn,
-            "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})",
+            "cannot modify relation after advancing last_record_lsn (incoming_lsn={}, last_record_lsn={})\n{}",
            lsn,
            last_record_lsn,
+            std::backtrace::Backtrace::force_capture(),
        );

        // Do we have a layer open for writing already?
@@ -163,7 +164,7 @@ impl LayerManager {
    }

    /// Called from `freeze_inmem_layer`, returns true if successfully frozen.
-    pub async fn try_freeze_in_memory_layer(
+    pub fn try_freeze_in_memory_layer(
        &mut self,
        Lsn(last_record_lsn): Lsn,
        last_freeze_at: &AtomicLsn,
@@ -173,7 +174,7 @@ impl LayerManager {
        if let Some(open_layer) = &self.layer_map.open_layer {
            let open_layer_rc = Arc::clone(open_layer);
            // Does this layer need freezing?
-            open_layer.freeze(end_lsn).await;
+            open_layer.freeze(end_lsn);

            // The layer is no longer open, update the layer map to reflect this.
            // We will replace it with on-disk historics below.
@@ -277,7 +278,7 @@ impl LayerManager {
        updates: &mut BatchedUpdates<'_>,
        mapping: &mut LayerFileManager,
    ) {
-        updates.remove_historic(layer.layer_desc());
+        updates.remove_historic(layer.layer_desc().clone());
        mapping.remove(layer);
    }

@@ -291,10 +292,10 @@ impl LayerManager {
        metrics: &TimelineMetrics,
        mapping: &mut LayerFileManager,
    ) -> anyhow::Result<()> {
-        let desc = layer.layer_desc();
        if !layer.is_remote_layer() {
            layer.delete_resident_layer_file()?;
-            metrics.resident_physical_size_gauge.sub(desc.file_size);
+            let layer_file_size = layer.file_size();
+            metrics.resident_physical_size_gauge.sub(layer_file_size);
        }

        // TODO Removing from the bottom of the layer map is expensive.
@@ -302,7 +303,7 @@ impl LayerManager {
        //      won't be needed for page reconstruction for this timeline,
        //      and mark what we can't delete yet as deleted from the layer
        //      map index without actually rebuilding the index.
-        updates.remove_historic(desc);
+        updates.remove_historic(layer.layer_desc().clone());
        mapping.remove(layer);

        Ok(())
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -31,19 +31,14 @@ use storage_broker::Streaming;
 use tokio::select;
 use tracing::*;

+use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use postgres_connection::{parse_host_port, PgConnectionConfig};
-use utils::backoff::{
-    exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
-};
 use utils::{
    id::{NodeId, TenantTimelineId},
    lsn::Lsn,
 };

-use super::{
-    walreceiver_connection::WalConnectionStatus, walreceiver_connection::WalReceiverError,
-    TaskEvent, TaskHandle,
-};
+use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle};

 /// Attempts to subscribe for timeline updates, pushed by safekeepers into the broker.
 /// Based on the updates, desides whether to start, keep or stop a WAL receiver task.
@@ -424,19 +419,13 @@ impl ConnectionManagerState {
                match res {
                    Ok(()) => Ok(()),
                    Err(e) => {
-                        match e {
-                            WalReceiverError::SuccessfulCompletion(msg) => {
-                                info!("walreceiver connection handling ended with success: {msg}");
-                                Ok(())
-                            }
-                            WalReceiverError::ExpectedSafekeeperError(e) => {
-                                info!("walreceiver connection handling ended: {e}");
-                                Ok(())
-                            }
-                            WalReceiverError::Other(e) => {
-                                // give out an error to have task_mgr give it a really verbose logging
-                                Err(e).context("walreceiver connection handling failure")
-                            }
+                        use super::walreceiver_connection::ExpectedError;
+                        if e.is_expected() {
+                            info!("walreceiver connection handling ended: {e:#}");
+                            Ok(())
+                        } else {
+                            // give out an error to have task_mgr give it a really verbose logging
+                            Err(e).context("walreceiver connection handling failure")
                        }
                    }
                }
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -8,14 +8,14 @@ use std::{
    time::{Duration, SystemTime},
 };

-use anyhow::{anyhow, Context};
+use anyhow::{bail, ensure, Context};
 use bytes::BytesMut;
 use chrono::{NaiveDateTime, Utc};
 use fail::fail_point;
 use futures::StreamExt;
 use postgres::{error::SqlState, SimpleQueryMessage, SimpleQueryRow};
+use postgres_ffi::v14::xlog_utils::normalize_lsn;
 use postgres_ffi::WAL_SEGMENT_SIZE;
-use postgres_ffi::{v14::xlog_utils::normalize_lsn, waldecoder::WalDecodeError};
 use postgres_protocol::message::backend::ReplicationMessage;
 use postgres_types::PgLsn;
 use tokio::{select, sync::watch, time};
@@ -60,50 +60,6 @@ pub(super) struct WalConnectionStatus {
    pub node: NodeId,
 }

-pub(super) enum WalReceiverError {
-    /// An error of a type that does not indicate an issue, e.g. a connection closing
-    ExpectedSafekeeperError(postgres::Error),
-    /// An "error" message that carries a SUCCESSFUL_COMPLETION status code.  Carries
-    /// the message part of the original postgres error
-    SuccessfulCompletion(String),
-    /// Generic error
-    Other(anyhow::Error),
-}
-
-impl From<tokio_postgres::Error> for WalReceiverError {
-    fn from(err: tokio_postgres::Error) -> Self {
-        if let Some(dberror) = err.as_db_error().filter(|db_error| {
-            db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
-                && db_error.message().contains("ending streaming")
-        }) {
-            // Strip the outer DbError, which carries a misleading "error" severity
-            Self::SuccessfulCompletion(dberror.message().to_string())
-        } else if err.is_closed()
-            || err
-                .source()
-                .and_then(|source| source.downcast_ref::<std::io::Error>())
-                .map(is_expected_io_error)
-                .unwrap_or(false)
-        {
-            Self::ExpectedSafekeeperError(err)
-        } else {
-            Self::Other(anyhow::Error::new(err))
-        }
-    }
-}
-
-impl From<anyhow::Error> for WalReceiverError {
-    fn from(err: anyhow::Error) -> Self {
-        Self::Other(err)
-    }
-}
-
-impl From<WalDecodeError> for WalReceiverError {
-    fn from(err: WalDecodeError) -> Self {
-        Self::Other(anyhow::Error::new(err))
-    }
-}
-
 /// Open a connection to the given safekeeper and receive WAL, sending back progress
 /// messages as we go.
 pub(super) async fn handle_walreceiver_connection(
@@ -114,7 +70,7 @@ pub(super) async fn handle_walreceiver_connection(
    connect_timeout: Duration,
    ctx: RequestContext,
    node: NodeId,
-) -> Result<(), WalReceiverError> {
+) -> anyhow::Result<()> {
    debug_assert_current_span_has_tenant_and_timeline_id();

    WALRECEIVER_STARTED_CONNECTIONS.inc();
@@ -174,15 +130,11 @@ pub(super) async fn handle_walreceiver_connection(
                connection_result = connection => match connection_result {
                    Ok(()) => debug!("Walreceiver db connection closed"),
                    Err(connection_error) => {
-                        match WalReceiverError::from(connection_error) {
-                            WalReceiverError::ExpectedSafekeeperError(_) => {
-                                // silence, because most likely we've already exited the outer call
-                                // with a similar error.
-                            },
-                            WalReceiverError::SuccessfulCompletion(_) => {}
-                            WalReceiverError::Other(err) => {
-                                warn!("Connection aborted: {err:#}")
-                            }
+                        if connection_error.is_expected() {
+                            // silence, because most likely we've already exited the outer call
+                            // with a similar error.
+                        } else {
+                            warn!("Connection aborted: {connection_error:#}")
                        }
                    }
                },
@@ -228,7 +180,7 @@ pub(super) async fn handle_walreceiver_connection(
    let mut startpoint = last_rec_lsn;

    if startpoint == Lsn(0) {
-        return Err(WalReceiverError::Other(anyhow!("No previous WAL position")));
+        bail!("No previous WAL position");
    }

    // There might be some padding after the last full record, skip it.
@@ -310,9 +262,7 @@ pub(super) async fn handle_walreceiver_connection(
                        // It is important to deal with the aligned records as lsn in getPage@LSN is
                        // aligned and can be several bytes bigger. Without this alignment we are
                        // at risk of hitting a deadlock.
-                        if !lsn.is_aligned() {
-                            return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
-                        }
+                        ensure!(lsn.is_aligned());

                        walingest
                            .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
@@ -469,3 +419,51 @@ async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem>
        Err(IdentifyError.into())
    }
 }
+
+/// Trait for recognizing walreceiver-specific expected ("normal" or "ok") errors, so they are not reported as failures.
+pub(super) trait ExpectedError {
+    /// Test if this error is an ok error.
+    ///
+    /// We don't want to report connectivity problems as real errors towards connection manager because
+    /// 1. they happen frequently enough to make server logs hard to read and
+    /// 2. the connection manager can retry with another safekeeper.
+    ///
+    /// If this function returns `true`, it's such an error.
+    /// The caller should log it at info level and then report to connection manager that we're done handling this connection.
+    /// Connection manager will then handle reconnections.
+    ///
+    /// If this function returns `false`, the error should be propagated and the connection manager
+    /// will log the error at ERROR level.
+    fn is_expected(&self) -> bool;
+}
+
+impl ExpectedError for postgres::Error {
+    fn is_expected(&self) -> bool {
+        self.is_closed()
+            || self
+                .source()
+                .and_then(|source| source.downcast_ref::<std::io::Error>())
+                .map(is_expected_io_error)
+                .unwrap_or(false)
+            || self
+                .as_db_error()
+                .filter(|db_error| {
+                    db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
+                        && db_error.message().contains("ending streaming")
+                })
+                .is_some()
+    }
+}
+
+impl ExpectedError for anyhow::Error {
+    fn is_expected(&self) -> bool {
+        let head = self.downcast_ref::<postgres::Error>();
+
+        let tail = self
+            .chain()
+            .filter_map(|e| e.downcast_ref::<postgres::Error>());
+
+        // check if self or any of the chained/sourced errors are expected
+        head.into_iter().chain(tail).any(|e| e.is_expected())
+    }
+}
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -53,9 +53,6 @@ pub struct VirtualFile {
    pub path: PathBuf,
    open_options: OpenOptions,

-    // These are strings becase we only use them for metrics, and those expect strings.
-    // It makes no sense for us to constantly turn the `TimelineId` and `TenantId` into
-    // strings.
    tenant_id: String,
    timeline_id: String,
 }
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -4,7 +4,6 @@
 MODULE_big = neon
 OBJS = \
 	$(WIN32RES) \
-	extension_server.o \
 	file_cache.o \
 	libpagestore.o \
 	libpqwalproposer.o \
--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -1,103 +0,0 @@
-
-/*-------------------------------------------------------------------------
- *
- * extension_server.c
- *	  Request compute_ctl to download extension files.
- *
- * IDENTIFICATION
- *	 contrib/neon/extension_server.c
- *
- *-------------------------------------------------------------------------
- */
-#include "postgres.h"
-#include "tcop/pquery.h"
-#include "tcop/utility.h"
-#include "access/xact.h"
-#include "utils/hsearch.h"
-#include "utils/memutils.h"
-#include "commands/defrem.h"
-#include "miscadmin.h"
-#include "utils/acl.h"
-#include "fmgr.h"
-#include "utils/guc.h"
-#include "port.h"
-#include "fmgr.h"
-
-#include <curl/curl.h>
-
-static int extension_server_port = 0;
-
-static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
-
-// to download all SQL (and data) files for an extension:
-// curl -X POST http://localhost:8080/extension_server/postgis
-// it covers two possible extension files layouts:
-// 1. extension_name--version--platform.sql
-// 2. extension_name/extension_name--version.sql
-//    extension_name/extra_files.csv
-//
-// to download specific library file:
-// curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
-static bool
-neon_download_extension_file_http(const char *filename, bool is_library)
-{
-    CURL *curl;
-    CURLcode res;
-    char *compute_ctl_url;
-    char *postdata;
-    bool ret = false;
-
-    if ((curl = curl_easy_init()) == NULL)
-    {
-        elog(ERROR, "Failed to initialize curl handle");
-    }
-
-    compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
-                               extension_server_port, filename, is_library ? "?is_library=true" : "");
-
-    elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
-
-    curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
-    curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
-    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */);
-
-    if (curl)
-    {
-        /* Perform the request, res will get the return code */
-        res = curl_easy_perform(curl);
-        /* Check for errors */
-        if (res == CURLE_OK)
-        {
-            ret = true;
-        }
-        else
-        {
-            // Don't error here because postgres will try to find the file
-            // and will fail with some proper error message if it's not found.
-            elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
-        }
-
-        /* always cleanup */
-        curl_easy_cleanup(curl);
-    }
-
-    return ret;
-}
-
-void pg_init_extension_server()
-{
-    // Port to connect to compute_ctl on localhost
-    // to request extension files.
-    DefineCustomIntVariable("neon.extension_server_port",
-                            "connection string to the compute_ctl",
-                            NULL,
-                            &extension_server_port,
-                            0, 0, INT_MAX,
-                            PGC_POSTMASTER,
-                            0, /* no flags required */
-                            NULL, NULL, NULL);
-
-    // set download_extension_file_hook
-    prev_download_extension_file_hook = download_extension_file_hook;
-    download_extension_file_hook = neon_download_extension_file_http;
-}
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -172,7 +172,7 @@ lfc_change_limit_hook(int newval, void *extra)
 	{
 		lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
 		if (lfc_desc < 0) {
-			elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path);
+			elog(LOG, "Failed to open file cache %s: %m", lfc_path);
 			lfc_size_limit = 0; /* disable file cache */
 			return;
 		}
@@ -557,7 +557,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 			Assert(victim->access_count == 0);
 			entry->offset = victim->offset; /* grab victim's chunk */
 			hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
-			elog(DEBUG2, "Swap file cache page");
+			elog(LOG, "Swap file cache page");
 		}
 		else
 		{
@@ -574,7 +574,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 	{
 		lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
 		if (lfc_desc < 0) {
-			elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path);
+			elog(LOG, "Failed to open file cache %s: %m", lfc_path);
 			lfc_size_limit = 0; /* disable file cache */
 		}
 	}
@@ -583,7 +583,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
 		rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
 		if (rc != BLCKSZ)
 		{
-			elog(WARNING, "Failed to write file cache: %m, disabling file cache");
+			elog(INFO, "Failed to write file cache: %m");
 			lfc_size_limit = 0; /* disable file cache */
 		}
 	}
--- a/pgxn/neon/libpqwalproposer.c
+++ b/pgxn/neon/libpqwalproposer.c
@@ -74,7 +74,7 @@ walprop_connect_start(char *conninfo, char *password)
 	if (password)
 	{
 		keywords[n] = "password";
-		values[n] = password;
+		values[n] = neon_auth_token;
 		n++;
 	}
 	keywords[n] = "dbname";
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -35,11 +35,8 @@ _PG_init(void)
 {
 	pg_init_libpagestore();
 	pg_init_walproposer();
-
 	InitControlPlaneConnector();

-	pg_init_extension_server();
-
        // Important: This must happen after other parts of the extension
        // are loaded, otherwise any settings to GUCs that were set before
        // the extension was loaded will be removed.
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -21,8 +21,6 @@ extern char *neon_tenant;
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);

-extern void pg_init_extension_server(void);
-
 /*
 * Returns true if we shouldn't do REDO on that block in record indicated by
 * block_id; false otherwise.
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1393,22 +1393,8 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec
 	char	   *err;
 	WalReceiverConn *wrconn;
 	WalRcvStreamOptions options;
-	char conninfo[MAXCONNINFO];

-	if (!neon_auth_token)
-	{
-		memcpy(conninfo, safekeeper[donor].conninfo, MAXCONNINFO);
-	}
-	else
-	{
-		int written = 0;
-
-		written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, safekeeper[donor].conninfo);
-		if (written > MAXCONNINFO || written < 0)
-			elog(FATAL, "could not append password to the safekeeper connection string");
-	}
-
-	wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err);
+	wrconn = walrcv_connect(safekeeper[donor].conninfo, false, "wal_proposer_recovery", &err);
 	if (!wrconn)
 	{
 		ereport(WARNING,
--- a/pgxn/neon/walproposer_utils.c
+++ b/pgxn/neon/walproposer_utils.c
@@ -37,14 +37,68 @@ static XLogSegNo walpropSegNo = 0;

 /* START cloned file-local variables and functions from walsender.c */

+/*
+ * xlogreader used for replication.  Note that a WAL sender doing physical
+ * replication does not need xlogreader to read WAL, but it needs one to
+ * keep a state of its work.
+ */
+static XLogReaderState *xlogreader = NULL;
+
+/*
+ * These variables keep track of the state of the timeline we're currently
+ * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric,
+ * the timeline is not the latest timeline on this server, and the server's
+ * history forked off from that timeline at sendTimeLineValidUpto.
+ */
+static TimeLineID sendTimeLine = 0;
+static TimeLineID sendTimeLineNextTLI = 0;
+static bool sendTimeLineIsHistoric = false;
+static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr;
+
+/*
+ * Timestamp of last ProcessRepliesIfAny() that saw a reply from the
+ * standby. Set to 0 if wal_sender_timeout doesn't need to be active.
+ */
+static TimestampTz last_reply_timestamp = 0;
+
+/* Have we sent a heartbeat message asking for reply, since last reply? */
+static bool waiting_for_ping_response = false;
+
+static bool streamingDoneSending;
+static bool streamingDoneReceiving;
+
+/* Are we there yet? */
+static bool WalSndCaughtUp = false;
+
+/* Flags set by signal handlers for later service in main loop */
+static volatile sig_atomic_t got_STOPPING = false;
+
 /*
 * How far have we sent WAL already? This is also advertised in
 * MyWalSnd->sentPtr.  (Actually, this is the next WAL location to send.)
 */
 static XLogRecPtr sentPtr = InvalidXLogRecPtr;

-static void WalSndLoop(void);
-static void XLogBroadcastWalProposer(void);
+/*
+ * This is set while we are streaming. When not set
+ * PROCSIG_WALSND_INIT_STOPPING signal will be handled like SIGTERM. When set,
+ * the main loop is responsible for checking got_STOPPING and terminating when
+ * it's set (after streaming any remaining WAL).
+ */
+static volatile sig_atomic_t replication_active = false;
+
+typedef void (*WalSndSendDataCallback) (void);
+static void WalSndLoop(WalSndSendDataCallback send_data);
+static void XLogSendPhysical(void);
+#if PG_VERSION_NUM >= 150000
+static XLogRecPtr GetStandbyFlushRecPtr(TimeLineID *tli);
+#else
+static XLogRecPtr GetStandbyFlushRecPtr(void);
+#endif
+
+static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
+							  TimeLineID *tli_p);
+
 /* END cloned file-level variables and functions from walsender.c */

 int
@@ -452,7 +506,7 @@ XLogWalPropClose(XLogRecPtr recptr)
 /* START of cloned functions from walsender.c */

 /*
- * Subscribe for new WAL and stream it in the loop to safekeepers.
+ * Handle START_REPLICATION command.
 *
 * At the moment, this never returns, but an ereport(ERROR) will take us back
 * to the main loop.
@@ -470,6 +524,18 @@ StartProposerReplication(StartReplicationCmd *cmd)
 				 errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION")));
 #endif

+	/* create xlogreader for physical replication */
+	xlogreader =
+		XLogReaderAllocate(wal_segment_size, NULL,
+						   XL_ROUTINE(.segment_open = WalSndSegmentOpen,
+									  .segment_close = wal_segment_close),
+						   NULL);
+
+	if (!xlogreader)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+
 	/*
 	 * We assume here that we're logging enough information in the WAL for
 	 * log-shipping, since this is checked in PostmasterMain().
@@ -503,61 +569,341 @@ StartProposerReplication(StartReplicationCmd *cmd)
 	 * we keep this code around to lighten the load for when we need it.
 	 */
 #if PG_VERSION_NUM >= 150000
-	FlushPtr = GetFlushRecPtr(&currTLI);
+	if (am_cascading_walsender)
+	{
+		/* this also updates ThisTimeLineID */
+		FlushPtr = GetStandbyFlushRecPtr(&currTLI);
+	}
+	else
+		FlushPtr = GetFlushRecPtr(&currTLI);
 #else
-	FlushPtr = GetFlushRecPtr();
+	if (am_cascading_walsender)
+	{
+		/* this also updates ThisTimeLineID */
+		FlushPtr = GetStandbyFlushRecPtr();
+	}
+	else
+		FlushPtr = GetFlushRecPtr();
+
 	currTLI = ThisTimeLineID;
 #endif

-	/*
-	 * When we first start replication the standby will be behind the
-	 * primary. For some applications, for example synchronous
-	 * replication, it is important to have a clear state for this initial
-	 * catchup mode, so we can trigger actions when we change streaming
-	 * state later. We may stay in this state for a long time, which is
-	 * exactly why we want to be able to monitor whether or not we are
-	 * still here.
-	 */
-	WalSndSetState(WALSNDSTATE_CATCHUP);

-	/*
-	 * Don't allow a request to stream from a future point in WAL that
-	 * hasn't been flushed to disk in this server yet.
-	 */
-	if (FlushPtr < cmd->startpoint)
+	if (cmd->timeline != 0)
 	{
-		ereport(ERROR,
-				(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
-						LSN_FORMAT_ARGS(cmd->startpoint),
-						LSN_FORMAT_ARGS(FlushPtr))));
+		XLogRecPtr	switchpoint;
+
+		sendTimeLine = cmd->timeline;
+		if (sendTimeLine == currTLI)
+		{
+			sendTimeLineIsHistoric = false;
+			sendTimeLineValidUpto = InvalidXLogRecPtr;
+		}
+		else
+		{
+			List	   *timeLineHistory;
+
+			sendTimeLineIsHistoric = true;
+
+			/*
+			 * Check that the timeline the client requested exists, and the
+			 * requested start location is on that timeline.
+			 */
+			timeLineHistory = readTimeLineHistory(currTLI);
+			switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory,
+										 &sendTimeLineNextTLI);
+			list_free_deep(timeLineHistory);
+
+			/*
+			 * Found the requested timeline in the history. Check that
+			 * requested startpoint is on that timeline in our history.
+			 *
+			 * This is quite loose on purpose. We only check that we didn't
+			 * fork off the requested timeline before the switchpoint. We
+			 * don't check that we switched *to* it before the requested
+			 * starting point. This is because the client can legitimately
+			 * request to start replication from the beginning of the WAL
+			 * segment that contains switchpoint, but on the new timeline, so
+			 * that it doesn't end up with a partial segment. If you ask for
+			 * too old a starting point, you'll get an error later when we
+			 * fail to find the requested WAL segment in pg_wal.
+			 *
+			 * XXX: we could be more strict here and only allow a startpoint
+			 * that's older than the switchpoint, if it's still in the same
+			 * WAL segment.
+			 */
+			if (!XLogRecPtrIsInvalid(switchpoint) &&
+				switchpoint < cmd->startpoint)
+			{
+				ereport(ERROR,
+						(errmsg("requested starting point %X/%X on timeline %u is not in this server's history",
+								LSN_FORMAT_ARGS(cmd->startpoint),
+								cmd->timeline),
+						 errdetail("This server's history forked from timeline %u at %X/%X.",
+								   cmd->timeline,
+								   LSN_FORMAT_ARGS(switchpoint))));
+			}
+			sendTimeLineValidUpto = switchpoint;
+		}
+	}
+	else
+	{
+		sendTimeLine = currTLI;
+		sendTimeLineValidUpto = InvalidXLogRecPtr;
+		sendTimeLineIsHistoric = false;
 	}

-	/* Start streaming from the requested point */
-	sentPtr = cmd->startpoint;
+	streamingDoneSending = streamingDoneReceiving = false;

-	/* Initialize shared memory status, too */
-	SpinLockAcquire(&MyWalSnd->mutex);
-	MyWalSnd->sentPtr = sentPtr;
-	SpinLockRelease(&MyWalSnd->mutex);
+	/* If there is nothing to stream, don't even enter COPY mode */
+	if (!sendTimeLineIsHistoric || cmd->startpoint < sendTimeLineValidUpto)
+	{
+		/*
+		 * When we first start replication the standby will be behind the
+		 * primary. For some applications, for example synchronous
+		 * replication, it is important to have a clear state for this initial
+		 * catchup mode, so we can trigger actions when we change streaming
+		 * state later. We may stay in this state for a long time, which is
+		 * exactly why we want to be able to monitor whether or not we are
+		 * still here.
+		 */
+		WalSndSetState(WALSNDSTATE_CATCHUP);

-	SyncRepInitConfig();
+		/*
+		 * Don't allow a request to stream from a future point in WAL that
+		 * hasn't been flushed to disk in this server yet.
+		 */
+		if (FlushPtr < cmd->startpoint)
+		{
+			ereport(ERROR,
+					(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
+							LSN_FORMAT_ARGS(cmd->startpoint),
+							LSN_FORMAT_ARGS(FlushPtr))));
+		}

-	/* Infinite send loop, never returns */
-	WalSndLoop();
+		/* Start streaming from the requested point */
+		sentPtr = cmd->startpoint;

-	WalSndSetState(WALSNDSTATE_STARTUP);
+		/* Initialize shared memory status, too */
+		SpinLockAcquire(&MyWalSnd->mutex);
+		MyWalSnd->sentPtr = sentPtr;
+		SpinLockRelease(&MyWalSnd->mutex);
+
+		SyncRepInitConfig();
+
+		/* Main loop of walsender */
+		replication_active = true;
+
+		WalSndLoop(XLogSendPhysical);
+
+		replication_active = false;
+		if (got_STOPPING)
+			proc_exit(0);
+		WalSndSetState(WALSNDSTATE_STARTUP);
+
+		Assert(streamingDoneSending && streamingDoneReceiving);
+	}

 	if (cmd->slotname)
 		ReplicationSlotRelease();
+
+	/*
+	 * Copy is finished now. Send a single-row result set indicating the next
+	 * timeline.
+	 */
+	if (sendTimeLineIsHistoric)
+	{
+		char		startpos_str[8 + 1 + 8 + 1];
+		DestReceiver *dest;
+		TupOutputState *tstate;
+		TupleDesc	tupdesc;
+		Datum		values[2];
+		bool		nulls[2];
+
+		snprintf(startpos_str, sizeof(startpos_str), "%X/%X",
+				 LSN_FORMAT_ARGS(sendTimeLineValidUpto));
+
+		dest = CreateDestReceiver(DestRemoteSimple);
+		MemSet(nulls, false, sizeof(nulls));
+
+		/*
+		 * Need a tuple descriptor representing two columns. int8 may seem
+		 * like a surprising data type for this, but in theory int4 would not
+		 * be wide enough for this, as TimeLineID is unsigned.
+		 */
+		tupdesc = CreateTemplateTupleDesc(2);
+		TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "next_tli",
+								  INT8OID, -1, 0);
+		TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "next_tli_startpos",
+								  TEXTOID, -1, 0);
+
+		/* prepare for projection of tuple */
+		tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual);
+
+		values[0] = Int64GetDatum((int64) sendTimeLineNextTLI);
+		values[1] = CStringGetTextDatum(startpos_str);
+
+		/* send it to dest */
+		do_tup_output(tstate, values, nulls);
+
+		end_tup_output(tstate);
+	}
+
+	/* Send CommandComplete message */
+	EndReplicationCommand("START_STREAMING");
 }

-/*
- * Main loop that waits for LSN updates and calls the walproposer.
- * Synchronous replication sets latch in WalSndWakeup at walsender.c
- */
-static void
-WalSndLoop(void)
+#if PG_VERSION_NUM >= 150000
+static XLogRecPtr
+GetStandbyFlushRecPtr(TimeLineID *tli)
 {
+	XLogRecPtr	replayPtr;
+	TimeLineID	replayTLI;
+	XLogRecPtr	receivePtr;
+	TimeLineID	receiveTLI;
+	XLogRecPtr	result;
+
+	/*
+	 * We can safely send what's already been replayed. Also, if walreceiver
+	 * is streaming WAL from the same timeline, we can send anything that it
+	 * has streamed, but hasn't been replayed yet.
+	 */
+
+	receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI);
+	replayPtr = GetXLogReplayRecPtr(&replayTLI);
+
+	*tli = replayTLI;
+
+	result = replayPtr;
+	if (receiveTLI == replayTLI && receivePtr > replayPtr)
+		result = receivePtr;
+
+	return result;
+}
+#else
+/*
+ * Returns the latest point in WAL that has been safely flushed to disk, and
+ * can be sent to the standby. This should only be called when in recovery,
+ * ie. we're streaming to a cascaded standby.
+ *
+ * As a side-effect, ThisTimeLineID is updated to the TLI of the last
+ * replayed WAL record.
+ */
+static XLogRecPtr
+GetStandbyFlushRecPtr(void)
+{
+	XLogRecPtr	replayPtr;
+	TimeLineID	replayTLI;
+	XLogRecPtr	receivePtr;
+	TimeLineID	receiveTLI;
+	XLogRecPtr	result;
+
+	/*
+	 * We can safely send what's already been replayed. Also, if walreceiver
+	 * is streaming WAL from the same timeline, we can send anything that it
+	 * has streamed, but hasn't been replayed yet.
+	 */
+
+	receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI);
+	replayPtr = GetXLogReplayRecPtr(&replayTLI);
+
+	ThisTimeLineID = replayTLI;
+
+	result = replayPtr;
+	if (receiveTLI == ThisTimeLineID && receivePtr > replayPtr)
+		result = receivePtr;
+
+	return result;
+}
+#endif
+
+
+
+/* XLogReaderRoutine->segment_open callback */
+static void
+WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
+				  TimeLineID *tli_p)
+{
+	char		path[MAXPGPATH];
+
+	/*-------
+	 * When reading from a historic timeline, and there is a timeline switch
+	 * within this segment, read from the WAL segment belonging to the new
+	 * timeline.
+	 *
+	 * For example, imagine that this server is currently on timeline 5, and
+	 * we're streaming timeline 4. The switch from timeline 4 to 5 happened at
+	 * 0/13002088. In pg_wal, we have these files:
+	 *
+	 * ...
+	 * 000000040000000000000012
+	 * 000000040000000000000013
+	 * 000000050000000000000013
+	 * 000000050000000000000014
+	 * ...
+	 *
+	 * In this situation, when requested to send the WAL from segment 0x13, on
+	 * timeline 4, we read the WAL from file 000000050000000000000013. Archive
+	 * recovery prefers files from newer timelines, so if the segment was
+	 * restored from the archive on this server, the file belonging to the old
+	 * timeline, 000000040000000000000013, might not exist. Their contents are
+	 * equal up to the switchpoint, because at a timeline switch, the used
+	 * portion of the old segment is copied to the new file.  -------
+	 */
+	*tli_p = sendTimeLine;
+	if (sendTimeLineIsHistoric)
+	{
+		XLogSegNo	endSegNo;
+
+		XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize);
+		if (nextSegNo == endSegNo)
+			*tli_p = sendTimeLineNextTLI;
+	}
+
+	XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize);
+	state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
+	if (state->seg.ws_file >= 0)
+		return;
+
+	/*
+	 * If the file is not found, assume it's because the standby asked for a
+	 * too old WAL segment that has already been removed or recycled.
+	 */
+	if (errno == ENOENT)
+	{
+		char		xlogfname[MAXFNAMELEN];
+		int			save_errno = errno;
+
+		XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size);
+		errno = save_errno;
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("requested WAL segment %s has already been removed",
+						xlogfname)));
+	}
+	else
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m",
+						path)));
+}
+
+
+/* Main loop of walsender process that streams the WAL over Copy messages. */
+static void
+WalSndLoop(WalSndSendDataCallback send_data)
+{
+	/*
+	 * Initialize the last reply timestamp. That enables timeout processing
+	 * from hereon.
+	 */
+	last_reply_timestamp = GetCurrentTimestamp();
+	waiting_for_ping_response = false;
+
+	/*
+	 * Loop until we reach the end of this timeline or the client requests to
+	 * stop streaming.
+	 */
 	for (;;)
 	{
 		/* Clear any already-pending wakeups */
@@ -565,41 +911,153 @@ WalSndLoop(void)

 		CHECK_FOR_INTERRUPTS();

-		XLogBroadcastWalProposer();
+		/* Process any requests or signals received recently */
+		if (ConfigReloadPending)
+		{
+			ConfigReloadPending = false;
+			ProcessConfigFile(PGC_SIGHUP);
+			SyncRepInitConfig();
+		}

-		if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
-			WalSndSetState(WALSNDSTATE_STREAMING);
-		WalProposerPoll();
+		/* always true */
+		if (am_wal_proposer)
+		{
+			send_data();
+			if (WalSndCaughtUp)
+			{
+				if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
+					WalSndSetState(WALSNDSTATE_STREAMING);
+				WalProposerPoll();
+				WalSndCaughtUp = false;
+			}
+			continue;
+		}
 	}
 }

 /*
- * Notify walproposer about the new WAL position.
+ * Send out the WAL in its normal physical/stored form.
+ *
+ * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
+ * but not yet sent to the client, and buffer it in the libpq output
+ * buffer.
+ *
+ * If there is no unsent WAL remaining, WalSndCaughtUp is set to true,
+ * otherwise WalSndCaughtUp is set to false.
 */
 static void
-XLogBroadcastWalProposer(void)
+XLogSendPhysical(void)
 {
+	XLogRecPtr	SendRqstPtr;
 	XLogRecPtr	startptr;
 	XLogRecPtr	endptr;
+	Size		nbytes PG_USED_FOR_ASSERTS_ONLY;
+	TimeLineID	currTLI;

-	/* Start from the last sent position */
-	startptr = sentPtr;
+	/* If requested switch the WAL sender to the stopping state. */
+	if (got_STOPPING)
+		WalSndSetState(WALSNDSTATE_STOPPING);

-	/*
-	 * Streaming the current timeline on a primary.
-	 *
-	 * Attempt to send all data that's already been written out and
-	 * fsync'd to disk.  We cannot go further than what's been written out
-	 * given the current implementation of WALRead().  And in any case
-	 * it's unsafe to send WAL that is not securely down to disk on the
-	 * primary: if the primary subsequently crashes and restarts, standbys
-	 * must not have applied any WAL that got lost on the primary.
-	 */
+	if (streamingDoneSending)
+	{
+		WalSndCaughtUp = true;
+		return;
+	}
+
+	/* Figure out how far we can safely send the WAL. */
+	if (sendTimeLineIsHistoric)
+	{
+		/*
+		 * Streaming an old timeline that's in this server's history, but is
+		 * not the one we're currently inserting or replaying. It can be
+		 * streamed up to the point where we switched off that timeline.
+		 */
+		SendRqstPtr = sendTimeLineValidUpto;
+	}
+	else if (am_cascading_walsender)
+	{
+		/*
+		 * Streaming the latest timeline on a standby.
+		 *
+		 * Attempt to send all WAL that has already been replayed, so that we
+		 * know it's valid. If we're receiving WAL through streaming
+		 * replication, it's also OK to send any WAL that has been received
+		 * but not replayed.
+		 *
+		 * The timeline we're recovering from can change, or we can be
+		 * promoted. In either case, the current timeline becomes historic. We
+		 * need to detect that so that we don't try to stream past the point
+		 * where we switched to another timeline. We check for promotion or
+		 * timeline switch after calculating FlushPtr, to avoid a race
+		 * condition: if the timeline becomes historic just after we checked
+		 * that it was still current, it's still OK to stream it up to the
+		 * FlushPtr that was calculated before it became historic.
+		 */
+		bool		becameHistoric = false;
 #if PG_VERSION_NUM >= 150000
-	endptr = GetFlushRecPtr(NULL);
+		SendRqstPtr = GetStandbyFlushRecPtr(&currTLI);
 #else
-	endptr = GetFlushRecPtr();
+		SendRqstPtr = GetStandbyFlushRecPtr();
+		currTLI = ThisTimeLineID;
 #endif
+		if (!RecoveryInProgress())
+		{
+			/*
+			 * We have been promoted. RecoveryInProgress() updated
+			 * ThisTimeLineID to the new current timeline.
+			 */
+			am_cascading_walsender = false;
+			becameHistoric = true;
+		}
+		else
+		{
+			/*
+			 * Still a cascading standby. But is the timeline we're sending
+			 * still the one recovery is recovering from? currTLI was updated
+			 * by the GetStandbyFlushRecPtr() call above.
+			 */
+			if (sendTimeLine != currTLI)
+				becameHistoric = true;
+		}
+
+		if (becameHistoric)
+		{
+			/*
+			 * The timeline we were sending has become historic. Read the
+			 * timeline history file of the new timeline to see where exactly
+			 * we forked off from the timeline we were sending.
+			 */
+			List	   *history;
+
+			history = readTimeLineHistory(currTLI);
+			sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI);
+
+			Assert(sendTimeLine < sendTimeLineNextTLI);
+			list_free_deep(history);
+
+			sendTimeLineIsHistoric = true;
+
+			SendRqstPtr = sendTimeLineValidUpto;
+		}
+	}
+	else
+	{
+		/*
+		 * Streaming the current timeline on a primary.
+		 *
+		 * Attempt to send all data that's already been written out and
+		 * fsync'd to disk.  We cannot go further than what's been written out
+		 * given the current implementation of WALRead().  And in any case
+		 * it's unsafe to send WAL that is not securely down to disk on the
+		 * primary: if the primary subsequently crashes and restarts, standbys
+		 * must not have applied any WAL that got lost on the primary.
+		 */
+#if PG_VERSION_NUM >= 150000
+		SendRqstPtr = GetFlushRecPtr(NULL);
+#else
+		SendRqstPtr = GetFlushRecPtr();
+#endif
+	}

 	/*
 	 * Record the current system time as an approximation of the time at which
@@ -625,14 +1083,91 @@ XLogBroadcastWalProposer(void)
 	 * that arbitrary LSN is eventually reported as written, flushed and
 	 * applied, so that it can measure the elapsed time.
 	 */
-	LagTrackerWrite(endptr, GetCurrentTimestamp());
+	LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp());
+
+	/*
+	 * If this is a historic timeline and we've reached the point where we
+	 * forked to the next timeline, stop streaming.
+	 *
+	 * Note: We might already have sent WAL > sendTimeLineValidUpto. The
+	 * startup process will normally replay all WAL that has been received
+	 * from the primary, before promoting, but if the WAL streaming is
+	 * terminated at a WAL page boundary, the valid portion of the timeline
+	 * might end in the middle of a WAL record. We might've already sent the
+	 * first half of that partial WAL record to the cascading standby, so that
+	 * sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby can't
+	 * replay the partial WAL record either, so it can still follow our
+	 * timeline switch.
+	 */
+	if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr)
+	{
+		/* close the current file. */
+		if (xlogreader->seg.ws_file >= 0)
+			wal_segment_close(xlogreader);
+
+		/* Send CopyDone */
+		pq_putmessage_noblock('c', NULL, 0);
+		streamingDoneSending = true;
+
+		WalSndCaughtUp = true;
+
+		elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)",
+			 LSN_FORMAT_ARGS(sendTimeLineValidUpto),
+			 LSN_FORMAT_ARGS(sentPtr));
+		return;
+	}

 	/* Do we have any work to do? */
-	Assert(startptr <= endptr);
-	if (endptr <= startptr)
+	Assert(sentPtr <= SendRqstPtr);
+	if (SendRqstPtr <= sentPtr)
+	{
+		WalSndCaughtUp = true;
 		return;
+	}

-	WalProposerBroadcast(startptr, endptr);
+	/*
+	 * Figure out how much to send in one message. If there's no more than
+	 * MAX_SEND_SIZE bytes to send, send everything. Otherwise send
+	 * MAX_SEND_SIZE bytes, but round back to logfile or page boundary.
+	 *
+	 * The rounding is not only for performance reasons. Walreceiver relies on
+	 * the fact that we never split a WAL record across two messages. Since a
+	 * long WAL record is split at page boundary into continuation records,
+	 * page boundary is always a safe cut-off point. We also assume that
+	 * SendRqstPtr never points to the middle of a WAL record.
+	 */
+	startptr = sentPtr;
+	endptr = startptr;
+	endptr += MAX_SEND_SIZE;
+
+	/* if we went beyond SendRqstPtr, back off */
+	if (SendRqstPtr <= endptr)
+	{
+		endptr = SendRqstPtr;
+		if (sendTimeLineIsHistoric)
+			WalSndCaughtUp = false;
+		else
+			WalSndCaughtUp = true;
+	}
+	else
+	{
+		/* round down to page boundary. */
+		endptr -= (endptr % XLOG_BLCKSZ);
+		WalSndCaughtUp = false;
+	}
+
+	nbytes = endptr - startptr;
+	Assert(nbytes <= MAX_SEND_SIZE);
+
+	/* always true */
+	if (am_wal_proposer)
+	{
+		WalProposerBroadcast(startptr, endptr);
+	}
+	else
+	{
+		/* code removed for brevity */
+	}
 	sentPtr = endptr;

 	/* Update shared memory status */
--- a/poetry.lock
+++ b/poetry.lock
@@ -887,34 +887,34 @@ files = [

 [[package]]
 name = "cryptography"
-version = "41.0.3"
+version = "41.0.2"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:652627a055cb52a84f8c448185922241dd5217443ca194d5739b44612c5e6507"},
-    {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8f09daa483aedea50d249ef98ed500569841d6498aa9c9f4b0531b9964658922"},
-    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fd871184321100fb400d759ad0cddddf284c4b696568204d281c902fc7b0d81"},
-    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84537453d57f55a50a5b6835622ee405816999a7113267739a1b4581f83535bd"},
-    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3fb248989b6363906827284cd20cca63bb1a757e0a2864d4c1682a985e3dca47"},
-    {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:42cb413e01a5d36da9929baa9d70ca90d90b969269e5a12d39c1e0d475010116"},
-    {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:aeb57c421b34af8f9fe830e1955bf493a86a7996cc1338fe41b30047d16e962c"},
-    {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6af1c6387c531cd364b72c28daa29232162010d952ceb7e5ca8e2827526aceae"},
-    {file = "cryptography-41.0.3-cp37-abi3-win32.whl", hash = "sha256:0d09fb5356f975974dbcb595ad2d178305e5050656affb7890a1583f5e02a306"},
-    {file = "cryptography-41.0.3-cp37-abi3-win_amd64.whl", hash = "sha256:a983e441a00a9d57a4d7c91b3116a37ae602907a7618b882c8013b5762e80574"},
-    {file = "cryptography-41.0.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5259cb659aa43005eb55a0e4ff2c825ca111a0da1814202c64d28a985d33b087"},
-    {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:67e120e9a577c64fe1f611e53b30b3e69744e5910ff3b6e97e935aeb96005858"},
-    {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7efe8041897fe7a50863e51b77789b657a133c75c3b094e51b5e4b5cec7bf906"},
-    {file = "cryptography-41.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce785cf81a7bdade534297ef9e490ddff800d956625020ab2ec2780a556c313e"},
-    {file = "cryptography-41.0.3-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:57a51b89f954f216a81c9d057bf1a24e2f36e764a1ca9a501a6964eb4a6800dd"},
-    {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c2f0d35703d61002a2bbdcf15548ebb701cfdd83cdc12471d2bae80878a4207"},
-    {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:23c2d778cf829f7d0ae180600b17e9fceea3c2ef8b31a99e3c694cbbf3a24b84"},
-    {file = "cryptography-41.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:95dd7f261bb76948b52a5330ba5202b91a26fbac13ad0e9fc8a3ac04752058c7"},
-    {file = "cryptography-41.0.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:41d7aa7cdfded09b3d73a47f429c298e80796c8e825ddfadc84c8a7f12df212d"},
-    {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d0d651aa754ef58d75cec6edfbd21259d93810b73f6ec246436a21b7841908de"},
-    {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ab8de0d091acbf778f74286f4989cf3d1528336af1b59f3e5d2ebca8b5fe49e1"},
-    {file = "cryptography-41.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a74fbcdb2a0d46fe00504f571a2a540532f4c188e6ccf26f1f178480117b33c4"},
-    {file = "cryptography-41.0.3.tar.gz", hash = "sha256:6d192741113ef5e30d89dcb5b956ef4e1578f304708701b8b73d38e3e1461f34"},
+    {file = "cryptography-41.0.2-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:01f1d9e537f9a15b037d5d9ee442b8c22e3ae11ce65ea1f3316a41c78756b711"},
+    {file = "cryptography-41.0.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:079347de771f9282fbfe0e0236c716686950c19dee1b76240ab09ce1624d76d7"},
+    {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:439c3cc4c0d42fa999b83ded80a9a1fb54d53c58d6e59234cfe97f241e6c781d"},
+    {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f14ad275364c8b4e525d018f6716537ae7b6d369c094805cae45300847e0894f"},
+    {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:84609ade00a6ec59a89729e87a503c6e36af98ddcd566d5f3be52e29ba993182"},
+    {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:49c3222bb8f8e800aead2e376cbef687bc9e3cb9b58b29a261210456a7783d83"},
+    {file = "cryptography-41.0.2-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:d73f419a56d74fef257955f51b18d046f3506270a5fd2ac5febbfa259d6c0fa5"},
+    {file = "cryptography-41.0.2-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:2a034bf7d9ca894720f2ec1d8b7b5832d7e363571828037f9e0c4f18c1b58a58"},
+    {file = "cryptography-41.0.2-cp37-abi3-win32.whl", hash = "sha256:d124682c7a23c9764e54ca9ab5b308b14b18eba02722b8659fb238546de83a76"},
+    {file = "cryptography-41.0.2-cp37-abi3-win_amd64.whl", hash = "sha256:9c3fe6534d59d071ee82081ca3d71eed3210f76ebd0361798c74abc2bcf347d4"},
+    {file = "cryptography-41.0.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a719399b99377b218dac6cf547b6ec54e6ef20207b6165126a280b0ce97e0d2a"},
+    {file = "cryptography-41.0.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:182be4171f9332b6741ee818ec27daff9fb00349f706629f5cbf417bd50e66fd"},
+    {file = "cryptography-41.0.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7a9a3bced53b7f09da251685224d6a260c3cb291768f54954e28f03ef14e3766"},
+    {file = "cryptography-41.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f0dc40e6f7aa37af01aba07277d3d64d5a03dc66d682097541ec4da03cc140ee"},
+    {file = "cryptography-41.0.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:674b669d5daa64206c38e507808aae49904c988fa0a71c935e7006a3e1e83831"},
+    {file = "cryptography-41.0.2-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7af244b012711a26196450d34f483357e42aeddb04128885d95a69bd8b14b69b"},
+    {file = "cryptography-41.0.2-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9b6d717393dbae53d4e52684ef4f022444fc1cce3c48c38cb74fca29e1f08eaa"},
+    {file = "cryptography-41.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:192255f539d7a89f2102d07d7375b1e0a81f7478925b3bc2e0549ebf739dae0e"},
+    {file = "cryptography-41.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f772610fe364372de33d76edcd313636a25684edb94cee53fd790195f5989d14"},
+    {file = "cryptography-41.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:b332cba64d99a70c1e0836902720887fb4529ea49ea7f5462cf6640e095e11d2"},
+    {file = "cryptography-41.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9a6673c1828db6270b76b22cc696f40cde9043eb90373da5c2f8f2158957f42f"},
+    {file = "cryptography-41.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:342f3767e25876751e14f8459ad85e77e660537ca0a066e10e75df9c9e9099f0"},
+    {file = "cryptography-41.0.2.tar.gz", hash = "sha256:7d230bf856164de164ecb615ccc14c7fc6de6906ddd5b491f3af90d3514c925c"},
 ]

 [package.dependencies]
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -13,7 +13,6 @@ bytes = { workspace = true, features = ["serde"] }
 chrono.workspace = true
 clap.workspace = true
 consumption_metrics.workspace = true
-dashmap.workspace = true
 futures.workspace = true
 git-version.workspace = true
 hashbrown.workspace = true
@@ -30,7 +29,7 @@ metrics.workspace = true
 once_cell.workspace = true
 opentelemetry.workspace = true
 parking_lot.workspace = true
-pbkdf2 = { workspace = true, features = ["simple", "std"] }
+pbkdf2.workspace = true
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
 pq_proto.workspace = true
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -53,12 +53,6 @@ pub enum BackendType<'a, T> {
    Postgres(Cow<'a, console::provider::mock::Api>, T),
    /// Authentication via a web browser.
    Link(Cow<'a, url::ApiUrl>),
-    /// Test backend.
-    Test(&'a dyn TestBackend),
-}
-
-pub trait TestBackend: Send + Sync + 'static {
-    fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
 }

 impl std::fmt::Display for BackendType<'_, ()> {
@@ -68,7 +62,6 @@ impl std::fmt::Display for BackendType<'_, ()> {
            Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(),
            Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(),
            Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
-            Test(_) => fmt.debug_tuple("Test").finish(),
        }
    }
 }
@@ -82,7 +75,6 @@ impl<T> BackendType<'_, T> {
            Console(c, x) => Console(Cow::Borrowed(c), x),
            Postgres(c, x) => Postgres(Cow::Borrowed(c), x),
            Link(c) => Link(Cow::Borrowed(c)),
-            Test(x) => Test(*x),
        }
    }
 }
@@ -97,7 +89,6 @@ impl<'a, T> BackendType<'a, T> {
            Console(c, x) => Console(c, f(x)),
            Postgres(c, x) => Postgres(c, f(x)),
            Link(c) => Link(c),
-            Test(x) => Test(x),
        }
    }
 }
@@ -111,7 +102,6 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
            Console(c, x) => x.map(|x| Console(c, x)),
            Postgres(c, x) => x.map(|x| Postgres(c, x)),
            Link(c) => Ok(Link(c)),
-            Test(x) => Ok(Test(x)),
        }
    }
 }
@@ -157,7 +147,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
            Console(_, creds) => creds.project.clone(),
            Postgres(_, creds) => creds.project.clone(),
            Link(_) => Some("link".to_owned()),
-            Test(_) => Some("test".to_owned()),
        }
    }
    /// Authenticate the client via the requested backend, possibly using credentials.
@@ -199,9 +188,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
                    .await?
                    .map(CachedNodeInfo::new_uncached)
            }
-            Test(_) => {
-                unreachable!("this function should never be called in the test backend")
-            }
        };

        info!("user successfully authenticated");
@@ -220,7 +206,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
            Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
            Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
            Link(_) => Ok(None),
-            Test(x) => x.wake_compute().map(Some),
        }
    }
 }
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -5,12 +5,12 @@ use crate::{
    auth::{self, AuthFlow, ClientCredentials},
    compute,
    console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
-    proxy::{handle_try_wake, retry_after},
+    proxy::{try_wake, NUM_RETRIES_CONNECT},
    sasl, scram,
    stream::PqStream,
 };
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{error, info, warn};
+use tracing::info;

 pub(super) async fn authenticate(
    api: &impl console::Api,
@@ -36,18 +36,7 @@ pub(super) async fn authenticate(
        AuthInfo::Scram(secret) => {
            info!("auth endpoint chooses SCRAM");
            let scram = auth::Scram(&secret);
-
-            let auth_flow = flow.begin(scram).await.map_err(|error| {
-                warn!(?error, "error sending scram acknowledgement");
-                error
-            })?;
-
-            let auth_outcome = auth_flow.authenticate().await.map_err(|error| {
-                warn!(?error, "error processing scram messages");
-                error
-            })?;
-
-            let client_key = match auth_outcome {
+            let client_key = match flow.begin(scram).await?.authenticate().await? {
                sasl::Outcome::Success(key) => key,
                sasl::Outcome::Failure(reason) => {
                    info!("auth backend failed with an error: {reason}");
@@ -64,21 +53,12 @@ pub(super) async fn authenticate(

    let mut num_retries = 0;
    let mut node = loop {
-        let wake_res = api.wake_compute(extra, creds).await;
-        match handle_try_wake(wake_res, num_retries) {
-            Err(e) => {
-                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
-                return Err(e.into());
-            }
-            Ok(ControlFlow::Continue(e)) => {
-                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
-            }
-            Ok(ControlFlow::Break(n)) => break n,
-        }
-
-        let wait_duration = retry_after(num_retries);
        num_retries += 1;
-        tokio::time::sleep(wait_duration).await;
+        match try_wake(api, extra, creds).await? {
+            ControlFlow::Break(n) => break n,
+            ControlFlow::Continue(_) if num_retries < NUM_RETRIES_CONNECT => continue,
+            ControlFlow::Continue(e) => return Err(e.into()),
+        }
    };
    if let Some(keys) = scram_keys {
        use tokio_postgres::config::AuthKeys;
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -230,8 +230,7 @@ pub struct PostgresConnection {
 }

 impl ConnCfg {
-    /// Connect to a corresponding compute node.
-    pub async fn connect(
+    async fn do_connect(
        &self,
        allow_self_signed_compute: bool,
        timeout: Duration,
@@ -271,6 +270,20 @@ impl ConnCfg {

        Ok(connection)
    }
+
+    /// Connect to a corresponding compute node.
+    pub async fn connect(
+        &self,
+        allow_self_signed_compute: bool,
+        timeout: Duration,
+    ) -> Result<PostgresConnection, ConnectionError> {
+        self.do_connect(allow_self_signed_compute, timeout)
+            .inspect_err(|err| {
+                // Immediately log the error we have at our disposal.
+                error!("couldn't connect to compute node: {err}");
+            })
+            .await
+    }
 }

 /// Retrieve `options` from a startup message, dropping all proxy-secific flags.
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -8,7 +8,6 @@ use super::{
 use crate::{auth::ClientCredentials, compute, http, scram};
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use tokio::time::Instant;
 use tokio_postgres::config::SslMode;
 use tracing::{error, info, info_span, warn, Instrument};

@@ -48,9 +47,7 @@ impl Api {
                .build()?;

            info!(url = request.url().as_str(), "sending http request");
-            let start = Instant::now();
            let response = self.endpoint.execute(request).await?;
-            info!(duration = ?start.elapsed(), "received http response");
            let body = match parse_body::<GetRoleSecret>(response).await {
                Ok(body) => body,
                // Error 404 is special: it's ok not to have a secret.
@@ -91,9 +88,7 @@ impl Api {
                .build()?;

            info!(url = request.url().as_str(), "sending http request");
-            let start = Instant::now();
            let response = self.endpoint.execute(request).await?;
-            info!(duration = ?start.elapsed(), "received http response");
            let body = parse_body::<WakeCompute>(response).await?;

            // Unfortunately, ownership won't let us use `Option::ok_or` here.
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -7,14 +7,11 @@ pub mod server;
 pub mod sql_over_http;
 pub mod websocket;

-use std::{sync::Arc, time::Duration};
+use std::time::Duration;

-use futures::FutureExt;
 pub use reqwest::{Request, Response, StatusCode};
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
-use tokio::time::Instant;
-use tracing::trace;

 use crate::url::ApiUrl;
 use reqwest_middleware::RequestBuilder;
@@ -23,21 +20,13 @@ use reqwest_middleware::RequestBuilder;
 /// because it takes care of observability (OpenTelemetry).
 /// We deliberately don't want to replace this with a public static.
 pub fn new_client() -> ClientWithMiddleware {
-    let client = reqwest::ClientBuilder::new()
-        .dns_resolver(Arc::new(GaiResolver::default()))
-        .connection_verbose(true)
-        .build()
-        .expect("Failed to create http client");
-
-    reqwest_middleware::ClientBuilder::new(client)
+    reqwest_middleware::ClientBuilder::new(reqwest::Client::new())
        .with(reqwest_tracing::TracingMiddleware::default())
        .build()
 }

 pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
    let timeout_client = reqwest::ClientBuilder::new()
-        .dns_resolver(Arc::new(GaiResolver::default()))
-        .connection_verbose(true)
        .timeout(default_timout)
        .build()
        .expect("Failed to create http client with timeout");
@@ -50,10 +39,6 @@ pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware
        // As per docs, "This middleware always errors when given requests with streaming bodies".
        // That's all right because we only use this client to send `serde_json::RawValue`, which
        // is not a stream.
-        //
-        // ex-maintainer note:
-        // this limitation can be fixed if streaming is necessary.
-        // retries will still not be performed, but it wont error immediately
        .with(RetryTransientMiddleware::new_with_policy(retry_policy))
        .build()
 }
@@ -96,37 +81,6 @@ impl Endpoint {
    }
 }

-/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html
-use hyper::{
-    client::connect::dns::{GaiResolver as HyperGaiResolver, Name},
-    service::Service,
-};
-use reqwest::dns::{Addrs, Resolve, Resolving};
-#[derive(Debug)]
-pub struct GaiResolver(HyperGaiResolver);
-
-impl Default for GaiResolver {
-    fn default() -> Self {
-        Self(HyperGaiResolver::new())
-    }
-}
-
-impl Resolve for GaiResolver {
-    fn resolve(&self, name: Name) -> Resolving {
-        let this = &mut self.0.clone();
-        let start = Instant::now();
-        Box::pin(
-            Service::<Name>::call(this, name.clone()).map(move |result| {
-                let resolve_duration = start.elapsed();
-                trace!(duration = ?resolve_duration, addr = %name, "resolve host complete");
-                result
-                    .map(|addrs| -> Addrs { Box::new(addrs) })
-                    .map_err(|err| -> Box<dyn std::error::Error + Send + Sync> { Box::new(err) })
-            }),
-        )
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -1,21 +1,10 @@
 use anyhow::Context;
 use async_trait::async_trait;
-use dashmap::DashMap;
-use futures::future::poll_fn;
-use parking_lot::RwLock;
-use pbkdf2::{
-    password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString},
-    Params, Pbkdf2,
-};
+use parking_lot::Mutex;
 use pq_proto::StartupMessageParams;
-use std::sync::atomic::{self, AtomicUsize};
+use std::fmt;
 use std::{collections::HashMap, sync::Arc};
-use std::{
-    fmt,
-    task::{ready, Poll},
-};
 use tokio::time;
-use tokio_postgres::AsyncMessage;

 use crate::{auth, console};
 use crate::{compute, config};
@@ -24,8 +13,8 @@ use super::sql_over_http::MAX_RESPONSE_SIZE;

 use crate::proxy::ConnectMechanism;

-use tracing::{error, warn};
-use tracing::{info, info_span, Instrument};
+use tracing::error;
+use tracing::info;

 pub const APP_NAME: &str = "sql_over_http";
 const MAX_CONNS_PER_ENDPOINT: usize = 20;
@@ -53,44 +42,23 @@ impl fmt::Display for ConnInfo {
 }

 struct ConnPoolEntry {
-    conn: Client,
+    conn: tokio_postgres::Client,
    _last_access: std::time::Instant,
 }

-// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
+// Per-endpoint connection pool, (dbname, username) -> Vec<ConnPoolEntry>
 // Number of open connections is limited by the `max_conns_per_endpoint`.
 pub struct EndpointConnPool {
-    pools: HashMap<(String, String), DbUserConnPool>,
+    pools: HashMap<(String, String), Vec<ConnPoolEntry>>,
    total_conns: usize,
 }

-/// This is cheap and not hugely secure.
-/// But probably good enough for in memory only hashes.
-///
-/// Still takes 3.5ms to hash on my hardware.
-/// We don't want to ruin the latency improvements of using the pool by making password verification take too long
-const PARAMS: Params = Params {
-    rounds: 10_000,
-    output_length: 32,
-};
-
-#[derive(Default)]
-pub struct DbUserConnPool {
-    conns: Vec<ConnPoolEntry>,
-    password_hash: Option<PasswordHashString>,
-}
-
 pub struct GlobalConnPool {
    // endpoint -> per-endpoint connection pool
    //
    // That should be a fairly conteded map, so return reference to the per-endpoint
    // pool as early as possible and release the lock.
-    global_pool: DashMap<String, Arc<RwLock<EndpointConnPool>>>,
-
-    /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
-    /// That seems like far too much effort, so we're using a relaxed increment counter instead.
-    /// It's only used for diagnostics.
-    global_pool_size: AtomicUsize,
+    global_pool: Mutex<HashMap<String, Arc<Mutex<EndpointConnPool>>>>,

    // Maximum number of connections per one endpoint.
    // Can mix different (dbname, username) connections.
@@ -104,8 +72,7 @@ pub struct GlobalConnPool {
 impl GlobalConnPool {
    pub fn new(config: &'static crate::config::ProxyConfig) -> Arc<Self> {
        Arc::new(Self {
-            global_pool: DashMap::new(),
-            global_pool_size: AtomicUsize::new(0),
+            global_pool: Mutex::new(HashMap::new()),
            max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT,
            proxy_config: config,
        })
@@ -115,125 +82,70 @@ impl GlobalConnPool {
        &self,
        conn_info: &ConnInfo,
        force_new: bool,
-        session_id: uuid::Uuid,
-    ) -> anyhow::Result<Client> {
-        let mut client: Option<Client> = None;
+    ) -> anyhow::Result<tokio_postgres::Client> {
+        let mut client: Option<tokio_postgres::Client> = None;

-        let mut hash_valid = false;
        if !force_new {
-            let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
-            let mut hash = None;
+            let pool = self.get_endpoint_pool(&conn_info.hostname).await;

            // find a pool entry by (dbname, username) if exists
-            {
-                let pool = pool.read();
-                if let Some(pool_entries) = pool.pools.get(&conn_info.db_and_user()) {
-                    if !pool_entries.conns.is_empty() {
-                        hash = pool_entries.password_hash.clone();
-                    }
-                }
-            }
-
-            // a connection exists in the pool, verify the password hash
-            if let Some(hash) = hash {
-                let pw = conn_info.password.clone();
-                let validate = tokio::task::spawn_blocking(move || {
-                    Pbkdf2.verify_password(pw.as_bytes(), &hash.password_hash())
-                })
-                .await?;
-
-                // if the hash is invalid, don't error
-                // we will continue with the regular connection flow
-                if validate.is_ok() {
-                    hash_valid = true;
-                    let mut pool = pool.write();
-                    if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
-                        if let Some(entry) = pool_entries.conns.pop() {
-                            client = Some(entry.conn);
-                            pool.total_conns -= 1;
-                        }
-                    }
+            let mut pool = pool.lock();
+            let pool_entries = pool.pools.get_mut(&conn_info.db_and_user());
+            if let Some(pool_entries) = pool_entries {
+                if let Some(entry) = pool_entries.pop() {
+                    client = Some(entry.conn);
+                    pool.total_conns -= 1;
                }
            }
        }

        // ok return cached connection if found and establish a new one otherwise
-        let new_client = if let Some(client) = client {
-            if client.inner.is_closed() {
+        if let Some(client) = client {
+            if client.is_closed() {
                info!("pool: cached connection '{conn_info}' is closed, opening a new one");
-                connect_to_compute(self.proxy_config, conn_info, session_id).await
+                connect_to_compute(self.proxy_config, conn_info).await
            } else {
                info!("pool: reusing connection '{conn_info}'");
-                client.session.send(session_id)?;
-                return Ok(client);
+                Ok(client)
            }
        } else {
            info!("pool: opening a new connection '{conn_info}'");
-            connect_to_compute(self.proxy_config, conn_info, session_id).await
-        };
-
-        match &new_client {
-            // clear the hash. it's no longer valid
-            // TODO: update tokio-postgres fork to allow access to this error kind directly
-            Err(err)
-                if hash_valid && err.to_string().contains("password authentication failed") =>
-            {
-                let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
-                let mut pool = pool.write();
-                if let Some(entry) = pool.pools.get_mut(&conn_info.db_and_user()) {
-                    entry.password_hash = None;
-                }
-            }
-            // new password is valid and we should insert/update it
-            Ok(_) if !force_new && !hash_valid => {
-                let pw = conn_info.password.clone();
-                let new_hash = tokio::task::spawn_blocking(move || {
-                    let salt = SaltString::generate(rand::rngs::OsRng);
-                    Pbkdf2
-                        .hash_password_customized(pw.as_bytes(), None, None, PARAMS, &salt)
-                        .map(|s| s.serialize())
-                })
-                .await??;
-
-                let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
-                let mut pool = pool.write();
-                pool.pools
-                    .entry(conn_info.db_and_user())
-                    .or_default()
-                    .password_hash = Some(new_hash);
-            }
-            _ => {}
+            connect_to_compute(self.proxy_config, conn_info).await
        }
-
-        new_client
    }

-    pub async fn put(&self, conn_info: &ConnInfo, client: Client) -> anyhow::Result<()> {
-        let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
+    pub async fn put(
+        &self,
+        conn_info: &ConnInfo,
+        client: tokio_postgres::Client,
+    ) -> anyhow::Result<()> {
+        let pool = self.get_endpoint_pool(&conn_info.hostname).await;

        // return connection to the pool
+        let mut total_conns;
        let mut returned = false;
        let mut per_db_size = 0;
-        let total_conns = {
-            let mut pool = pool.write();
+        {
+            let mut pool = pool.lock();
+            total_conns = pool.total_conns;

-            if pool.total_conns < self.max_conns_per_endpoint {
-                // we create this db-user entry in get, so it should not be None
-                if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
-                    pool_entries.conns.push(ConnPoolEntry {
-                        conn: client,
-                        _last_access: std::time::Instant::now(),
-                    });
+            let pool_entries: &mut Vec<ConnPoolEntry> = pool
+                .pools
+                .entry(conn_info.db_and_user())
+                .or_insert_with(|| Vec::with_capacity(1));
+            if total_conns < self.max_conns_per_endpoint {
+                pool_entries.push(ConnPoolEntry {
+                    conn: client,
+                    _last_access: std::time::Instant::now(),
+                });

-                    returned = true;
-                    per_db_size = pool_entries.conns.len();
+                total_conns += 1;
+                returned = true;
+                per_db_size = pool_entries.len();

-                    pool.total_conns += 1;
-                }
+                pool.total_conns += 1;
            }
-
-            pool.total_conns
-        };
+        }

        // do logging outside of the mutex
        if returned {
@@ -245,35 +157,25 @@ impl GlobalConnPool {
        Ok(())
    }

-    fn get_or_create_endpoint_pool(&self, endpoint: &String) -> Arc<RwLock<EndpointConnPool>> {
-        // fast path
-        if let Some(pool) = self.global_pool.get(endpoint) {
-            return pool.clone();
-        }
-
-        // slow path
-        let new_pool = Arc::new(RwLock::new(EndpointConnPool {
-            pools: HashMap::new(),
-            total_conns: 0,
-        }));
-
+    async fn get_endpoint_pool(&self, endpoint: &String) -> Arc<Mutex<EndpointConnPool>> {
        // find or create a pool for this endpoint
        let mut created = false;
-        let pool = self
-            .global_pool
+        let mut global_pool = self.global_pool.lock();
+        let pool = global_pool
            .entry(endpoint.clone())
            .or_insert_with(|| {
                created = true;
-                new_pool
+                Arc::new(Mutex::new(EndpointConnPool {
+                    pools: HashMap::new(),
+                    total_conns: 0,
+                }))
            })
            .clone();
+        let global_pool_size = global_pool.len();
+        drop(global_pool);

        // log new global pool size
        if created {
-            let global_pool_size = self
-                .global_pool_size
-                .fetch_add(1, atomic::Ordering::Relaxed)
-                + 1;
            info!(
                "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}"
            );
@@ -285,12 +187,11 @@ impl GlobalConnPool {

 struct TokioMechanism<'a> {
    conn_info: &'a ConnInfo,
-    session_id: uuid::Uuid,
 }

 #[async_trait]
 impl ConnectMechanism for TokioMechanism<'_> {
-    type Connection = Client;
+    type Connection = tokio_postgres::Client;
    type ConnectError = tokio_postgres::Error;
    type Error = anyhow::Error;

@@ -299,7 +200,7 @@ impl ConnectMechanism for TokioMechanism<'_> {
        node_info: &console::CachedNodeInfo,
        timeout: time::Duration,
    ) -> Result<Self::Connection, Self::ConnectError> {
-        connect_to_compute_once(node_info, self.conn_info, timeout, self.session_id).await
+        connect_to_compute_once(node_info, self.conn_info, timeout).await
    }

    fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
@@ -312,8 +213,7 @@ impl ConnectMechanism for TokioMechanism<'_> {
 async fn connect_to_compute(
    config: &config::ProxyConfig,
    conn_info: &ConnInfo,
-    session_id: uuid::Uuid,
-) -> anyhow::Result<Client> {
+) -> anyhow::Result<tokio_postgres::Client> {
    let tls = config.tls_config.as_ref();
    let common_names = tls.and_then(|tls| tls.common_names.clone());

@@ -344,27 +244,17 @@ async fn connect_to_compute(
        .await?
        .context("missing cache entry from wake_compute")?;

-    crate::proxy::connect_to_compute(
-        &TokioMechanism {
-            conn_info,
-            session_id,
-        },
-        node_info,
-        &extra,
-        &creds,
-    )
-    .await
+    crate::proxy::connect_to_compute(&TokioMechanism { conn_info }, node_info, &extra, &creds).await
 }

 async fn connect_to_compute_once(
    node_info: &console::CachedNodeInfo,
    conn_info: &ConnInfo,
    timeout: time::Duration,
-    mut session: uuid::Uuid,
-) -> Result<Client, tokio_postgres::Error> {
+) -> Result<tokio_postgres::Client, tokio_postgres::Error> {
    let mut config = (*node_info.config).clone();

-    let (client, mut connection) = config
+    let (client, connection) = config
        .user(&conn_info.username)
        .password(&conn_info.password)
        .dbname(&conn_info.dbname)
@@ -373,53 +263,11 @@ async fn connect_to_compute_once(
        .connect(tokio_postgres::NoTls)
        .await?;

-    let (tx, mut rx) = tokio::sync::watch::channel(session);
-
-    let conn_id = uuid::Uuid::new_v4();
-    let span = info_span!(parent: None, "connection", %conn_info, %conn_id);
-    span.in_scope(|| {
-        info!(%session, "new connection");
+    tokio::spawn(async move {
+        if let Err(e) = connection.await {
+            error!("connection error: {}", e);
+        }
    });

-    tokio::spawn(
-        poll_fn(move |cx| {
-            if matches!(rx.has_changed(), Ok(true)) {
-                session = *rx.borrow_and_update();
-                info!(%session, "changed session");
-            }
-
-            let message = ready!(connection.poll_message(cx));
-
-            match message {
-                Some(Ok(AsyncMessage::Notice(notice))) => {
-                    info!(%session, "notice: {}", notice);
-                    Poll::Pending
-                }
-                Some(Ok(AsyncMessage::Notification(notif))) => {
-                    warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
-                    Poll::Pending
-                }
-                Some(Ok(_)) => {
-                    warn!(%session, "unknown message");
-                    Poll::Pending
-                }
-                Some(Err(e)) => {
-                    error!(%session, "connection error: {}", e);
-                    Poll::Ready(())
-                }
-                None => Poll::Ready(()),
-            }
-        })
-        .instrument(span)
-    );
-
-    Ok(Client {
-        inner: client,
-        session: tx,
-    })
-}
-
-pub struct Client {
-    pub inner: tokio_postgres::Client,
-    session: tokio::sync::watch::Sender<uuid::Uuid>,
+    Ok(client)
 }
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -1,9 +1,7 @@
 use std::sync::Arc;

-use anyhow::bail;
 use futures::pin_mut;
 use futures::StreamExt;
-use hashbrown::HashMap;
 use hyper::body::HttpBody;
 use hyper::http::HeaderName;
 use hyper::http::HeaderValue;
@@ -14,9 +12,7 @@ use serde_json::Value;
 use tokio_postgres::types::Kind;
 use tokio_postgres::types::Type;
 use tokio_postgres::GenericClient;
-use tokio_postgres::IsolationLevel;
 use tokio_postgres::Row;
-use tracing::Instrument;
 use url::Url;

 use super::conn_pool::ConnInfo;
@@ -28,27 +24,19 @@ struct QueryData {
    params: Vec<serde_json::Value>,
 }

-#[derive(serde::Deserialize)]
-struct BatchQueryData {
-    queries: Vec<QueryData>,
-}
-
 #[derive(serde::Deserialize)]
 #[serde(untagged)]
 enum Payload {
    Single(QueryData),
-    Batch(BatchQueryData),
+    Batch(Vec<QueryData>),
 }

-pub const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MB
+pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
 const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB

 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
 static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
 static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in");
-static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level");
-static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only");
-static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrable");

 static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");

@@ -182,8 +170,7 @@ pub async fn handle(
    request: Request<Body>,
    sni_hostname: Option<String>,
    conn_pool: Arc<GlobalConnPool>,
-    session_id: uuid::Uuid,
-) -> anyhow::Result<(Value, HashMap<HeaderName, HeaderValue>)> {
+) -> anyhow::Result<Value> {
    //
    // Determine the destination and connection params
    //
@@ -198,23 +185,6 @@ pub async fn handle(
    // Allow connection pooling only if explicitly requested
    let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);

-    // isolation level, read only and deferrable
-
-    let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned();
-    let txn_isolation_level = match txn_isolation_level_raw {
-        Some(ref x) => Some(match x.as_bytes() {
-            b"Serializable" => IsolationLevel::Serializable,
-            b"ReadUncommitted" => IsolationLevel::ReadUncommitted,
-            b"ReadCommitted" => IsolationLevel::ReadCommitted,
-            b"RepeatableRead" => IsolationLevel::RepeatableRead,
-            _ => bail!("invalid isolation level"),
-        }),
-        None => None,
-    };
-
-    let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
-    let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);
-
    let request_content_length = match request.body().size_hint().upper() {
        Some(v) => v,
        None => MAX_REQUEST_SIZE + 1,
@@ -222,7 +192,7 @@ pub async fn handle(

    if request_content_length > MAX_REQUEST_SIZE {
        return Err(anyhow::anyhow!(
-            "request is too large (max is {MAX_REQUEST_SIZE} bytes)"
+            "request is too large (max {MAX_REQUEST_SIZE} bytes)"
        ));
    }

@@ -232,29 +202,17 @@ pub async fn handle(
    let body = hyper::body::to_bytes(request.into_body()).await?;
    let payload: Payload = serde_json::from_slice(&body)?;

-    let mut client = conn_pool.get(&conn_info, !allow_pool, session_id).await?;
+    let mut client = conn_pool.get(&conn_info, !allow_pool).await?;

    //
    // Now execute the query and return the result
    //
    let result = match payload {
-        Payload::Single(query) => query_to_json(&client.inner, query, raw_output, array_mode)
-            .await
-            .map(|x| (x, HashMap::default())),
-        Payload::Batch(batch_query) => {
+        Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode).await,
+        Payload::Batch(queries) => {
            let mut results = Vec::new();
-            let mut builder = client.inner.build_transaction();
-            if let Some(isolation_level) = txn_isolation_level {
-                builder = builder.isolation_level(isolation_level);
-            }
-            if txn_read_only {
-                builder = builder.read_only(true);
-            }
-            if txn_deferrable {
-                builder = builder.deferrable(true);
-            }
-            let transaction = builder.start().await?;
-            for query in batch_query.queries {
+            let transaction = client.transaction().await?;
+            for query in queries {
                let result = query_to_json(&transaction, query, raw_output, array_mode).await;
                match result {
                    Ok(r) => results.push(r),
@@ -265,34 +223,15 @@ pub async fn handle(
                }
            }
            transaction.commit().await?;
-            let mut headers = HashMap::default();
-            if txn_read_only {
-                headers.insert(
-                    TXN_READ_ONLY.clone(),
-                    HeaderValue::try_from(txn_read_only.to_string())?,
-                );
-            }
-            if txn_deferrable {
-                headers.insert(
-                    TXN_DEFERRABLE.clone(),
-                    HeaderValue::try_from(txn_deferrable.to_string())?,
-                );
-            }
-            if let Some(txn_isolation_level) = txn_isolation_level_raw {
-                headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
-            }
-            Ok((json!({ "results": results }), headers))
+            Ok(json!({ "results": results }))
        }
    };

    if allow_pool {
        // return connection to the pool
-        tokio::task::spawn(
-            async move {
-                let _ = conn_pool.put(&conn_info, client).await;
-            }
-            .in_current_span(),
-        );
+        tokio::task::spawn(async move {
+            let _ = conn_pool.put(&conn_info, client).await;
+        });
    }

    result
@@ -314,15 +253,13 @@ async fn query_to_json<T: GenericClient>(
    // big.
    pin_mut!(row_stream);
    let mut rows: Vec<tokio_postgres::Row> = Vec::new();
-    let mut current_size = 0;
+    let mut curret_size = 0;
    while let Some(row) = row_stream.next().await {
        let row = row?;
-        current_size += row.body_len();
+        curret_size += row.body_len();
        rows.push(row);
-        if current_size > MAX_RESPONSE_SIZE {
-            return Err(anyhow::anyhow!(
-                "response is too large (max is {MAX_RESPONSE_SIZE} bytes)"
-            ));
+        if curret_size > MAX_RESPONSE_SIZE {
+            return Err(anyhow::anyhow!("response too large"));
        }
    }

--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -6,7 +6,6 @@ use crate::{
 };
 use bytes::{Buf, Bytes};
 use futures::{Sink, Stream, StreamExt};
-use hashbrown::HashMap;
 use hyper::{
    server::{
        accept,
@@ -187,30 +186,26 @@ async fn ws_handler(
        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
            .map_err(|e| ApiError::BadRequest(e.into()))?;

-        tokio::spawn(
-            async move {
-                if let Err(e) =
-                    serve_websocket(websocket, config, &cancel_map, session_id, host).await
-                {
-                    error!(session_id = ?session_id, "error in websocket connection: {e:#}");
-                }
+        tokio::spawn(async move {
+            if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
+            {
+                error!(session_id = ?session_id, "error in websocket connection: {e:?}");
            }
-            .in_current_span(),
-        );
+        });

        // Return the response so the spawned future can continue.
        Ok(response)
    // TODO: that deserves a refactor as now this function also handles http json client besides websockets.
    // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
    } else if request.uri().path() == "/sql" && request.method() == Method::POST {
-        let result = sql_over_http::handle(request, sni_hostname, conn_pool, session_id)
+        let result = sql_over_http::handle(request, sni_hostname, conn_pool)
            .instrument(info_span!("sql-over-http"))
            .await;
        let status_code = match result {
            Ok(_) => StatusCode::OK,
            Err(_) => StatusCode::BAD_REQUEST,
        };
-        let (json, headers) = match result {
+        let json = match result {
            Ok(r) => r,
            Err(e) => {
                let message = format!("{:?}", e);
@@ -221,14 +216,7 @@ async fn ws_handler(
                    },
                    None => Value::Null,
                };
-                error!(
-                    ?code,
-                    "sql-over-http per-client task finished with an error: {e:#}"
-                );
-                (
-                    json!({ "message": message, "code": code }),
-                    HashMap::default(),
-                )
+                json!({ "message": message, "code": code })
            }
        };
        json_response(status_code, json).map(|mut r| {
@@ -236,9 +224,6 @@ async fn ws_handler(
                "Access-Control-Allow-Origin",
                hyper::http::HeaderValue::from_static("*"),
            );
-            for (k, v) in headers {
-                r.headers_mut().insert(k, v);
-            }
            r
        })
    } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
@@ -307,7 +292,7 @@ pub async fn task_main(
                        ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name)
                            .instrument(info_span!(
                                "ws-client",
-                                session = %session_id
+                                session = format_args!("{session_id}")
                            ))
                            .await
                    }
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -11,6 +11,7 @@ const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";

 const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);

+///
 /// Key that uniquely identifies the object, this metric describes.
 /// Currently, endpoint_id is enough, but this may change later,
 /// so keep it in a named struct.
@@ -18,7 +19,8 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
 /// Both the proxy and the ingestion endpoint will live in the same region (or cell)
 /// so while the project-id is unique across regions the whole pipeline will work correctly
 /// because we enrich the event with project_id in the control-plane endpoint.
-#[derive(Eq, Hash, PartialEq, Serialize, Debug, Clone)]
+///
+#[derive(Eq, Hash, PartialEq, Serialize, Debug)]
 pub struct Ids {
    pub endpoint_id: String,
    pub branch_id: String,
@@ -147,7 +149,7 @@ async fn collect_metrics_iteration(
                    stop_time: *curr_time,
                },
                metric: PROXY_IO_BYTES_PER_CLIENT,
-                idempotency_key: idempotency_key(hostname),
+                idempotency_key: idempotency_key(hostname.to_owned()),
                value,
                extra: Ids {
                    endpoint_id: curr_key.endpoint_id.clone(),
@@ -165,11 +167,12 @@ async fn collect_metrics_iteration(
    // Send metrics.
    // Split into chunks of 1000 metrics to avoid exceeding the max request size
    for chunk in metrics_to_send.chunks(CHUNK_SIZE) {
+        let chunk_json = serde_json::value::to_raw_value(&EventChunk { events: chunk })
+            .expect("ProxyConsumptionMetric should not fail serialization");
+
        let res = client
            .post(metric_collection_endpoint.clone())
-            .json(&EventChunk {
-                events: chunk.into(),
-            })
+            .json(&chunk_json)
            .send()
            .await;

--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -6,7 +6,7 @@ use crate::{
    cancellation::{self, CancelMap},
    compute::{self, PostgresConnection},
    config::{ProxyConfig, TlsConfig},
-    console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
+    console::{self, errors::WakeComputeError, messages::MetricsAuxInfo},
    stream::{PqStream, Stream},
 };
 use anyhow::{bail, Context};
@@ -23,7 +23,7 @@ use tokio::{
    time,
 };
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, info_span, warn, Instrument};
+use tracing::{error, info, warn};
 use utils::measured_stream::MeasuredStream;

 /// Number of times we should retry the `/proxy_wake_compute` http request.
@@ -101,20 +101,21 @@ pub async fn task_main(
        tokio::select! {
            accept_result = listener.accept() => {
                let (socket, peer_addr) = accept_result?;
+                info!("accepted postgres client connection from {peer_addr}");

                let session_id = uuid::Uuid::new_v4();
                let cancel_map = Arc::clone(&cancel_map);
                connections.spawn(
                    async move {
-                        info!("accepted postgres client connection");
+                        info!("spawned a task for {peer_addr}");

                        socket
                            .set_nodelay(true)
                            .context("failed to set socket option")?;

-                        handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp).await
+                        handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp)
+                        .await
                    }
-                    .instrument(info_span!("handle_client", ?session_id, %peer_addr))
                    .unwrap_or_else(move |e| {
                        // Acknowledge that the task has finished with an error.
                        error!(?session_id, "per-client task finished with an error: {e:#}");
@@ -182,6 +183,7 @@ impl ClientMode {
    }
 }

+#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
 pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    config: &'static ProxyConfig,
    cancel_map: &CancelMap,
@@ -345,6 +347,11 @@ async fn connect_to_compute_once(
        .await
 }

+enum ConnectionState<E> {
+    Cached(console::CachedNodeInfo),
+    Invalid(compute::ConnCfg, E),
+}
+
 #[async_trait]
 pub trait ConnectMechanism {
    type Connection;
@@ -400,73 +407,70 @@ where

    mechanism.update_connect_config(&mut node_info.config);

-    // try once
-    let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
-        Ok(res) => return Ok(res),
-        Err(e) => {
-            error!(error = ?e, "could not connect to compute node");
-            (invalidate_cache(node_info), e)
-        }
-    };
+    let mut num_retries = 0;
+    let mut state = ConnectionState::<M::ConnectError>::Cached(node_info);

-    let mut num_retries = 1;
-
-    // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
-    info!("compute node's state has likely changed; requesting a wake-up");
-    let node_info = loop {
-        let wake_res = match creds {
-            auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
-            auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
-            // nothing to do?
-            auth::BackendType::Link(_) => return Err(err.into()),
-            // test backend
-            auth::BackendType::Test(x) => x.wake_compute(),
-        };
-
-        match handle_try_wake(wake_res, num_retries) {
-            Err(e) => {
-                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
-                return Err(e.into());
-            }
-            // failed to wake up but we can continue to retry
-            Ok(ControlFlow::Continue(e)) => {
-                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
-            }
-            // successfully woke up a compute node and can break the wakeup loop
-            Ok(ControlFlow::Break(mut node_info)) => {
-                node_info.config.reuse_password(&config);
-                mechanism.update_connect_config(&mut node_info.config);
-                break node_info;
-            }
-        }
-
-        let wait_duration = retry_after(num_retries);
-        num_retries += 1;
-
-        time::sleep(wait_duration).await;
-    };
-
-    // now that we have a new node, try connect to it repeatedly.
-    // this can error for a few reasons, for instance:
-    // * DNS connection settings haven't quite propagated yet
-    info!("wake_compute success. attempting to connect");
    loop {
-        match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
-            Ok(res) => return Ok(res),
-            Err(e) => {
-                let retriable = e.should_retry(num_retries);
-                if !retriable {
-                    error!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
-                    return Err(e.into());
+        match state {
+            ConnectionState::Invalid(config, err) => {
+                let wake_res = match creds {
+                    auth::BackendType::Console(api, creds) => {
+                        try_wake(api.as_ref(), extra, creds).await
+                    }
+                    auth::BackendType::Postgres(api, creds) => {
+                        try_wake(api.as_ref(), extra, creds).await
+                    }
+                    // nothing to do?
+                    auth::BackendType::Link(_) => return Err(err.into()),
+                };
+
+                match wake_res {
+                    // there was an error communicating with the control plane
+                    Err(e) => return Err(e.into()),
+                    // failed to wake up but we can continue to retry
+                    Ok(ControlFlow::Continue(_)) => {
+                        state = ConnectionState::Invalid(config, err);
+                        let wait_duration = retry_after(num_retries);
+                        num_retries += 1;
+
+                        info!(num_retries, "retrying wake compute");
+                        time::sleep(wait_duration).await;
+                        continue;
+                    }
+                    // successfully woke up a compute node and can break the wakeup loop
+                    Ok(ControlFlow::Break(mut node_info)) => {
+                        node_info.config.reuse_password(&config);
+                        mechanism.update_connect_config(&mut node_info.config);
+                        state = ConnectionState::Cached(node_info)
+                    }
+                }
+            }
+            ConnectionState::Cached(node_info) => {
+                match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
+                    Ok(res) => return Ok(res),
+                    Err(e) => {
+                        error!(error = ?e, "could not connect to compute node");
+                        if !e.should_retry(num_retries) {
+                            return Err(e.into());
+                        }
+
+                        // only the very first connect failure invalidates the cached node
+                        // and requests a wake-up; later failures retry the same node info
+                        if num_retries == 0 {
+                            state = ConnectionState::Invalid(invalidate_cache(node_info), e);
+                        } else {
+                            state = ConnectionState::Cached(node_info);
+                        }
+
+                        let wait_duration = retry_after(num_retries);
+                        num_retries += 1;
+
+                        info!(num_retries, "retrying connect to compute");
+                        time::sleep(wait_duration).await;
+                    }
                }
-                warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
            }
        }
-
-        let wait_duration = retry_after(num_retries);
-        num_retries += 1;
-
-        time::sleep(wait_duration).await;
    }
 }

@@ -474,15 +478,15 @@ where
 /// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
 /// * Returns Ok(Break(node)) if the wakeup succeeded
 /// * Returns Err(e) if there was an error
-pub fn handle_try_wake(
-    result: Result<console::CachedNodeInfo, WakeComputeError>,
-    num_retries: u32,
+pub async fn try_wake(
+    api: &impl console::Api,
+    extra: &console::ConsoleReqExtra<'_>,
+    creds: &auth::ClientCredentials<'_>,
 ) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
-    match result {
+    info!("compute node's state has likely changed; requesting a wake-up");
+    match api.wake_compute(extra, creds).await {
        Err(err) => match &err {
-            WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
-                Ok(ControlFlow::Continue(err))
-            }
+            WakeComputeError::ApiError(api) if api.could_retry() => Ok(ControlFlow::Continue(err)),
            _ => Err(err),
        },
        // Ready to try again.
@@ -494,6 +498,8 @@ pub trait ShouldRetry {
    fn could_retry(&self) -> bool;
    fn should_retry(&self, num_retries: u32) -> bool {
        match self {
+            // retry all errors at least once
+            _ if num_retries == 0 => true,
            _ if num_retries >= NUM_RETRIES_CONNECT => false,
            err => err.could_retry(),
        }
@@ -546,8 +552,13 @@ impl ShouldRetry for compute::ConnectionError {
 }

 pub fn retry_after(num_retries: u32) -> time::Duration {
-    // 1.5 seems to be an ok growth factor heuristic
-    BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
+    match num_retries {
+        0 => time::Duration::ZERO,
+        _ => {
+            // 1.5 is an ok growth factor heuristic; f64 avoids u32 overflow of 3^n and 2^n
+            BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
+        }
+    }
 }

 /// Finish client connection initialization: confirm auth success, send params, etc.
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -1,10 +1,10 @@
 //! A group of high-level tests for connection establishing logic and auth.
-//!
+use std::borrow::Cow;
+
 use super::*;
-use crate::auth::backend::TestBackend;
 use crate::auth::ClientCredentials;
 use crate::console::{CachedNodeInfo, NodeInfo};
-use crate::{auth, http, sasl, scram};
+use crate::{auth, sasl, scram};
 use async_trait::async_trait;
 use rstest::rstest;
 use tokio_postgres::config::SslMode;
@@ -302,18 +302,15 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
 #[test]
 fn connect_compute_total_wait() {
    let mut total_wait = tokio::time::Duration::ZERO;
-    for num_retries in 1..10 {
+    for num_retries in 0..10 {
        total_wait += retry_after(num_retries);
    }
    assert!(total_wait < tokio::time::Duration::from_secs(12));
    assert!(total_wait > tokio::time::Duration::from_secs(10));
 }

-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Copy)]
 enum ConnectAction {
-    Wake,
-    WakeFail,
-    WakeRetry,
    Connect,
    Retry,
    Fail,
@@ -324,17 +321,6 @@ struct TestConnectMechanism {
    sequence: Vec<ConnectAction>,
 }

-impl TestConnectMechanism {
-    fn verify(&self) {
-        let counter = self.counter.lock().unwrap();
-        assert_eq!(
-            *counter,
-            self.sequence.len(),
-            "sequence does not proceed to the end"
-        );
-    }
-}
-
 impl TestConnectMechanism {
    fn new(sequence: Vec<ConnectAction>) -> Self {
        Self {
@@ -384,63 +370,30 @@ impl ConnectMechanism for TestConnectMechanism {
            ConnectAction::Connect => Ok(TestConnection),
            ConnectAction::Retry => Err(TestConnectError { retryable: true }),
            ConnectAction::Fail => Err(TestConnectError { retryable: false }),
-            x => panic!("expecting action {:?}, connect is called instead", x),
        }
    }

    fn update_connect_config(&self, _conf: &mut compute::ConnCfg) {}
 }

-impl TestBackend for TestConnectMechanism {
-    fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
-        let mut counter = self.counter.lock().unwrap();
-        let action = self.sequence[*counter];
-        *counter += 1;
-        match action {
-            ConnectAction::Wake => Ok(helper_create_cached_node_info()),
-            ConnectAction::WakeFail => {
-                let err = console::errors::ApiError::Console {
-                    status: http::StatusCode::FORBIDDEN,
-                    text: "TEST".into(),
-                };
-                assert!(!err.could_retry());
-                Err(console::errors::WakeComputeError::ApiError(err))
-            }
-            ConnectAction::WakeRetry => {
-                let err = console::errors::ApiError::Console {
-                    status: http::StatusCode::INTERNAL_SERVER_ERROR,
-                    text: "TEST".into(),
-                };
-                assert!(err.could_retry());
-                Err(console::errors::WakeComputeError::ApiError(err))
-            }
-            x => panic!("expecting action {:?}, wake_compute is called instead", x),
-        }
-    }
-}
-
-fn helper_create_cached_node_info() -> CachedNodeInfo {
+fn helper_create_connect_info() -> (
+    CachedNodeInfo,
+    console::ConsoleReqExtra<'static>,
+    auth::BackendType<'static, ClientCredentials<'static>>,
+) {
    let node = NodeInfo {
        config: compute::ConnCfg::new(),
        aux: Default::default(),
        allow_self_signed_compute: false,
    };
-    CachedNodeInfo::new_uncached(node)
-}
-
-fn helper_create_connect_info(
-    mechanism: &TestConnectMechanism,
-) -> (
-    CachedNodeInfo,
-    console::ConsoleReqExtra<'static>,
-    auth::BackendType<'_, ClientCredentials<'static>>,
-) {
-    let cache = helper_create_cached_node_info();
+    let cache = CachedNodeInfo::new_uncached(node);
    let extra = console::ConsoleReqExtra {
        session_id: uuid::Uuid::new_v4(),
        application_name: Some("TEST"),
    };
-    let creds = auth::BackendType::Test(mechanism);
+    let url = "https://TEST_URL".parse().unwrap();
+    let api = console::provider::mock::Api::new(url);
+    let creds = auth::BackendType::Postgres(Cow::Owned(api), ClientCredentials::new_noop());
    (cache, extra, creds)
 }

@@ -448,46 +401,42 @@ fn helper_create_connect_info(
 async fn connect_to_compute_success() {
    use ConnectAction::*;
    let mechanism = TestConnectMechanism::new(vec![Connect]);
-    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
+    let (cache, extra, creds) = helper_create_connect_info();
    connect_to_compute(&mechanism, cache, &extra, &creds)
        .await
        .unwrap();
-    mechanism.verify();
 }

 #[tokio::test]
 async fn connect_to_compute_retry() {
    use ConnectAction::*;
-    let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]);
-    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
+    let mechanism = TestConnectMechanism::new(vec![Retry, Retry, Connect]);
+    let (cache, extra, creds) = helper_create_connect_info();
    connect_to_compute(&mechanism, cache, &extra, &creds)
        .await
        .unwrap();
-    mechanism.verify();
 }

 /// Test that we don't retry if the error is not retryable.
 #[tokio::test]
 async fn connect_to_compute_non_retry_1() {
    use ConnectAction::*;
-    let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]);
-    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
+    let mechanism = TestConnectMechanism::new(vec![Retry, Retry, Fail]);
+    let (cache, extra, creds) = helper_create_connect_info();
    connect_to_compute(&mechanism, cache, &extra, &creds)
        .await
        .unwrap_err();
-    mechanism.verify();
 }

 /// Even for non-retryable errors, we should retry at least once.
 #[tokio::test]
 async fn connect_to_compute_non_retry_2() {
    use ConnectAction::*;
-    let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]);
-    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
+    let mechanism = TestConnectMechanism::new(vec![Fail, Retry, Connect]);
+    let (cache, extra, creds) = helper_create_connect_info();
    connect_to_compute(&mechanism, cache, &extra, &creds)
        .await
        .unwrap();
-    mechanism.verify();
 }

 /// Retry for at most `NUM_RETRIES_CONNECT` times.
@@ -496,36 +445,11 @@ async fn connect_to_compute_non_retry_3() {
    assert_eq!(NUM_RETRIES_CONNECT, 10);
    use ConnectAction::*;
    let mechanism = TestConnectMechanism::new(vec![
-        Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
+        Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
        /* the 11th time */ Retry,
    ]);
-    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
+    let (cache, extra, creds) = helper_create_connect_info();
    connect_to_compute(&mechanism, cache, &extra, &creds)
        .await
        .unwrap_err();
-    mechanism.verify();
-}
-
-/// Should retry wake compute.
-#[tokio::test]
-async fn wake_retry() {
-    use ConnectAction::*;
-    let mechanism = TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]);
-    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds)
-        .await
-        .unwrap();
-    mechanism.verify();
-}
-
-/// Wake failed with a non-retryable error.
-#[tokio::test]
-async fn wake_non_retry() {
-    use ConnectAction::*;
-    let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]);
-    let (cache, extra, creds) = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mechanism, cache, &extra, &creds)
-        .await
-        .unwrap_err();
-    mechanism.verify();
 }
--- a/proxy/src/sasl/stream.rs
+++ b/proxy/src/sasl/stream.rs
@@ -4,7 +4,6 @@ use super::{messages::ServerMessage, Mechanism};
 use crate::stream::PqStream;
 use std::io;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::info;

 /// Abstracts away all peculiarities of the libpq's protocol.
 pub struct SaslStream<'a, S> {
@@ -69,10 +68,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> SaslStream<'_, S> {
    ) -> super::Result<Outcome<M::Output>> {
        loop {
            let input = self.recv().await?;
-            let step = mechanism.exchange(input).map_err(|error| {
-                info!(?error, "error during SASL exchange");
-                error
-            })?;
+            let step = mechanism.exchange(input)?;

            use super::Step;
            return Ok(match step {
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -15,7 +15,6 @@ use toml_edit::Document;
 use std::fs::{self, File};
 use std::io::{ErrorKind, Write};
 use std::path::{Path, PathBuf};
-use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
 use storage_broker::Uri;
@@ -80,10 +79,6 @@ struct Args {
    /// Listen http endpoint for management and metrics in the form host:port.
    #[arg(long, default_value = DEFAULT_HTTP_LISTEN_ADDR)]
    listen_http: String,
-    /// Advertised endpoint for receiving/sending WAL in the form host:port. If not
-    /// specified, listen_pg is used to advertise instead.
-    #[arg(long, default_value = None)]
-    advertise_pg: Option<String>,
    /// Availability zone of the safekeeper.
    #[arg(long)]
    availability_zone: Option<String>,
@@ -123,24 +118,9 @@ struct Args {
    /// WAL backup horizon.
    #[arg(long)]
    disable_wal_backup: bool,
-    /// If given, enables auth on incoming connections to WAL service endpoint
-    /// (--listen-pg). Value specifies path to a .pem public key used for
-    /// validations of JWT tokens. Empty string is allowed and means disabling
-    /// auth.
-    #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
-    pg_auth_public_key_path: Option<PathBuf>,
-    /// If given, enables auth on incoming connections to tenant only WAL
-    /// service endpoint (--listen-pg-tenant-only). Value specifies path to a
-    /// .pem public key used for validations of JWT tokens. Empty string is
-    /// allowed and means disabling auth.
-    #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
-    pg_tenant_only_auth_public_key_path: Option<PathBuf>,
-    /// If given, enables auth on incoming connections to http management
-    /// service endpoint (--listen-http). Value specifies path to a .pem public
-    /// key used for validations of JWT tokens. Empty string is allowed and
-    /// means disabling auth.
-    #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
-    http_auth_public_key_path: Option<PathBuf>,
+    /// Path to a .pem public key which is used to check JWT tokens.
+    #[arg(long)]
+    auth_validation_public_key_path: Option<PathBuf>,
    /// Format for logging, either 'plain' or 'json'.
    #[arg(long, default_value = "plain")]
    log_format: String,
@@ -150,39 +130,9 @@ struct Args {
    current_thread_runtime: bool,
 }

-// Like PathBufValueParser, but allows empty string.
-fn opt_pathbuf_parser(s: &str) -> Result<PathBuf, String> {
-    Ok(PathBuf::from_str(s).unwrap())
-}
-
 #[tokio::main(flavor = "current_thread")]
 async fn main() -> anyhow::Result<()> {
-    // We want to allow multiple occurences of the same arg (taking the last) so
-    // that neon_local could generate command with defaults + overrides without
-    // getting 'argument cannot be used multiple times' error. This seems to be
-    // impossible with pure Derive API, so convert struct to Command, modify it,
-    // parse arguments, and then fill the struct back.
-    let cmd = <Args as clap::CommandFactory>::command().args_override_self(true);
-    let mut matches = cmd.get_matches();
-    let mut args = <Args as clap::FromArgMatches>::from_arg_matches_mut(&mut matches)?;
-
-    // I failed to modify opt_pathbuf_parser to return Option<PathBuf> in
-    // reasonable time, so turn empty string into option post factum.
-    if let Some(pb) = &args.pg_auth_public_key_path {
-        if pb.as_os_str().is_empty() {
-            args.pg_auth_public_key_path = None;
-        }
-    }
-    if let Some(pb) = &args.pg_tenant_only_auth_public_key_path {
-        if pb.as_os_str().is_empty() {
-            args.pg_tenant_only_auth_public_key_path = None;
-        }
-    }
-    if let Some(pb) = &args.http_auth_public_key_path {
-        if pb.as_os_str().is_empty() {
-            args.http_auth_public_key_path = None;
-        }
-    }
+    let args = Args::parse();

    if let Some(addr) = args.dump_control_file {
        let state = control_file::FileStorage::load_control_file(addr)?;
@@ -216,40 +166,13 @@ async fn main() -> anyhow::Result<()> {
        return Ok(());
    }

-    let pg_auth = match args.pg_auth_public_key_path.as_ref() {
+    let auth = match args.auth_validation_public_key_path.as_ref() {
        None => {
-            info!("pg auth is disabled");
+            info!("auth is disabled");
            None
        }
        Some(path) => {
-            info!("loading pg auth JWT key from {}", path.display());
-            Some(Arc::new(
-                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
-            ))
-        }
-    };
-    let pg_tenant_only_auth = match args.pg_tenant_only_auth_public_key_path.as_ref() {
-        None => {
-            info!("pg tenant only auth is disabled");
-            None
-        }
-        Some(path) => {
-            info!(
-                "loading pg tenant only auth JWT key from {}",
-                path.display()
-            );
-            Some(Arc::new(
-                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
-            ))
-        }
-    };
-    let http_auth = match args.http_auth_public_key_path.as_ref() {
-        None => {
-            info!("http auth is disabled");
-            None
-        }
-        Some(path) => {
-            info!("loading http auth JWT key from {}", path.display());
+            info!("loading JWT auth key from {}", path.display());
            Some(Arc::new(
                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
            ))
@@ -262,7 +185,6 @@ async fn main() -> anyhow::Result<()> {
        listen_pg_addr: args.listen_pg,
        listen_pg_addr_tenant_only: args.listen_pg_tenant_only,
        listen_http_addr: args.listen_http,
-        advertise_pg_addr: args.advertise_pg,
        availability_zone: args.availability_zone,
        no_sync: args.no_sync,
        broker_endpoint: args.broker_endpoint,
@@ -272,9 +194,7 @@ async fn main() -> anyhow::Result<()> {
        max_offloader_lag_bytes: args.max_offloader_lag,
        wal_backup_enabled: !args.disable_wal_backup,
        backup_parallel_jobs: args.wal_backup_parallel_jobs,
-        pg_auth,
-        pg_tenant_only_auth,
-        http_auth,
+        auth,
        current_thread_runtime: args.current_thread_runtime,
    };

@@ -314,10 +234,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
                listen_pg_addr_tenant_only
            );
            let listener = tcp_listener::bind(listen_pg_addr_tenant_only.clone()).map_err(|e| {
-                error!(
-                    "failed to bind to address {}: {}",
-                    listen_pg_addr_tenant_only, e
-                );
+                error!("failed to bind to address {}: {}", listen_pg_addr_tenant_only, e);
                 e
             })?;
            Some(listener)
@@ -363,7 +280,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
        .spawn(wal_service::task_main(
            conf_,
            pg_listener,
-            Scope::SafekeeperData,
+            Some(Scope::SafekeeperData),
        ))
        // wrap with task name for error reporting
        .map(|res| ("WAL service main".to_owned(), res));
@@ -377,7 +294,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
            .spawn(wal_service::task_main(
                conf_,
                pg_listener_tenant_only,
-                Scope::Tenant,
+                Some(Scope::Tenant),
            ))
            // wrap with task name for error reporting
            .map(|res| ("WAL service tenant only main".to_owned(), res));
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -2,9 +2,8 @@
 //! protocol commands.

 use anyhow::Context;
+use std::str;
 use std::str::FromStr;
-use std::str::{self};
-use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, info_span, Instrument};

@@ -12,7 +11,6 @@ use crate::auth::check_permission;
 use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};

 use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED};
-use crate::safekeeper::Term;
 use crate::timeline::TimelineError;
 use crate::wal_service::ConnectionId;
 use crate::{GlobalTimelines, SafeKeeperConf};
@@ -21,7 +19,7 @@ use postgres_backend::{self, PostgresBackend};
 use postgres_ffi::PG_TLI;
 use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
 use regex::Regex;
-use utils::auth::{Claims, JwtAuth, Scope};
+use utils::auth::{Claims, Scope};
 use utils::{
    id::{TenantId, TenantTimelineId, TimelineId},
    lsn::Lsn,
@@ -37,8 +35,8 @@ pub struct SafekeeperPostgresHandler {
    pub ttid: TenantTimelineId,
    /// Unique connection id is logged in spans for observability.
    pub conn_id: ConnectionId,
-    /// Auth scope allowed on the connections and public key used to check auth tokens. None if auth is not configured.
-    auth: Option<(Scope, Arc<JwtAuth>)>,
+    /// Auth scope allowed on the connections. None if auth is not configured.
+    allowed_auth_scope: Option<Scope>,
    claims: Option<Claims>,
    io_metrics: Option<TrafficMetrics>,
 }
@@ -46,7 +44,7 @@ pub struct SafekeeperPostgresHandler {
 /// Parsed Postgres command.
 enum SafekeeperPostgresCommand {
    StartWalPush,
-    StartReplication { start_lsn: Lsn, term: Option<Term> },
+    StartReplication { start_lsn: Lsn },
    IdentifySystem,
    TimelineStatus,
    JSONCtrl { cmd: AppendLogicalMessage },
@@ -57,21 +55,15 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
        Ok(SafekeeperPostgresCommand::StartWalPush)
    } else if cmd.starts_with("START_REPLICATION") {
        let re = Regex::new(
-            // We follow postgres START_REPLICATION LOGICAL options to pass term.
-            r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)(?: \(term='(\d+)'\))?",
+            r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)",
        )
        .unwrap();
-        let caps = re
-            .captures(cmd)
-            .context(format!("failed to parse START_REPLICATION command {}", cmd))?;
-        let start_lsn =
-            Lsn::from_str(&caps[1]).context("parse start LSN from START_REPLICATION command")?;
-        let term = if let Some(m) = caps.get(2) {
-            Some(m.as_str().parse::<u64>().context("invalid term")?)
-        } else {
-            None
-        };
-        Ok(SafekeeperPostgresCommand::StartReplication { start_lsn, term })
+        let mut caps = re.captures_iter(cmd);
+        let start_lsn = caps
+            .next()
+            .map(|cap| Lsn::from_str(&cap[1]))
+            .context("parse start LSN from START_REPLICATION command")??;
+        Ok(SafekeeperPostgresCommand::StartReplication { start_lsn })
    } else if cmd.starts_with("IDENTIFY_SYSTEM") {
        Ok(SafekeeperPostgresCommand::IdentifySystem)
    } else if cmd.starts_with("TIMELINE_STATUS") {
@@ -155,17 +147,18 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
    ) -> Result<(), QueryError> {
        // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
        // which requires auth to be present
-        let (allowed_auth_scope, auth) = self
+        let data = self
+            .conf
            .auth
            .as_ref()
-            .expect("auth_type is configured but .auth of handler is missing");
-        let data =
-            auth.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
+            .unwrap()
+            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;

+        let scope = self
+            .allowed_auth_scope
+            .expect("auth is enabled but scope is not configured");
        // The handler might be configured to allow only tenant scope tokens.
-        if matches!(allowed_auth_scope, Scope::Tenant)
-            && !matches!(data.claims.scope, Scope::Tenant)
-        {
+        if matches!(scope, Scope::Tenant) && !matches!(data.claims.scope, Scope::Tenant) {
            return Err(QueryError::Other(anyhow::anyhow!(
                "passed JWT token is for full access, but only tenant scope is allowed"
            )));
@@ -225,8 +218,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
                    .instrument(info_span!("WAL receiver", ttid = %span_ttid))
                    .await
            }
-            SafekeeperPostgresCommand::StartReplication { start_lsn, term } => {
-                self.handle_start_replication(pgb, start_lsn, term)
+            SafekeeperPostgresCommand::StartReplication { start_lsn } => {
+                self.handle_start_replication(pgb, start_lsn)
                    .instrument(info_span!("WAL sender", ttid = %span_ttid))
                    .await
            }
@@ -244,7 +237,7 @@ impl SafekeeperPostgresHandler {
        conf: SafeKeeperConf,
        conn_id: u32,
        io_metrics: Option<TrafficMetrics>,
-        auth: Option<(Scope, Arc<JwtAuth>)>,
+        allowed_auth_scope: Option<Scope>,
    ) -> Self {
        SafekeeperPostgresHandler {
            conf,
@@ -254,7 +247,7 @@ impl SafekeeperPostgresHandler {
            ttid: TenantTimelineId::empty(),
            conn_id,
            claims: None,
-            auth,
+            allowed_auth_scope,
            io_metrics,
        }
    }
@@ -262,7 +255,7 @@ impl SafekeeperPostgresHandler {
    // when accessing management api supply None as an argument
    // when using to authorize tenant pass corresponding tenant id
    fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
-        if self.auth.is_none() {
+        if self.conf.auth.is_none() {
            // auth is set to Trust, nothing to check so just return ok
            return Ok(());
        }
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -359,7 +359,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
 /// Safekeeper http router.
 pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
    let mut router = endpoint::make_router();
-    if conf.http_auth.is_some() {
+    if conf.auth.is_some() {
        router = router.middleware(auth_middleware(|request| {
            #[allow(clippy::mutable_key_type)]
            static ALLOWLIST_ROUTES: Lazy<HashSet<Uri>> =
@@ -375,7 +375,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>

    // NB: on any changes do not forget to update the OpenAPI spec
    // located nearby (/safekeeper/src/http/openapi_spec.yaml).
-    let auth = conf.http_auth.clone();
+    let auth = conf.auth.clone();
    router
        .data(Arc::new(conf))
        .data(auth)
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Joonas Koivunen	ec672723fa	try: catch all bad tests by removing the implicit endpoint creation	2023-07-27 16:47:46 +03:00
Joonas Koivunen	b9aa38358f	test: do not start two primary endpoints on same branch	2023-07-27 14:32:13 +03:00
Joonas Koivunen	d4b64b9ef7	test: allow passing branch-name to endpoint_start	2023-07-27 14:31:53 +03:00
Joonas Koivunen	b6f5c395cb	neon_local: add check against duplicate primaries	2023-07-27 14:31:04 +03:00