Merge pull request #5113 from neondatabase/release-http-connection-fix

Release 2023-08-25
proxy: don't return connection pending (#5107)
2026-06-02 13:00:37 +00:00 · 2023-08-25 17:21:35 +01:00 · 2023-08-25 16:42:30 +01:00 · 2023-08-25 16:42:30 +01:00 · 2023-08-22 09:06:14 +01:00 · 2023-08-22 09:41:02 +02:00
99 changed files with 4778 additions and 2641 deletions
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -1,6 +1,13 @@
 name: 'Create Allure report'
 description: 'Generate Allure report from uploaded by actions/allure-report-store tests results'

+inputs:
+  store-test-results-into-db:
+    description: 'Whether to store test results into the database. REGRESS_TEST_RESULT_CONNSTR/REGRESS_TEST_RESULT_CONNSTR_NEW should be set'
+    type: boolean
+    required: false
+    default: false
+
 outputs:
  base-url:
    description: 'Base URL for Allure report'
@@ -139,9 +146,11 @@ runs:
        sed -i 's|<a href="." class=|<a href="https://'${BUCKET}'.s3.amazonaws.com/'${REPORT_PREFIX}'/latest/index.html?nocache='"'+Date.now()+'"'" class=|g' ${WORKDIR}/report/app.js

        # Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
-        # Use sync for the final report to delete files from previous runs
        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
-        time aws s3 sync --delete --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
+
+        # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
+        # and to keep files on the host to upload them to the database
+        time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"

        # Generate redirect
        cat <<EOF > ${WORKDIR}/index.html
@@ -170,6 +179,41 @@ runs:
          aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
        fi

+    - name: Store Allure test stat in the DB
+      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
+      shell: bash -euxo pipefail {0}
+      env:
+        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+        REPORT_JSON_URL: ${{ steps.generate-report.outputs.report-json-url }}
+      run: |
+        export DATABASE_URL="${REGRESS_TEST_RESULT_CONNSTR}"

+        ./scripts/pysync

+        poetry run python3 scripts/ingest_regress_test_result.py \
+          --revision "${COMMIT_SHA}" \
+          --reference "${GITHUB_REF}" \
+          --build-type unified \
+          --ingest "${WORKDIR}/report/data/suites.json"
+
+    - name: Store Allure test stat in the DB (new)
+      if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
+      shell: bash -euxo pipefail {0}
+      env:
+        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+        BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
+      run: |
+        export DATABASE_URL="${REGRESS_TEST_RESULT_CONNSTR_NEW}"

+        ./scripts/pysync

+        poetry run python3 scripts/ingest_regress_test_result-new-format.py \
+          --reference "${GITHUB_REF}" \
+          --revision "${COMMIT_SHA}" \
+          --run-id "${GITHUB_RUN_ID}" \
+          --run-attempt "${GITHUB_RUN_ATTEMPT}" \
+          --test-cases-dir "${WORKDIR}/report/data/test-cases"
+
    - name: Cleanup
      if: always()
      shell: bash -euxo pipefail {0}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -432,6 +432,11 @@ jobs:
        if: ${{ !cancelled() }}
        id: create-allure-report
        uses: ./.github/actions/allure-report-generate
+        with:
+          store-test-results-into-db: true
+        env:
+          REGRESS_TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
+          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

      - uses: actions/github-script@v6
        if: ${{ !cancelled() }}
@@ -452,45 +457,6 @@ jobs:
              report,
            })

-      - name: Store Allure test stat in the DB
-        if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }}
-        env:
-          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-          REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }}
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
-        run: |
-          ./scripts/pysync
-
-          curl --fail --output suites.json "${REPORT_JSON_URL}"
-          export BUILD_TYPE=unified
-          export DATABASE_URL="$TEST_RESULT_CONNSTR"
-
-          poetry run python3 scripts/ingest_regress_test_result.py \
-            --revision ${COMMIT_SHA} \
-            --reference ${GITHUB_REF} \
-            --build-type ${BUILD_TYPE} \
-            --ingest suites.json
-
-      - name: Store Allure test stat in the DB (new)
-        if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }}
-        env:
-          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-          REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }}
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
-          BASE_S3_URL: ${{ steps.create-allure-report.outputs.base-s3-url }}
-        run: |
-          aws s3 cp --only-show-errors --recursive ${BASE_S3_URL}/data/test-cases ./test-cases
-
-          ./scripts/pysync
-
-          export DATABASE_URL="$TEST_RESULT_CONNSTR"
-          poetry run python3 scripts/ingest_regress_test_result-new-format.py \
-            --reference ${GITHUB_REF} \
-            --revision ${COMMIT_SHA} \
-            --run-id ${GITHUB_RUN_ID} \
-            --run-attempt ${GITHUB_RUN_ATTEMPT} \
-            --test-cases-dir ./test-cases
-
  coverage-report:
    runs-on: [ self-hosted, gen3, small ]
    container:
@@ -814,7 +780,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.15.0-alpha1
+      VM_BUILDER_VERSION: v0.16.3

    steps:
      - name: Checkout
@@ -835,7 +801,11 @@ jobs:

      - name: Build vm image
        run: |
-          ./vm-builder -enable-file-cache -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          ./vm-builder \
+            -enable-file-cache \
+            -enable-monitor \
+            -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
+            -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

      - name: Pushing vm-compute-node image
        run: |
@@ -975,7 +945,7 @@ jobs:
        version: [ v14, v15 ]

    env:
-      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
      AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
      AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
      S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -639,6 +639,12 @@ dependencies = [
 "vsimd",
 ]

+[[package]]
+name = "base64ct"
+version = "1.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
+
 [[package]]
 name = "bincode"
 version = "1.3.3"
@@ -886,6 +892,8 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "chrono",
+ "regex",
+ "remote_storage",
 "serde",
 "serde_json",
 "serde_with",
@@ -1010,9 +1018,9 @@ checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"

 [[package]]
 name = "cpufeatures"
-version = "0.2.7"
+version = "0.2.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58"
+checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1"
 dependencies = [
 "libc",
 ]
@@ -1192,15 +1200,15 @@ dependencies = [

 [[package]]
 name = "dashmap"
-version = "5.4.0"
+version = "5.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc"
+checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
 dependencies = [
 "cfg-if",
- "hashbrown 0.12.3",
+ "hashbrown 0.14.0",
 "lock_api",
 "once_cell",
- "parking_lot_core 0.9.7",
+ "parking_lot_core 0.9.8",
 ]

 [[package]]
@@ -1649,6 +1657,12 @@ dependencies = [
 "ahash",
 ]

+[[package]]
+name = "hashbrown"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
+
 [[package]]
 name = "hashlink"
 version = "0.8.2"
@@ -2073,9 +2087,9 @@ checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"

 [[package]]
 name = "lock_api"
-version = "0.4.9"
+version = "0.4.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df"
+checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16"
 dependencies = [
 "autocfg",
 "scopeguard",
@@ -2339,9 +2353,9 @@ dependencies = [

 [[package]]
 name = "once_cell"
-version = "1.17.1"
+version = "1.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
+checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"

 [[package]]
 name = "oorandom"
@@ -2640,7 +2654,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
 dependencies = [
 "lock_api",
- "parking_lot_core 0.9.7",
+ "parking_lot_core 0.9.8",
 ]

 [[package]]
@@ -2659,15 +2673,26 @@ dependencies = [

 [[package]]
 name = "parking_lot_core"
-version = "0.9.7"
+version = "0.9.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521"
+checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447"
 dependencies = [
 "cfg-if",
 "libc",
- "redox_syscall 0.2.16",
+ "redox_syscall 0.3.5",
 "smallvec",
- "windows-sys 0.45.0",
+ "windows-targets 0.48.0",
+]
+
+[[package]]
+name = "password-hash"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "346f04948ba92c43e8469c1ee6736c7563d71012b17d40745260fe106aac2166"
+dependencies = [
+ "base64ct",
+ "rand_core",
+ "subtle",
 ]

 [[package]]
@@ -2678,6 +2703,8 @@ checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
 dependencies = [
 "digest",
 "hmac",
+ "password-hash",
+ "sha2",
 ]

 [[package]]
@@ -2798,7 +2825,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=b25e7f366487f41bc1607e6d824e88996fb02350#b25e7f366487f41bc1607e6d824e88996fb02350"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -2811,7 +2838,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=b25e7f366487f41bc1607e6d824e88996fb02350#b25e7f366487f41bc1607e6d824e88996fb02350"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
 dependencies = [
 "native-tls",
 "tokio",
@@ -2822,7 +2849,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=b25e7f366487f41bc1607e6d824e88996fb02350#b25e7f366487f41bc1607e6d824e88996fb02350"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -2840,7 +2867,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=b25e7f366487f41bc1607e6d824e88996fb02350#b25e7f366487f41bc1607e6d824e88996fb02350"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -3056,6 +3083,7 @@ dependencies = [
 "chrono",
 "clap",
 "consumption_metrics",
+ "dashmap",
 "futures",
 "git-version",
 "hashbrown 0.13.2",
@@ -3541,9 +3569,9 @@ dependencies = [

 [[package]]
 name = "rustls-webpki"
-version = "0.100.1"
+version = "0.100.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d6207cd5ed3d8dca7816f8f3725513a34609c0c765bf652b8c3cb4cfd87db46b"
+checksum = "e98ff011474fa39949b7e5c0428f9b4937eda7da7848bbb947786b7be0b27dab"
 dependencies = [
 "ring",
 "untrusted",
@@ -4331,7 +4359,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=b25e7f366487f41bc1607e6d824e88996fb02350#b25e7f366487f41bc1607e6d824e88996fb02350"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
 dependencies = [
 "async-trait",
 "byteorder",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -54,6 +54,7 @@ comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-utils = "0.8.5"
+dashmap = "5.5.0"
 either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
@@ -88,7 +89,7 @@ opentelemetry = "0.19.0"
 opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.11.0"
 parking_lot = "0.12"
-pbkdf2 = "0.12.1"
+pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
@@ -144,11 +145,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="b25e7f366487f41bc1607e6d824e88996fb02350" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="b25e7f366487f41bc1607e6d824e88996fb02350" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="b25e7f366487f41bc1607e6d824e88996fb02350" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="b25e7f366487f41bc1607e6d824e88996fb02350" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="b25e7f366487f41bc1607e6d824e88996fb02350" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -183,7 +184,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="b25e7f366487f41bc1607e6d824e88996fb02350" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }

 ################# Binary contents sections

--- a/2
+++ b/2
@@ -51,6 +51,7 @@ RUN set -e \
      --bin safekeeper  \
      --bin storage_broker  \
      --bin proxy  \
+      --bin neon_local \
      --locked --release \
    && cachepot -s

@@ -76,6 +77,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker         /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local               /usr/local/bin

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -38,7 +38,7 @@ use std::fs::File;
 use std::panic;
 use std::path::Path;
 use std::process::exit;
-use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock, RwLock};
+use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
 use std::{thread, time::Duration};

 use anyhow::{Context, Result};
@@ -147,6 +147,7 @@ fn main() -> Result<()> {
    match spec_json {
        // First, try to get cluster spec from the cli argument
        Some(json) => {
+            info!("got spec from cli argument {}", json);
            spec = Some(serde_json::from_str(json)?);
        }
        None => {
@@ -182,6 +183,7 @@ fn main() -> Result<()> {

    if let Some(spec) = spec {
        let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
+        info!("new pspec.spec: {:?}", pspec.spec);
        new_state.pspec = Some(pspec);
        spec_set = true;
    } else {
@@ -196,9 +198,7 @@ fn main() -> Result<()> {
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
        ext_remote_storage,
-        ext_remote_paths: OnceLock::new(),
        ext_download_progress: RwLock::new(HashMap::new()),
-        library_index: OnceLock::new(),
        build_tag,
    };
    let compute = Arc::new(compute_node);
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -5,7 +5,7 @@ use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
-use std::sync::{Condvar, Mutex, OnceLock, RwLock};
+use std::sync::{Condvar, Mutex, RwLock};
 use std::time::Instant;

 use anyhow::{Context, Result};
@@ -14,7 +14,6 @@ use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use futures::StreamExt;
 use postgres::{Client, NoTls};
-use regex::Regex;
 use tokio;
 use tokio_postgres;
 use tracing::{error, info, instrument, warn};
@@ -60,10 +59,6 @@ pub struct ComputeNode {
    pub state_changed: Condvar,
    ///  the S3 bucket that we search for extensions in
    pub ext_remote_storage: Option<GenericRemoteStorage>,
-    // (key: extension name, value: path to extension archive in remote storage)
-    pub ext_remote_paths: OnceLock<HashMap<String, RemotePath>>,
-    // (key: library name, value: name of extension containing this library)
-    pub library_index: OnceLock<HashMap<String, String>>,
    // key: ext_archive_name, value: started download time, download_completed?
    pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
    pub build_tag: String,
@@ -75,7 +70,6 @@ pub struct RemoteExtensionMetrics {
    num_ext_downloaded: u64,
    largest_ext_size: u64,
    total_ext_download_size: u64,
-    prep_extensions_ms: u64,
 }

 #[derive(Clone, Debug)]
@@ -745,11 +739,19 @@ impl ComputeNode {
            pspec.timeline_id,
        );

+        info!(
+            "start_compute spec.remote_extensions {:?}",
+            pspec.spec.remote_extensions
+        );
+
        // This part is sync, because we need to download
        // remote shared_preload_libraries before postgres start (if any)
-        {
+        if let Some(remote_extensions) = &pspec.spec.remote_extensions {
+            // First, create control files for all available extensions
+            extension_server::create_control_files(remote_extensions, &self.pgbin);
+
            let library_load_start_time = Utc::now();
-            let remote_ext_metrics = self.prepare_preload_libraries(&compute_state)?;
+            let remote_ext_metrics = self.prepare_preload_libraries(&pspec.spec)?;

            let library_load_time = Utc::now()
                .signed_duration_since(library_load_start_time)
@@ -761,7 +763,6 @@ impl ComputeNode {
            state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded;
            state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size;
            state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size;
-            state.metrics.prep_extensions_ms = remote_ext_metrics.prep_extensions_ms;
            info!(
                "Loading shared_preload_libraries took {:?}ms",
                library_load_time
@@ -918,38 +919,11 @@ LIMIT 100",
        }
    }

-    // If remote extension storage is configured,
-    // download extension control files
-    pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> {
-        if let Some(ref ext_remote_storage) = self.ext_remote_storage {
-            let pspec = compute_state.pspec.as_ref().expect("spec must be set");
-            let spec = &pspec.spec;
-            let custom_ext = spec.custom_extensions.clone().unwrap_or(Vec::new());
-            info!("custom extensions: {:?}", &custom_ext);
-
-            let (ext_remote_paths, library_index) = extension_server::get_available_extensions(
-                ext_remote_storage,
-                &self.pgbin,
-                &self.pgversion,
-                &custom_ext,
-                &self.build_tag,
-            )
-            .await?;
-            self.ext_remote_paths
-                .set(ext_remote_paths)
-                .expect("this is the only time we set ext_remote_paths");
-            self.library_index
-                .set(library_index)
-                .expect("this is the only time we set library_index");
-        }
-        Ok(())
-    }
-
    // download an archive, unzip and place files in correct locations
    pub async fn download_extension(
        &self,
-        ext_name: &str,
-        is_library: bool,
+        real_ext_name: String,
+        ext_path: RemotePath,
    ) -> Result<u64, DownloadError> {
        let remote_storage = self
            .ext_remote_storage
@@ -958,35 +932,6 @@ LIMIT 100",
                "Remote extensions storage is not configured",
            )))?;

-        let mut real_ext_name = ext_name;
-        if is_library {
-            // sometimes library names might have a suffix like
-            // library.so or library.so.3. We strip this off
-            // because library_index is based on the name without the file extension
-            let strip_lib_suffix = Regex::new(r"\.so.*").unwrap();
-            let lib_raw_name = strip_lib_suffix.replace(real_ext_name, "").to_string();
-
-            real_ext_name = self
-                .library_index
-                .get()
-                .expect("must have already downloaded the library_index")
-                .get(&lib_raw_name)
-                .ok_or(DownloadError::BadInput(anyhow::anyhow!(
-                    "library {} is not found",
-                    lib_raw_name
-                )))?;
-        }
-
-        let ext_path = &self
-            .ext_remote_paths
-            .get()
-            .expect("error accessing ext_remote_paths")
-            .get(real_ext_name)
-            .ok_or(DownloadError::BadInput(anyhow::anyhow!(
-                "real_ext_name {} is not found",
-                real_ext_name
-            )))?;
-
        let ext_archive_name = ext_path.object_name().expect("bad path");

        let mut first_try = false;
@@ -1039,8 +984,8 @@ LIMIT 100",
        info!("downloading new extension {ext_archive_name}");

        let download_size = extension_server::download_extension(
-            real_ext_name,
-            ext_path,
+            &real_ext_name,
+            &ext_path,
            remote_storage,
            &self.pgbin,
        )
@@ -1058,18 +1003,19 @@ LIMIT 100",
    #[tokio::main]
    pub async fn prepare_preload_libraries(
        &self,
-        compute_state: &ComputeState,
+        spec: &ComputeSpec,
    ) -> Result<RemoteExtensionMetrics> {
        if self.ext_remote_storage.is_none() {
            return Ok(RemoteExtensionMetrics {
                num_ext_downloaded: 0,
                largest_ext_size: 0,
                total_ext_download_size: 0,
-                prep_extensions_ms: 0,
            });
        }
-        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
-        let spec = &pspec.spec;
+        let remote_extensions = spec
+            .remote_extensions
+            .as_ref()
+            .ok_or(anyhow::anyhow!("Remote extensions are not configured",))?;

        info!("parse shared_preload_libraries from spec.cluster.settings");
        let mut libs_vec = Vec::new();
@@ -1081,6 +1027,7 @@ LIMIT 100",
                .collect();
        }
        info!("parse shared_preload_libraries from provided postgresql.conf");
+
        // that is used in neon_local and python tests
        if let Some(conf) = &spec.cluster.postgresql_conf {
            let conf_lines = conf.split('\n').collect::<Vec<&str>>();
@@ -1101,30 +1048,16 @@ LIMIT 100",
            libs_vec.extend(preload_libs_vec);
        }

-        info!("Download ext_index.json, find the extension paths");
-        let prep_ext_start_time = Utc::now();
-        self.prepare_external_extensions(compute_state).await?;
-        let prep_ext_time_delta = Utc::now()
-            .signed_duration_since(prep_ext_start_time)
-            .to_std()
-            .unwrap()
-            .as_millis() as u64;
-        info!("Prepare extensions took {prep_ext_time_delta}ms");
-
        // Don't try to download libraries that are not in the index.
        // Assume that they are already present locally.
-        libs_vec.retain(|lib| {
-            self.library_index
-                .get()
-                .expect("error accessing ext_remote_paths")
-                .contains_key(lib)
-        });
+        libs_vec.retain(|lib| remote_extensions.library_index.contains_key(lib));

        info!("Downloading to shared preload libraries: {:?}", &libs_vec);

        let mut download_tasks = Vec::new();
        for library in &libs_vec {
-            download_tasks.push(self.download_extension(library, true));
+            let (ext_name, ext_path) = remote_extensions.get_ext(library, true)?;
+            download_tasks.push(self.download_extension(ext_name, ext_path));
        }
        let results = join_all(download_tasks).await;

@@ -1132,7 +1065,6 @@ LIMIT 100",
            num_ext_downloaded: 0,
            largest_ext_size: 0,
            total_ext_download_size: 0,
-            prep_extensions_ms: prep_ext_time_delta,
        };
        for result in results {
            let download_size = match result {
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -73,10 +73,9 @@ More specifically, here is an example ext_index.json
 */
 use anyhow::Context;
 use anyhow::{self, Result};
-use futures::future::join_all;
+use compute_api::spec::RemoteExtSpec;
 use remote_storage::*;
 use serde_json;
-use std::collections::HashMap;
 use std::io::Read;
 use std::num::{NonZeroU32, NonZeroUsize};
 use std::path::Path;
@@ -117,79 +116,6 @@ pub fn get_pg_version(pgbin: &str) -> String {
    panic!("Unsuported postgres version {human_version}");
 }

-// download control files for enabled_extensions
-// return Hashmaps converting library names to extension names (library_index)
-// and specifying the remote path to the archive for each extension name
-pub async fn get_available_extensions(
-    remote_storage: &GenericRemoteStorage,
-    pgbin: &str,
-    pg_version: &str,
-    custom_extensions: &[String],
-    build_tag: &str,
-) -> Result<(HashMap<String, RemotePath>, HashMap<String, String>)> {
-    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
-    let index_path = format!("{build_tag}/{pg_version}/ext_index.json");
-    let index_path = RemotePath::new(Path::new(&index_path)).context("error forming path")?;
-    info!("download ext_index.json from: {:?}", &index_path);
-
-    let mut download = remote_storage.download(&index_path).await?;
-    let mut ext_idx_buffer = Vec::new();
-    download
-        .download_stream
-        .read_to_end(&mut ext_idx_buffer)
-        .await?;
-    info!("ext_index downloaded");
-
-    #[derive(Debug, serde::Deserialize)]
-    struct Index {
-        public_extensions: Vec<String>,
-        library_index: HashMap<String, String>,
-        extension_data: HashMap<String, ExtensionData>,
-    }
-
-    #[derive(Debug, serde::Deserialize)]
-    struct ExtensionData {
-        control_data: HashMap<String, String>,
-        archive_path: String,
-    }
-
-    let ext_index_full = serde_json::from_slice::<Index>(&ext_idx_buffer)?;
-    let mut enabled_extensions = ext_index_full.public_extensions;
-    enabled_extensions.extend_from_slice(custom_extensions);
-    let library_index = ext_index_full.library_index;
-    let all_extension_data = ext_index_full.extension_data;
-    info!("library_index: {:?}", library_index);
-
-    info!("enabled_extensions: {:?}", enabled_extensions);
-    let mut ext_remote_paths = HashMap::new();
-    let mut file_create_tasks = Vec::new();
-    for extension in enabled_extensions {
-        let ext_data = &all_extension_data[&extension];
-        for (control_file, control_contents) in &ext_data.control_data {
-            let extension_name = control_file
-                .strip_suffix(".control")
-                .expect("control files must end in .control");
-            let control_path = local_sharedir.join(control_file);
-            if !control_path.exists() {
-                ext_remote_paths.insert(
-                    extension_name.to_string(),
-                    RemotePath::from_string(&ext_data.archive_path)?,
-                );
-                info!("writing file {:?}{:?}", control_path, control_contents);
-                file_create_tasks.push(tokio::fs::write(control_path, control_contents));
-            } else {
-                warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_file);
-            }
-        }
-    }
-    let results = join_all(file_create_tasks).await;
-    for result in results {
-        result?;
-    }
-    info!("ext_remote_paths {:?}", ext_remote_paths);
-    Ok((ext_remote_paths, library_index))
-}
-
 // download the archive for a given extension,
 // unzip it, and place files in the appropriate locations (share/lib)
 pub async fn download_extension(
@@ -251,6 +177,22 @@ pub async fn download_extension(
    Ok(download_size)
 }

+// Create extension control files from spec: writes each control file from the spec's extension data into `<pg_config --sharedir>/extension`.
+pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
+    let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
+    for ext_data in remote_extensions.extension_data.values() {
+        for (control_name, control_content) in &ext_data.control_data {
+            let control_path = local_sharedir.join(control_name);
+            if !control_path.exists() { // an existing local control file is never overwritten
+                info!("writing file {:?}{:?}", control_path, control_content);
+                std::fs::write(control_path, control_content).unwrap(); // NOTE(review): panics if the write fails
+            } else {
+                warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_path);
+            }
+        }
+    }
+}
+
 // This function initializes the necessary structs to use remote storage
 pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
    #[derive(Debug, serde::Deserialize)]
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -13,7 +13,7 @@ use hyper::{Body, Method, Request, Response, Server, StatusCode};
 use num_cpus;
 use serde_json;
 use tokio::task;
-use tracing::{error, info};
+use tracing::{error, info, warn};
 use tracing_utils::http::OtelName;

 fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
@@ -126,6 +126,15 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            info!("serving {:?} POST request", route);
            info!("req.uri {:?}", req.uri());

+            // don't even try to download extensions
+            // if no remote storage is configured
+            if compute.ext_remote_storage.is_none() {
+                info!("no extensions remote storage configured");
+                let mut resp = Response::new(Body::from("no remote storage configured"));
+                *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                return resp;
+            }
+
            let mut is_library = false;
            if let Some(params) = req.uri().query() {
                info!("serving {:?} POST request with params: {}", route, params);
@@ -137,24 +146,47 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
                    return resp;
                }
            }
-
            let filename = route.split('/').last().unwrap().to_string();
            info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");

-            // don't even try to download extensions
-            // if no remote storage is configured
-            if compute.ext_remote_storage.is_none() {
-                info!("no extensions remote storage configured");
-                let mut resp = Response::new(Body::from("no remote storage configured"));
-                *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
-                return resp;
-            }
+            // get ext_name and path from spec
+            // don't lock compute_state for too long
+            let ext = {
+                let compute_state = compute.state.lock().unwrap();
+                let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+                let spec = &pspec.spec;

-            match compute.download_extension(&filename, is_library).await {
-                Ok(_) => Response::new(Body::from("OK")),
+                // debug only
+                info!("spec: {:?}", spec);
+
+                let remote_extensions = match spec.remote_extensions.as_ref() {
+                    Some(r) => r,
+                    None => {
+                        info!("no remote extensions spec was provided");
+                        let mut resp = Response::new(Body::from("no remote storage configured"));
+                        *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                        return resp;
+                    }
+                };
+
+                remote_extensions.get_ext(&filename, is_library)
+            };
+
+            match ext {
+                Ok((ext_name, ext_path)) => {
+                    match compute.download_extension(ext_name, ext_path).await {
+                        Ok(_) => Response::new(Body::from("OK")),
+                        Err(e) => {
+                            error!("extension download failed: {}", e);
+                            let mut resp = Response::new(Body::from(e.to_string()));
+                            *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
+                            resp
+                        }
+                    }
+                }
                Err(e) => {
-                    error!("extension download failed: {}", e);
-                    let mut resp = Response::new(Body::from(e.to_string()));
+                    warn!("extension download failed to find extension: {}", e);
+                    let mut resp = Response::new(Body::from("failed to find file"));
                    *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
                    resp
                }
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -270,7 +270,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
            }
            RoleAction::Create => {
                let mut query: String = format!(
-                    "CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser",
+                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("role create query: '{}'", &query);
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -825,6 +825,16 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNod
    }
 }

+// Get list of options to append to safekeeper command invocation: every `safekeeper-extra-opt` value as an owned String (empty Vec if none were given).
+fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec<String> {
+    init_match
+        .get_many::<String>("safekeeper-extra-opt")
+        .into_iter()
+        .flatten()
+        .map(|s| s.to_owned())
+        .collect()
+}
+
 fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    let (sub_name, sub_args) = match sub_match.subcommand() {
        Some(safekeeper_command_data) => safekeeper_command_data,
@@ -841,7 +851,9 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul

    match sub_name {
        "start" => {
-            if let Err(e) = safekeeper.start() {
+            let extra_opts = safekeeper_extra_opts(sub_args);
+
+            if let Err(e) = safekeeper.start(extra_opts) {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -866,7 +878,8 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
                exit(1);
            }

-            if let Err(e) = safekeeper.start() {
+            let extra_opts = safekeeper_extra_opts(sub_args);
+            if let Err(e) = safekeeper.start(extra_opts) {
                eprintln!("safekeeper start failed: {}", e);
                exit(1);
            }
@@ -893,7 +906,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow

    for node in env.safekeepers.iter() {
        let safekeeper = SafekeeperNode::from_env(env, node);
-        if let Err(e) = safekeeper.start() {
+        if let Err(e) = safekeeper.start(vec![]) {
            eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
            try_stop_all(env, false);
            exit(1);
@@ -956,6 +969,14 @@ fn cli() -> Command {

    let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);

+    let safekeeper_extra_opt_arg = Arg::new("safekeeper-extra-opt")
+        .short('e')
+        .long("safekeeper-extra-opt")
+        .num_args(1)
+        .action(ArgAction::Append)
+        .help("Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo")
+        .required(false);
+
    let tenant_id_arg = Arg::new("tenant-id")
        .long("tenant-id")
        .help("Tenant id. Represented as a hexadecimal string 32 symbols length")
@@ -1124,6 +1145,7 @@ fn cli() -> Command {
                .subcommand(Command::new("start")
                            .about("Start local safekeeper")
                            .arg(safekeeper_id_arg.clone())
+                            .arg(safekeeper_extra_opt_arg.clone())
                )
                .subcommand(Command::new("stop")
                            .about("Stop local safekeeper")
@@ -1134,6 +1156,7 @@ fn cli() -> Command {
                            .about("Restart local safekeeper")
                            .arg(safekeeper_id_arg)
                            .arg(stop_mode_arg.clone())
+                            .arg(safekeeper_extra_opt_arg)
                )
        )
        .subcommand(
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -493,7 +493,7 @@ impl Endpoint {
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
-            custom_extensions: Some(vec![]),
+            remote_extensions: None,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -101,7 +101,7 @@ impl SafekeeperNode {
        self.datadir_path().join("safekeeper.pid")
    }

-    pub fn start(&self) -> anyhow::Result<Child> {
+    pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
        print!(
            "Starting safekeeper at '{}' in '{}'",
            self.pg_connection_config.raw_address(),
@@ -161,17 +161,28 @@ impl SafekeeperNode {

        let key_path = self.env.base_data_dir.join("auth_public_key.pem");
        if self.conf.auth_enabled {
+            let key_path_string = key_path
+                .to_str()
+                .with_context(|| {
+                    format!("Key path {key_path:?} cannot be represented as a unicode string")
+                })?
+                .to_owned();
            args.extend([
-                "--auth-validation-public-key-path".to_owned(),
-                key_path
-                    .to_str()
-                    .with_context(|| {
-                        format!("Key path {key_path:?} cannot be represented as a unicode string")
-                    })?
-                    .to_owned(),
+                "--pg-auth-public-key-path".to_owned(),
+                key_path_string.clone(),
+            ]);
+            args.extend([
+                "--pg-tenant-only-auth-public-key-path".to_owned(),
+                key_path_string.clone(),
+            ]);
+            args.extend([
+                "--http-auth-public-key-path".to_owned(),
+                key_path_string.clone(),
            ]);
        }

+        args.extend(extra_opts);
+
        background_process::start_process(
            &format!("safekeeper-{id}"),
            &datadir,
--- a/deny.toml
+++ b/deny.toml
@@ -18,7 +18,7 @@ vulnerability = "deny"
 unmaintained = "warn"
 yanked = "warn"
 notice = "warn"
-ignore = []
+ignore = ["RUSTSEC-2023-0052"]

 # This section is considered when running `cargo deny check licenses`
 # More documentation for the licenses section can be found here:
--- a/docs/rfcs/026-pageserver-s3-mvcc.md
+++ b/docs/rfcs/026-pageserver-s3-mvcc.md
@@ -0,0 +1,316 @@
+This is a copy from the [original Notion page](https://www.notion.so/neondatabase/Proposal-Pageserver-MVCC-S3-Storage-8a424c0c7ec5459e89d3e3f00e87657c?pvs=4), taken on 2023-08-16.
+
+This is for archival mostly.
+The RFC that we're likely to go with is https://github.com/neondatabase/neon/pull/4919.
+
+---
+
+# Proposal: Pageserver MVCC S3 Storage
+
+tl;dr: this proposal enables Control Plane to attach a tenant to a new pageserver without being 100% certain that it has been detached from the old pageserver. This enables us to automate failover if a pageserver dies (no human in the loop).
+
+# Problem Statement
+
+The current Neon architecture requires the Control Plane to guarantee that a tenant is only attached to one pageserver at a time. If a tenant is attached to multiple pageservers simultaneously, the pageservers will overwrite each other’s changes in S3 for that tenant, resulting in data loss for that tenant.
+
+The above imposes limitations on tenant relocation and future designs for high availability. For instance, Control Plane cannot relocate a tenant to another pageserver before it is 100% certain that the tenant is detached from the source pageserver. If the source pageserver is unresponsive, the tenant detach procedure cannot proceed, and Control Plane has no choice but to wait for either the source to become responsive again, or rely on a node failure detection mechanism to detect that the source pageserver is dead, and give permission to skip the detachment step. Either way, the tenant is unavailable for an extended period, and we have no means to improve it in the current architecture.
+
+Note that there is no 100% correct node failure detection mechanism, and even techniques to accelerate failure detection, such as *shoot-the-other-node-in-the-head*, have their limits. So, we currently rely on humans as node failure detectors: they get alerted via PagerDuty, assess the situation under high stress, and make the decision. If they make the wrong call, or the apparently dead pageserver somehow resurrects later, we’ll have data loss.
+
+Also, by relying on humans, we’re [incurring needless unscalable toil](https://sre.google/sre-book/eliminating-toil/): as Neon grows, pageserver failures will become more and more frequent because our fleet grows. Each instance will need quick response time to minimize downtime for the affected tenants, which implies higher toil, higher resulting attrition, and/or higher personnel cost.
+
+Lastly, there are foreseeable needs by operation and product such as zero-downtime relocation and automatic failover/HA. For such features, the ability to have a tenant purposefully or accidentally attached to more than one pageserver will greatly reduce risk of data loss, and improve availability.
+
+# High-Level Idea
+
+The core idea is to evolve the per-Tenant S3 state to an MVCC-like scheme, allowing multiple pageservers to operate on the same tenant S3 state without interference. To make changes to S3, pageservers acquire long-running transactions from Control Plane. After opening a transaction, Pageservers make PUTs directly against S3, but the keys include the transaction ID, so overwrites never happen. Periodically, pageservers talk back to Control Plane to commit their transaction. This is where Control Plane enforces strict linearizability, favoring availability over work-conservation: commit is only granted if no transaction started after the one that’s requesting commit. Garbage collection is done through deadlists, and it’s simplified tremendously by the above commit grant/reject policy.
+
+Minimal changes are required for safekeepers to allow WAL for a single timeline be consumed by more than one pageserver without premature truncation.
+
+**Above scheme makes it safe to attach tenants without a 100% correct node failure detection mechanism. Further, it makes it safe to interleave tenant-attachment to pageservers, unlocking new capabilities for (internal) product features:**
+
+- **Fast, Zero-Toil Failover on Network Partitions or Instance Failure**: if a pageserver is not reachable (network partition, hardware failure, overload) we want to spread its attached tenants to new pageservers to restore availability, within the range of *seconds*. We cannot afford generous timeouts to maximize the probability that the unreachable pageserver has ceased writing to S3. This proposal enables us to attach the tenants to the replacement pageservers, and redirect their computes, without having to wait for confirmation that the unreachable pageserver has ceased writing to S3.
+- **Zero-Downtime Relocation:** we want to be able to relocate tenants to different pageservers with minimal availability or latency impact. This proposal enables us to attach the relocating Tenant to the destination Pageserver before detaching it from the source Pageserver. This can help minimize downtime because we can wait for the destination to catch up on WAL processing before redirecting Computes.
+
+# Design
+
+The core idea is to evolve the per-Tenant S3 state to a per-tenant MVCC-like scheme.
+
+To make S3 changes for a given tenant, Pageserver requests a transaction ID from control plane for that tenant. Without a transaction ID, Pageserver does not write to S3.
+
+Once Pageserver received a transaction ID it is allowed to produce new objects and overwrite objects created in this transaction. Pageserver is not allowed to delete any objects; instead, it marks the object as deleted by appending the key to the transaction’s deadlist for later deletion. Commits of transactions are serialized through Control Plane: when Pageserver wants to commit a transaction, it sends an RPC to Control Plane. Control Plane responds with a commit grant or commit reject message. Commit grant means that the transaction’s changes are now visible to subsequent transactions. Commit reject means that the transaction’s changes are not and never will be visible to another Pageserver instance, and the rejected Pageserver is to cease further activity on that tenant.
+
+## Commit grant/reject policy
+
+For the purposes of Pageserver, we want **linearizability** of a tenant’s S3 state. Since our transactions are scoped per tenant, it is sufficient for linearizability to grant commit if and only if no other transaction has been started since the commit-requesting transaction started.
+
+For example, consider the case of a single tenant, attached to Pageserver A. Pageserver A has an open transaction but becomes unresponsive. Control Plane decides to relocate the tenant to another Pageserver B. It need *not* wait for A to be 100%-certainly down before B can start uploading to S3 for that tenant. Instead, B can start a new transaction right away, make progress, and get commit grants. What about A? The transaction is RejectPending in Control Plane until A eventually becomes responsive again, tries to commit, gets a rejection, acknowledges it, and thus its transaction becomes RejectAcknowledged. If A is definitively dead, the operator can also force-transition from state RejectPending to RejectAcknowledged. But critically, Control Plane doesn’t have to wait for A’s transaction to become RejectAcknowledged before attaching the tenant to B.
+
+```mermaid
+sequenceDiagram
+
+   participant CP
+   participant A
+   participant S3
+   participant B
+
+	 CP -->> A: attach tenant
+   activate A
+	 A -->> CP: start txn
+	 CP -->> A: txn=23, last_committed_txn=22
+
+
+	 Note over CP,A: network partition
+	 CP --x A: heartbeat
+	 CP --x A: heartbeat
+
+	 Note over CP: relocate tenant to avoid downtime
+	 CP -->> B: attach tenant
+   activate B
+	 B -->> CP: start txn
+   Note over CP: mark A's txn 23 as RejectPending
+	 CP -->> B: txn=24, last-committed txn is 22
+	 B -->> S3: PUT X.layer.24<br>PUT index_part.json.24 referencing X.layer.24
+	 B -->> CP: request commit
+	 CP -->> B: granted
+   B -->> CP: start txn
+  CP -->> B: txn=25, last_committed_txn=22
+
+   A -->> S3: PUT Y.layer.23 <br> PUT index_part.json.23 referencing Y.layer.23
+  A --x CP: request commit
+	 A --x CP: request commit
+
+   Note over CP,A: partition is over
+
+   A -->> CP: request commit
+
+   Note over CP: most recently started txn is 25, not 23, reject
+
+   CP -->> A: reject
+   A -->> CP: acknowledge reject
+
+   Note over CP: mark A's txn 23 as RejectAcknowledged
+
+  deactivate A
+
+  B -->> S3: PUT 000-FFF_X-Y.layer.25<br>...
+
+  deactivate B
+
+
+```
+
+If a Pageserver gets a rejection to a commit request, it acknowledges the rejection and ceases further S3 uploads for the tenant, until it receives a `/detach` request for the tenant (control plane has most likely attached the tenant to another pageserver in the meantime).
+
+In practice, Control Plane will probably extend the commit grant/reject schema above, taking into account the pageserver to which it last attached the tenant. In the above example, Control Plane could remember that the pageserver that is supposed to host the tenant is pageserver B, and reject start-txn and commit requests from pageserver A. It would also use such requests from A as a signal that A is reachable again, and retry the `/detach` .
+
+<aside>
+💡 A commit failure causes the tenant to become effectively `Broken`. Pageserver should persist this locally so it doesn’t bother ControlPlane for a new txn when Pageserver is restarted.
+
+</aside>
+
+## Visibility
+
+We mentioned earlier that once a transaction commits, its changes are visible to subsequent transactions. But how does a given transaction know where to look for the data? There is no longer a single `index_part.json` per timeline, or a single `timelines/:timeline_id` prefix to look for; they’re all multi-versioned, suffixed by the txn number.
+The solution is: at transaction start, Pageserver receives the last-committed transaction ID from Control Plane (`last_committed_txn` in the diagram). last_committed_txn is the upper bound for what is visible for the current transaction. Control Plane keeps track of each open transaction’s last_committed_txn for purposes of garbage collection (see later paragraph).
+Equipped with last_committed_txn, Pageserver then discovers
+
+- the current index part of a timeline at `tenants/:tenant_id/timelines/:timeline_id/index_part.json.$last_committed_txn`. The `index_part.json.$last_committed_txn` has the exact same contents as the current architecture’s index_part.json, i.e. full list of layers.
+- the list of existent timelines as part of the `attach` RPC from CP;
+
+There is no other S3 state per tenant, so, that’s all the visibility required.
+An alternative to receiving the list of existent timelines from CP is to introduce a proper **SetOfTimelines** object in S3, and multi-version it just like above. For example, we could have a `tenants/:tenant_id/timelines.json.$txn` file that references `index_part.json.$last_committed_txn`. It can be added later if more separation between CP and PS is desired.
+
+So, the only MVCC’ed object types in this proposal are LayerFile and IndexPart (=individual timeline), but not the SetOfTimelines in a given tenant. Is this a problem? For example, the Pageserver’s garbage collection code needs to know the full set of timelines of a tenant. Otherwise it’ll make incorrect decisions. What if Pageserver A knows about timelines {R,S}, but another Pageserver B created an additional branch T, so, its set of timelines is {R,S,T}. Both pageservers will run GC code, and so, PS A may decide to delete a layer that’s still needed for branch T. Not a problem with this proposal, because the effect of GC (i.e., layer deletion) is properly MVCC’ed.
+
+## Longevity Of Transactions & Availability
+
+Pageserver depends on Control Plane to start a new transaction. If ControlPlane is down, no new transactions can be started.
+
+Pageservers commit transactions based on a maximum amount of uncommitted changes that have accumulated in S3. A lower maximum increases dependence and load on ControlPlane which decreases availability. A higher maximum risks losing more work in the event of failover; the work will have to be re-done in a new transaction on the new node.
+
+Pageservers persist the open txn id in local storage, so that they can resume the transaction after restart, without dependence on Control Plane.
+
+## **Operations**
+
+**PUTs:**
+
+- **layer files**
+    - current architecture: layer files are supposed to be write-once, but actually, there are edge-cases where we PUT the same layer file name twice; namely if we PUT the file to S3 but crash before uploading the index part that references it; then detach + attach, and re-run compaction, which is non-deterministic.
+    - this proposal: with transactions, we can now upload layers and index_part.json concurrently, just need to make sure layer file upload is done before we request txn commit.
+- **index part** upload: `index_part.json.$txn` may be created and subsequently overwritten multiple times in a transaction; it is an availability/work-loss trade-off how often to request a commit from CP.
+
+**DELETEs**: for deletion, we maintain a deadlist per transaction. It is located at `tenants/:tenant_id/deadlist/deadlist.json.$txn`. It is PUT once before the pageserver requests commit, and not changed after sending the request to commit. An object created in the current txn need not (but can) be on the deadlist — it can be DELETEd immediately because it’s not visible to other transactions. An example use case would be an L0 layer that gets compacted within one transaction; or, if we ever start MVCC’ing the set of timelines of a tenant, a short-lived branch that is created & destroyed within one transaction.
+
+<aside>
+☝ **Deadlist Invariant:** if an object is on a deadlist of transaction T, it is not referenced from anywhere else in the full state visible to T or any later started transaction > T.
+
+</aside>
+
+### Rationale For Deadlist.json
+
+Given that this proposal only MVCC’s layers and indexparts, one may ask why the deadlist isn’t part of indexpart. The reason is to not lose generality: the deadlist is just a list of keys; it is not necessary to understand the data format of the versioned object to process the deadlist. This is important for garbage collection / vacuuming, which we’ll come to in the next section.
+
+## Garbage Collection / Vacuuming
+
+After a transaction has reached reject-acknowledged state,  Control Plane initiates a garbage collection procedure for the aborted transaction.
+
+Control Plane is in the unique position about transaction states. Here is a sketch of the exact transaction states and what Control Plane keeps track of.
+
+```
+struct Tenant {
+  ...
+
+  txns: HashMap<TxnId, Transaction>,
+  // the most recently started txn's id; only the most recently started can win
+  next_winner_txn: Option<TxnId>,
+}
+struct Transaction {
+  id: TxnId, // immutable
+  last_committed_txn: TxnId, // immutable; the most recent txn in state `Committed`
+                             // when self was started
+  pageserver_id: PageserverId,
+  state: enum {
+    Open,
+    Committed,
+    RejectPending,
+    RejectAcknowledged, // invariant: we know all S3 activity has ceased
+    GarbageCollected,
+  }
+}
+```
+
+Object creations & deletions by a rejected transaction have never been visible to other transactions. That is true for both RejectPending and RejectAcknowledged states. The difference is that, in RejectPending, the pageserver may still be uploading to S3, whereas in RejectAcknowledged, Control Plane can be certain that all S3 activity in the name of that transaction has ceased. So, once a transaction reaches the RejectAcknowledged state, it is safe to DELETE all objects created by that transaction, and discard the transaction’s deadlists.
+
+A transaction T in state Committed has subsequent transactions that may or may not reference the objects it created. None of the subsequent transactions can reference the objects on T’s deadlist, though, as per the Deadlist Invariant (see previous section).
+
+So, for garbage collection, we need to assess transactions in state Committed and RejectAcknowledged:
+
+- Committed: delete objects on the deadlist.
+    - We don’t need a LIST request here, the deadlist is sufficient. So, it’s really cheap.
+    - This is **not true MVCC garbage collection**; by deleting the objects on Committed transaction T ’s deadlist, we might delete data referenced by other transactions that were concurrent with T, i.e., they started while T was still open. However, the fact that T is committed means that the other transactions are RejectPending or RejectAcknowledged, so, they don’t matter. Pageservers executing these doomed RejectPending transactions must handle 404 for GETs gracefully, e.g., by trying to commit txn so they observe the rejection they’re destined to get anyways. 404’s for RejectAcknowledged is handled below.
+- RejectAcknowledged: delete all objects created in that txn, and discard deadlists.
+    - 404s / object-already-deleted type messages must be expected because of Committed garbage collection (see above)
+    - How to get this list of objects created in a txn? Open but solvable design question; Ideas:
+        - **Brute force**: within tenant prefix, search for all keys ending in `.$txn` and delete them.
+        - **WAL for PUTs**: before a txn PUTs an object, it logs to S3, or some other equivalently durable storage, that it’s going to do it. If we log to S3, this means we have to do an additional WAL PUT per “real” PUT.
+        - **LIST with reorg’ed S3 layout (preferred one right now):** lay out the S3 key space such that `$txn` comes first, i.e., `tenants/:tenant_id/$txn/timelines/:timeline_id/*.json.$txn`. That way, when we need to GC a RejectAcknowledged txn, we just LIST the entire `tenants/:tenant_id/$txn` prefix and delete it. The cost of GC for RejectAcknowledged transactions is thus proportional to the number of objects created in that transaction.
+
+## Branches
+
+This proposal only MVCC’s layer files and index_part.json, but leaves the tenant object not-MVCCed. We argued earlier that it’s fine to ignore this for now, because
+
+1. Control Plane can act as source-of-truth for the set of timelines, and
+2. The only operation that makes decision based on “set of timelines” is GC, which in turn only does layer deletions, and layer deletions ***are*** properly MVCC’ed.
+
+Now that we’ve introduced garbage collection, let’s elaborate a little more on (2). Recall our example from earlier: Pageserver A knows about timelines {R,S}, but another Pageserver B created an additional branch T, so, its set of timelines is {R,S,T}. Both pageservers will run GC code, and so, PS A may decide to delete a layer that’s still needed for branch T.
+
+How does the MVCC’ing of layer files protect us here? If A decides to delete that layer, it’s just on A’s transaction’s deadlist, but still present in S3 and usable by B. If A commits first, B won’t be able to commit and the layers in timeline T will be vacuumed. If B commits first, A’s deadlist is discarded and the layer continues to exist.
+
+## Safekeeper Changes
+
+We need to teach the safekeepers that there can be multiple pageservers requesting WAL for the same timeline, in order to prevent premature WAL truncation.
+
+In the current architecture, the Safekeeper service currently assumes only one Pageserver and is allowed to prune WAL older than that Pageserver’s `remote_consistent_lsn`. Safekeeper currently learns the `remote_consistent_lsn` through the walreceiver protocol.
+
+So, if we have a tenant attached to two pageservers at the same time, they will both try to stream WAL and the Safekeeper will get confused about which connection’s `remote_consistent_lsn` to use as a basis for WAL pruning.
+
+What do we need to change to make it work? We need to make sure that the Safekeepers only prune WAL up to the `remote_consistent_lsn` of the last-committed transaction.
+
+The straight-forward way to get it is to re-design WAL pruning as follows:
+
+1. Pageserver reports remote_consistent_lsn as part of transaction commit to Control Plane.
+2. Control Plane makes sure transaction state update is persisted.
+3. Control Plane (asynchronous to transaction commit) reconciles with Safekeepers to ensure WAL pruning happens.
+
+The above requires non-trivial changes, but, in the light of other planned projects such as restore-tenant-from-safekeeper-wal-backups, I think Control Plane will need to get involved in WAL pruning anyways.
+
+# How This Proposal Unlocks Future Features
+
+Let us revisit the example from the introduction where we were thinking about handling network partitions. Network partitions need to be solved first, because they’re unavoidable in distributed systems. We did that. Now let’s see how we can solve actual product problems:
+
+## **Fast, Zero-Toil Failover on Network Partitions or Instance Failure**
+
+The “Problem Statement” section outlined the current architecture’s problems with regards to network partitions or instance failure: it requires a 100% correct node-dead detector to make decisions, which doesn’t exist in reality. We rely instead on human toil: an oncall engineer has to inspect the situation and make a decision, which may be incorrect and in any case take time in the order of minutes, which means equivalent downtime for users.
+
+With this proposal, automatic failover for pageservers is trivial:
+
+If a pageserver is unresponsive from Control Plane’s / Compute’s perspective, Control Plane does the following:
+
+- attach all tenants of the unresponsive pageserver to new pageservers
+- switch over these tenants’ computes immediately;
+
+At this point, availability is restored and user pain relieved.
+
+What’s left is to somehow close the doomed transaction of the unresponsive pageserver, so that it becomes RejectAcknowledged, and GC can make progress. Since S3 is cheap, we can afford to wait a really long time here, especially if we put a soft bound on the amount of data a transaction may produce before it must commit. Procedure:
+
+1. Ensure the unresponsive pageserver is taken out of rotation for new attachments. That probably should happen as part of the routine above.
+2. Have a human operator investigate and decide what to do (next morning, NO ONCALL ALERT):
+    1. Inspect the instance, investigate logs, understand root cause.
+    2. Try to re-establish connectivity between pageserver and Control Plane so that pageserver can retry commits, get rejected, ack rejection ⇒ enable GC.
+    3. Use the procedure below to decommission the pageserver.
+
+### Decommissioning A Pageserver (Dead or Alive-but-Unresponsive)
+
+The solution, enabled by this proposal:
+
+1. Ensure that pageserver’s S3 credentials are revoked so that it cannot make new uploads, which wouldn’t be tracked anywhere.
+2. Let enough time pass for the S3 credential revocation to propagate. Amazon doesn’t give a guarantee here. As stated earlier, we can easily afford to wait here.
+3. Mark all Open and RejectPending transactions of that pageserver as RejectAcknowledged.
+
+Revocation of the S3 credentials is required so that, once we transition all the transactions of that pageserver to RejectAcknowledged, one garbage-collection pass is guaranteed to delete all objects that will ever exist for that pageserver. That way, we need not check *GarbageCollected* transactions ever again.
+
+## Workflow: Zero-Downtime Relocation
+
+With zero-downtime relocation, the goal is to have the target pageserver warmed up, i.e., at the same `last_record_lsn` as the source pageserver, before switching over Computes from source to target pageserver.
+
+With this proposal, it works like so:
+
+1. Grant source pageserver its last open transaction. This one is doomed to be rejected later, unless the relocation fails.
+2. Grant target pageserver its first open transaction.
+3. Have target pageserver catch up on WAL, streaming from last-committed-txn’s remote_consistent_lsn onwards.
+4. Once target pageserver reports `last_record_lsn` close enough to source pageserver, target pageserver requests commit.
+5. Drain compute traffic from source to target pageserver. (Source can still answer requests until it tries to commit and gets rejected, so, this will be quite smooth).
+
+Note that as soon as we complete step (4), the source pageserver’s transaction is doomed to be rejected later. Conversely, if the target can’t catch up fast enough, the source will make a transaction commit earlier. This will generally happen if there is a lot of write traffic coming in. The design space to make things smooth here is large, but well explored in other areas of computing, e.g., VM live migration. We have all the important policy levers at hand, e.g.,
+
+- delaying source commits if we see target making progress
+- slowing down source consumption (need some signalling mechanism for it)
+- slowing down compute wal generation
+- …
+
+It doesn’t really matter, what’s important is that two pageservers can overlap.
+
+# Additional Trade-Offs / Remarks Brought Up During Peer Review
+
+This proposal was read by and discussed with @Stas and @Dmitry Rodionov prior to publishing it with the broader team. (This does not mean they endorse this proposal!).
+
+Issues that we discussed:
+
+1. **Frequency of transactions:** If even idle tenants commit every 10min or so, that’s quite a lot of load on Control Plane. Can we minimize it by Equating Transaction Commit Period to Attachment Period? I.e. start txn on attach, commit on detach?
+    1. Would be nice, but, if a tenant is attached for 1 month, then PS dies, we lose 1 month of work.
+    2. ⇒ my solution to this problem: Adjusted this proposal to make transaction commit frequency proportional to amount of uncommitted data.
+        1. It’s ok to spend resources on active users, they pay us money to do it!
+        2. The amount of work per transaction is minimal.
+            1. In current Control Plane, it’s a small database transaction that is super unlikely to conflict with other transactions.
+            2. I have very few concerns about scalability of the commit workload on CP side because it's trivially horizontally scalable by sharding by tenant.
+        3. There's no super stringent availability requirement on control plane; if a txn can't commit because it can't reach the CP, PS can continue & retry in the background, speculating that it's CP downtime and not PS-partitioned-off scenario.
+        4. Without stringent availability requirement, there's flexibility for future changes to CP-side-implementation.
+2. **Does this proposal address mirroring / no-performance-degradation failover?**
+    1. No it doesn’t. It only provides the building block for attaching a tenant to a new pageserver without having to worry that the tenant is detached on the old pageserver.
+    2. A simple scheme to build no-performance-degradation failover on top of this proposal is to have an asynchronous read-only replica of a tenant on another pageserver in the same region.
+    3. Another more ambitious scheme to get no-performance-degradation would be [One-Pager: Layer File Spreading (Christian)](https://www.notion.so/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843?pvs=21); this proposal would be used in layer file spreading for risk-free automation of TenantLeader failover, which hasn’t been addressed there.
+    4. In any case, failover would restart from an older S3 state, and need to re-ingest WAL before being able to serve recently written pages.
+        1. Is that a show-stopper? I think not.
+        2. Is it suboptimal? Absolutely: if a pageserver instance fails, all its tenants will be distributed among the remaining pageservers (OK), and all these tenants will ask the safekeepers for WAL at the same time (BAD). So, pageserver instance failure will cause a load spike in safekeepers.
+            1. Personally I think that’s an OK trade-off to make.
+            2. There are countless options to avoid / mitigate the load spike. E.g., pro-actively streaming WAL to the standby read-only replica.
+
+3. **Does this proposal allow multiple writers for a tenant?**
+    1. In abstract terms, this proposal provides a linearized history for a given S3 prefix.
+    2. In concrete terms, this proposal provides a linearized history per tenant.
+    3. There can be multiple writers at a given time, but only one of them will win to become part of the linearized history.
+4. **Alternative ideas mentioned during meetings that should be turned into a written proposal like this one:**
+    1. @Dmitry Rodionov : having linearized storage of index_part.json in some database that allows serializable transactions / atomic compare-and-swap PUT
+    2. @Dmitry Rodionov :
+    3. @Stas : something like this scheme, but somehow find a way to equate attachment duration with transaction duration, without losing work if pageserver dies months after attachment.
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -10,6 +10,9 @@ chrono.workspace = true
 serde.workspace = true
 serde_with.workspace = true
 serde_json.workspace = true
+regex.workspace = true

 utils = { path = "../utils" }
+remote_storage = { version = "0.1", path = "../remote_storage/" }
+
 workspace_hack.workspace = true
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -107,7 +107,6 @@ pub struct ComputeMetrics {
    pub num_ext_downloaded: u64,
    pub largest_ext_size: u64, // these are measured in bytes
    pub total_ext_download_size: u64,
-    pub prep_extensions_ms: u64,
 }

 /// Response of the `/computes/{compute_id}/spec` control-plane API.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -3,11 +3,16 @@
 //! The spec.json file is used to pass information to 'compute_ctl'. It contains
 //! all the information needed to start up the right version of PostgreSQL,
 //! and connect it to the storage nodes.
+use std::collections::HashMap;
+
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

+use regex::Regex;
+use remote_storage::RemotePath;
+
 /// String type alias representing Postgres identifier and
 /// intended to be used for DB / role names.
 pub type PgIdent = String;
@@ -61,8 +66,55 @@ pub struct ComputeSpec {
    /// the pageserver and safekeepers.
    pub storage_auth_token: Option<String>,

-    // list of prefixes to search for custom extensions in remote extension storage
+    // information about available remote extensions
+    pub remote_extensions: Option<RemoteExtSpec>,
+}
+
+/// Information about the extensions available in remote extension storage,
+/// carried in `ComputeSpec::remote_extensions`.
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub struct RemoteExtSpec {
+    // NOTE(review): `public_extensions` / `custom_extensions` are lists of
+    // extension names; presumably "available to everyone" vs. "enabled for
+    // this tenant only" -- confirm against the control-plane contract.
+    pub public_extensions: Option<Vec<String>>,
    pub custom_extensions: Option<Vec<String>>,
+    /// Maps a library name (with any `.so*` suffix stripped) to the name of
+    /// the extension that provides it; consulted by `get_ext` for libraries.
+    pub library_index: HashMap<String, String>,
+    /// Maps an extension name to its control files and archive location.
+    pub extension_data: HashMap<String, ExtensionData>,
+}
+
+/// Per-extension entry of `RemoteExtSpec::extension_data`.
+#[derive(Clone, Debug, Serialize, Deserialize)]
+pub struct ExtensionData {
+    /// Control-file contents keyed by control-file name (e.g. `anon.control`).
+    pub control_data: HashMap<String, String>,
+    /// Location of the extension archive; `RemoteExtSpec::get_ext` parses it
+    /// into a `RemotePath`.
+    pub archive_path: String,
+}
+
+impl RemoteExtSpec {
+    /// Resolves `ext_name` to the extension that ships it and to the remote
+    /// path of that extension's archive.
+    ///
+    /// When `is_library` is true, `ext_name` is treated as a shared-library
+    /// file name: everything from the first `.so` onwards is stripped and the
+    /// remainder is looked up in `library_index` to find the owning extension.
+    /// Otherwise `ext_name` is used as the extension name directly.
+    ///
+    /// Returns the resolved extension name together with its archive path.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the library is missing from `library_index`, if the extension
+    /// is missing from `extension_data`, or if `archive_path` cannot be
+    /// converted into a `RemotePath`.
+    pub fn get_ext(
+        &self,
+        ext_name: &str,
+        is_library: bool,
+    ) -> anyhow::Result<(String, RemotePath)> {
+        let mut real_ext_name = ext_name;
+        if is_library {
+            // sometimes library names might have a suffix like
+            // library.so or library.so.3. We strip this off
+            // because library_index is based on the name without the file extension
+            let strip_lib_suffix = Regex::new(r"\.so.*").unwrap();
+            let lib_raw_name = strip_lib_suffix.replace(real_ext_name, "").to_string();
+
+            real_ext_name = self
+                .library_index
+                .get(&lib_raw_name)
+                // Build the error lazily so a successful lookup does not pay
+                // for formatting a message that is never used.
+                .ok_or_else(|| anyhow::anyhow!("library {} is not found", lib_raw_name))?;
+        }
+
+        match self.extension_data.get(real_ext_name) {
+            Some(ext_data) => Ok((
+                real_ext_name.to_string(),
+                RemotePath::from_string(&ext_data.archive_path)?,
+            )),
+            None => Err(anyhow::anyhow!(
+                "real_ext_name {} is not found",
+                real_ext_name
+            )),
+        }
+    }
 }

 #[serde_as]
--- a/libs/compute_api/tests/cluster_spec.json
+++ b/libs/compute_api/tests/cluster_spec.json
@@ -205,5 +205,43 @@
            "name": "zenith new",
            "new_name": "zenith \"new\""
        }
-    ]
+    ],
+    "remote_extensions": {
+        "library_index": {
+          "anon": "anon",
+          "postgis-3": "postgis",
+          "libpgrouting-3.4": "postgis",
+          "postgis_raster-3": "postgis",
+          "postgis_sfcgal-3": "postgis",
+          "postgis_topology-3": "postgis",
+          "address_standardizer-3": "postgis"
+        },
+        "extension_data": {
+          "anon": {
+            "archive_path": "5834329303/v15/extensions/anon.tar.zst",
+            "control_data": {
+              "anon.control": "# PostgreSQL Anonymizer (anon) extension\ncomment = ''Data anonymization tools''\ndefault_version = ''1.1.0''\ndirectory=''extension/anon''\nrelocatable = false\nrequires = ''pgcrypto''\nsuperuser = false\nmodule_pathname = ''$libdir/anon''\ntrusted = true\n"
+            }
+          },
+          "postgis": {
+            "archive_path": "5834329303/v15/extensions/postgis.tar.zst",
+            "control_data": {
+              "postgis.control": "# postgis extension\ncomment = ''PostGIS geometry and geography spatial types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis-3''\nrelocatable = false\ntrusted = true\n",
+              "pgrouting.control": "# pgRouting Extension\ncomment = ''pgRouting Extension''\ndefault_version = ''3.4.2''\nmodule_pathname = ''$libdir/libpgrouting-3.4''\nrelocatable = true\nrequires = ''plpgsql''\nrequires = ''postgis''\ntrusted = true\n",
+              "postgis_raster.control": "# postgis_raster extension\ncomment = ''PostGIS raster types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis_raster-3''\nrelocatable = false\nrequires = postgis\ntrusted = true\n",
+              "postgis_sfcgal.control": "# postgis topology extension\ncomment = ''PostGIS SFCGAL functions''\ndefault_version = ''3.3.2''\nrelocatable = true\nrequires = postgis\ntrusted = true\n",
+              "postgis_topology.control": "# postgis topology extension\ncomment = ''PostGIS topology spatial types and functions''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = topology\nrequires = postgis\ntrusted = true\n",
+              "address_standardizer.control": "# address_standardizer extension\ncomment = ''Used to parse an address into constituent elements. Generally used to support geocoding address normalization step.''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n",
+              "postgis_tiger_geocoder.control": "# postgis tiger geocoder extension\ncomment = ''PostGIS tiger geocoder and reverse geocoder''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = tiger\nrequires = ''postgis,fuzzystrmatch''\nsuperuser= false\ntrusted = true\n",
+              "address_standardizer_data_us.control": "# address standardizer us dataset\ncomment = ''Address Standardizer US dataset example''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n"
+            }
+          }
+        },
+        "custom_extensions": [
+          "anon"
+        ],
+        "public_extensions": [
+          "postgis"
+        ]
+      }
 }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -189,8 +189,6 @@ impl S3Bucket {
        let kind = RequestKind::Get;
        let permit = self.owned_permit(kind).await;

-        metrics::inc_get_object();
-
        let started_at = start_measuring_requests(kind);

        let get_object = self
@@ -205,7 +203,6 @@ impl S3Bucket {
        let started_at = ScopeGuard::into_inner(started_at);

        if get_object.is_err() {
-            metrics::inc_get_object_fail();
            metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
                kind,
                AttemptOutcome::Err,
@@ -337,7 +334,6 @@ impl RemoteStorage for S3Bucket {

        loop {
            let _guard = self.permit(kind).await;
-            metrics::inc_list_objects();
            let started_at = start_measuring_requests(kind);

            let fetch_response = self
@@ -350,10 +346,6 @@ impl RemoteStorage for S3Bucket {
                .set_max_keys(self.max_keys_per_list_response)
                .send()
                .await
-                .map_err(|e| {
-                    metrics::inc_list_objects_fail();
-                    e
-                })
                .context("Failed to list S3 prefixes")
                .map_err(DownloadError::Other);

@@ -395,7 +387,6 @@ impl RemoteStorage for S3Bucket {
        let mut all_files = vec![];
        loop {
            let _guard = self.permit(kind).await;
-            metrics::inc_list_objects();
            let started_at = start_measuring_requests(kind);

            let response = self
@@ -407,10 +398,6 @@ impl RemoteStorage for S3Bucket {
                .set_max_keys(self.max_keys_per_list_response)
                .send()
                .await
-                .map_err(|e| {
-                    metrics::inc_list_objects_fail();
-                    e
-                })
                .context("Failed to list files in S3 bucket");

            let started_at = ScopeGuard::into_inner(started_at);
@@ -443,7 +430,6 @@ impl RemoteStorage for S3Bucket {
        let kind = RequestKind::Put;
        let _guard = self.permit(kind).await;

-        metrics::inc_put_object();
        let started_at = start_measuring_requests(kind);

        let body = Body::wrap_stream(ReaderStream::new(from));
@@ -458,11 +444,7 @@ impl RemoteStorage for S3Bucket {
            .content_length(from_size_bytes.try_into()?)
            .body(bytes_stream)
            .send()
-            .await
-            .map_err(|e| {
-                metrics::inc_put_object_fail();
-                e
-            });
+            .await;

        let started_at = ScopeGuard::into_inner(started_at);
        metrics::BUCKET_METRICS
@@ -519,7 +501,6 @@ impl RemoteStorage for S3Bucket {
        }

        for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
-            metrics::inc_delete_objects(chunk.len() as u64);
            let started_at = start_measuring_requests(kind);

            let resp = self
@@ -537,8 +518,10 @@ impl RemoteStorage for S3Bucket {

            match resp {
                Ok(resp) => {
+                    metrics::BUCKET_METRICS
+                        .deleted_objects_total
+                        .inc_by(chunk.len() as u64);
                    if let Some(errors) = resp.errors {
-                        metrics::inc_delete_objects_fail(errors.len() as u64);
                        return Err(anyhow::format_err!(
                            "Failed to delete {} objects",
                            errors.len()
@@ -546,7 +529,6 @@ impl RemoteStorage for S3Bucket {
                    }
                }
                Err(e) => {
-                    metrics::inc_delete_objects_fail(chunk.len() as u64);
                    return Err(e.into());
                }
            }
@@ -555,32 +537,8 @@ impl RemoteStorage for S3Bucket {
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        let kind = RequestKind::Delete;
-        let _guard = self.permit(kind).await;
-
-        metrics::inc_delete_object();
-        let started_at = start_measuring_requests(kind);
-
-        let res = self
-            .client
-            .delete_object()
-            .bucket(self.bucket_name.clone())
-            .key(self.relative_path_to_s3_object(path))
-            .send()
-            .await
-            .map_err(|e| {
-                metrics::inc_delete_object_fail();
-                e
-            });
-
-        let started_at = ScopeGuard::into_inner(started_at);
-        metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
-
-        res?;
-
-        Ok(())
+        let paths = std::array::from_ref(path);
+        self.delete_objects(paths).await
    }
 }

--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -1,4 +1,6 @@
-use metrics::{register_histogram_vec, register_int_counter_vec, Histogram, IntCounter};
+use metrics::{
+    register_histogram_vec, register_int_counter, register_int_counter_vec, Histogram, IntCounter,
+};
 use once_cell::sync::Lazy;

 pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);
@@ -125,41 +127,22 @@ impl PassFailCancelledRequestTyped<Histogram> {
 }

 pub(super) struct BucketMetrics {
-    /// Total requests attempted
-    // TODO: remove after next release and migrate dashboards to `sum by (result) (remote_storage_s3_requests_count)`
-    requests: RequestTyped<IntCounter>,
-    /// Subset of attempted requests failed
-    // TODO: remove after next release and migrate dashboards to `remote_storage_s3_requests_count{result="err"}`
-    failed: RequestTyped<IntCounter>,
-
+    /// Full request duration until successful completion, error or cancellation.
    pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
+    /// Total amount of seconds waited on queue.
    pub(super) wait_seconds: RequestTyped<Histogram>,

    /// Track how many semaphore awaits were cancelled per request type.
    ///
    /// This is in case cancellations are happening more than expected.
    pub(super) cancelled_waits: RequestTyped<IntCounter>,
+
+    /// Total amount of deleted objects in batches or single requests.
+    pub(super) deleted_objects_total: IntCounter,
 }

 impl Default for BucketMetrics {
    fn default() -> Self {
-        let requests = register_int_counter_vec!(
-            "remote_storage_s3_requests_count",
-            "Number of s3 requests of particular type",
-            &["request_type"],
-        )
-        .expect("failed to define a metric");
-        let requests =
-            RequestTyped::build_with(|kind| requests.with_label_values(&[kind.as_str()]));
-
-        let failed = register_int_counter_vec!(
-            "remote_storage_s3_failures_count",
-            "Number of failed s3 requests of particular type",
-            &["request_type"],
-        )
-        .expect("failed to define a metric");
-        let failed = RequestTyped::build_with(|kind| failed.with_label_values(&[kind.as_str()]));
-
        let buckets = [0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];

        let req_seconds = register_histogram_vec!(
@@ -192,52 +175,17 @@ impl Default for BucketMetrics {
        let cancelled_waits =
            RequestTyped::build_with(|kind| cancelled_waits.with_label_values(&[kind.as_str()]));

+        let deleted_objects_total = register_int_counter!(
+            "remote_storage_s3_deleted_objects_total",
+            "Amount of deleted objects in total",
+        )
+        .unwrap();
+
        Self {
-            requests,
-            failed,
            req_seconds,
            wait_seconds,
            cancelled_waits,
+            deleted_objects_total,
        }
    }
 }
-
-pub fn inc_get_object() {
-    BUCKET_METRICS.requests.get(Get).inc()
-}
-
-pub fn inc_get_object_fail() {
-    BUCKET_METRICS.failed.get(Get).inc()
-}
-
-pub fn inc_put_object() {
-    BUCKET_METRICS.requests.get(Put).inc()
-}
-
-pub fn inc_put_object_fail() {
-    BUCKET_METRICS.failed.get(Put).inc()
-}
-
-pub fn inc_delete_object() {
-    BUCKET_METRICS.requests.get(Delete).inc()
-}
-
-pub fn inc_delete_objects(count: u64) {
-    BUCKET_METRICS.requests.get(Delete).inc_by(count)
-}
-
-pub fn inc_delete_object_fail() {
-    BUCKET_METRICS.failed.get(Delete).inc()
-}
-
-pub fn inc_delete_objects_fail(count: u64) {
-    BUCKET_METRICS.failed.get(Delete).inc_by(count)
-}
-
-pub fn inc_list_objects() {
-    BUCKET_METRICS.requests.get(List).inc()
-}
-
-pub fn inc_list_objects_fail() {
-    BUCKET_METRICS.failed.get(List).inc()
-}
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -71,6 +71,13 @@ impl UnreliableWrapper {
            }
        }
    }
+
+    /// Deletes `path` through the wrapped storage.
+    ///
+    /// When `attempt` is true the delete is first passed to `self.attempt`,
+    /// and an error from it is returned without calling the inner storage.
+    /// When `attempt` is false that step is skipped and the call goes straight
+    /// to `self.inner.delete`.
+    async fn delete_inner(&self, path: &RemotePath, attempt: bool) -> anyhow::Result<()> {
+        if attempt {
+            self.attempt(RemoteOp::Delete(path.clone()))?;
+        }
+        self.inner.delete(path).await
+    }
 }

 #[async_trait::async_trait]
@@ -122,15 +129,15 @@ impl RemoteStorage for UnreliableWrapper {
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        self.attempt(RemoteOp::Delete(path.clone()))?;
-        self.inner.delete(path).await
+        self.delete_inner(path, true).await
    }

    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
        self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
        let mut error_counter = 0;
        for path in paths {
-            if (self.delete(path).await).is_err() {
+            // Dont record attempt because it was already recorded above
+            if (self.delete_inner(path, false).await).is_err() {
                error_counter += 1;
            }
        }
--- a/libs/utils/src/backoff.rs
+++ b/libs/utils/src/backoff.rs
@@ -0,0 +1,188 @@
+use std::fmt::{Debug, Display};
+
+use futures::Future;
+
+pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
+pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
+
+/// Sleeps for the backoff duration that `exponential_backoff_duration_seconds`
+/// computes for retry number `n`, logging the wait at info level first.
+///
+/// Returns immediately, without logging, when the computed duration is not
+/// positive (which is the case for `n == 0`).
+pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
+    let backoff_duration_seconds =
+        exponential_backoff_duration_seconds(n, base_increment, max_seconds);
+    if backoff_duration_seconds > 0.0 {
+        tracing::info!(
+            "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
+        );
+        tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
+    }
+}
+
+/// Computes the backoff, in seconds, for retry number `n`.
+///
+/// Returns `0.0` for `n == 0`; otherwise `(1.0 + base_increment)^n`, capped
+/// at `max_seconds`.
+pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
+    if n == 0 {
+        0.0
+    } else {
+        (1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
+    }
+}
+
+/// Retries the passed operation until one of the following happens:
+/// - it succeeds;
+/// - it fails with an error that `is_permanent` classifies as permanent (non-retryable);
+/// - the retries have been exhausted.
+///
+/// In the last two cases the error of the final attempt is returned.
+///
+/// `is_permanent` distinguishes permanent from transient errors.
+/// Failures are logged at info level while the number of retries so far is below
+/// `warn_threshold`, and at warn level after that until `max_retries` is reached.
+/// `description` is added to log messages; it should identify what `op` is doing.
+///
+/// Before each retry the function sleeps via `exponential_backoff` with the default
+/// base/max settings; the first retry is not delayed because the backoff for 0 is zero.
+///
+/// NOTE(review): the `warn_threshold` arm is checked before the `max_retries` arm, so
+/// the operation keeps being retried while the retry count is below `warn_threshold`
+/// even if that exceeds `max_retries`; callers presumably pass
+/// `warn_threshold <= max_retries` -- confirm.
+pub async fn retry<T, O, F, E>(
+    mut op: O,
+    is_permanent: impl Fn(&E) -> bool,
+    warn_threshold: u32,
+    max_retries: u32,
+    description: &str,
+) -> Result<T, E>
+where
+    // Not std::error::Error because anyhow::Error doesn't implement it.
+    // For context see https://github.com/dtolnay/anyhow/issues/63
+    E: Display + Debug,
+    O: FnMut() -> F,
+    F: Future<Output = Result<T, E>>,
+{
+    // Number of retries performed so far (0 while running the initial attempt).
+    let mut attempts = 0;
+    loop {
+        let result = op().await;
+        match result {
+            Ok(_) => {
+                if attempts > 0 {
+                    tracing::info!("{description} succeeded after {attempts} retries");
+                }
+                return result;
+            }
+
+            // These are "permanent" errors that should not be retried.
+            Err(ref e) if is_permanent(e) => {
+                return result;
+            }
+            // Assume that any other failure might be transient, and the operation might
+            // succeed if we just keep trying.
+            Err(err) if attempts < warn_threshold => {
+                tracing::info!("{description} failed, will retry (attempt {attempts}): {err:#}");
+            }
+            Err(err) if attempts < max_retries => {
+                tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
+            }
+            Err(ref err) => {
+                // All retries allowed by `max_retries` have been used up. Time to give up.
+                tracing::warn!(
+                    "{description} still failed after {attempts} retries, giving up: {err:?}"
+                );
+                return result;
+            }
+        }
+        // sleep and retry
+        exponential_backoff(
+            attempts,
+            DEFAULT_BASE_BACKOFF_SECONDS,
+            DEFAULT_MAX_BACKOFF_SECONDS,
+        )
+        .await;
+        attempts += 1;
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::io;
+
+    use tokio::sync::Mutex;
+
+    use super::*;
+
+    // With the default settings the backoff must never decrease as the retry
+    // number grows, and it must end up at the configured maximum.
+    #[test]
+    fn backoff_defaults_produce_growing_backoff_sequence() {
+        let mut current_backoff_value = None;
+
+        for i in 0..10_000 {
+            let new_backoff_value = exponential_backoff_duration_seconds(
+                i,
+                DEFAULT_BASE_BACKOFF_SECONDS,
+                DEFAULT_MAX_BACKOFF_SECONDS,
+            );
+
+            if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
+                assert!(
+                    old_backoff_value <= new_backoff_value,
+                    "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
+                )
+            }
+        }
+
+        assert_eq!(
+            current_backoff_value.expect("Should have produced backoff values to compare"),
+            DEFAULT_MAX_BACKOFF_SECONDS,
+            "Given big enough of retries, backoff should reach its allowed max value"
+        );
+    }
+
+    // An operation that always fails with a transient error is invoked twice
+    // (initial attempt plus one retry) when both thresholds are 1, and the
+    // error is then returned. The tokio clock starts paused so the backoff
+    // sleeps do not slow the test down.
+    #[tokio::test(start_paused = true)]
+    async fn retry_always_error() {
+        let count = Mutex::new(0);
+        let err_result = retry(
+            || async {
+                *count.lock().await += 1;
+                Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other))
+            },
+            |_e| false,
+            1,
+            1,
+            "work",
+        )
+        .await;
+
+        assert!(err_result.is_err());
+
+        assert_eq!(*count.lock().await, 2);
+    }
+
+    // The operation fails on its first two invocations and succeeds on the
+    // third; with both thresholds at 2 `retry` must return `Ok`.
+    #[tokio::test(start_paused = true)]
+    async fn retry_ok_after_err() {
+        let count = Mutex::new(0);
+        retry(
+            || async {
+                let mut locked = count.lock().await;
+                if *locked > 1 {
+                    Ok(())
+                } else {
+                    *locked += 1;
+                    Err(io::Error::from(io::ErrorKind::Other))
+                }
+            },
+            |_e| false,
+            2,
+            2,
+            "work",
+        )
+        .await
+        .unwrap();
+    }
+
+    // When every error is classified as permanent, the operation is invoked
+    // exactly once and its error is returned without any retry.
+    #[tokio::test(start_paused = true)]
+    async fn dont_retry_permanent_errors() {
+        let count = Mutex::new(0);
+        let _ = retry(
+            || async {
+                let mut locked = count.lock().await;
+                if *locked > 1 {
+                    Ok(())
+                } else {
+                    *locked += 1;
+                    Err(io::Error::from(io::ErrorKind::Other))
+                }
+            },
+            |_e| true,
+            2,
+            2,
+            "work",
+        )
+        .await
+        .unwrap_err();
+
+        assert_eq!(*count.lock().await, 1);
+    }
+}
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -111,6 +111,10 @@ pub fn fsync(path: &Path) -> io::Result<()> {
        .map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}")))
 }

+pub async fn fsync_async(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
+    tokio::fs::File::open(path).await?.sync_all().await
+}
+
 #[cfg(test)]
 mod tests {
    use tempfile::tempdir;
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -24,6 +24,20 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool>
    Ok(dir.next_entry().await?.is_none())
 }

+/// Lists the names of the entries directly inside `path` (non-UTF-8 names converted lossily).
+/// The order is whatever `read_dir` yields, so callers should sort if they need a stable order.
+pub async fn list_dir(path: impl AsRef<Path>) -> anyhow::Result<Vec<String>> {
+    let path = path.as_ref();
+    let mut dir = tokio::fs::read_dir(path)
+        .await
+        .with_context(|| format!("read_dir({})", path.display()))?;
+    let mut content = vec![];
+    while let Some(next) = dir.next_entry().await? {
+        content.push(next.file_name().to_string_lossy().into_owned());
+    }
+    Ok(content)
+}
+
 pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
    if e.kind() == io::ErrorKind::NotFound {
        Ok(())
@@ -43,7 +57,7 @@ where
 mod test {
    use std::path::PathBuf;

-    use crate::fs_ext::is_directory_empty;
+    use crate::fs_ext::{is_directory_empty, list_dir};

    use super::ignore_absent_files;

@@ -109,4 +123,25 @@ mod test {

        assert!(!file_path.exists());
    }
+
+    #[tokio::test]
+    async fn list_dir_works() {
+        let dir = tempfile::tempdir().unwrap();
+        let dir_path = dir.path();
+
+        assert!(list_dir(dir_path).await.unwrap().is_empty());
+
+        let file_path: PathBuf = dir_path.join("testfile");
+        let _ = std::fs::File::create(&file_path).unwrap();
+
+        assert_eq!(&list_dir(dir_path).await.unwrap(), &["testfile"]);
+
+        let another_dir_path: PathBuf = dir_path.join("testdir");
+        std::fs::create_dir(another_dir_path).unwrap();
+
+        let expected = &["testdir", "testfile"];
+        let mut actual = list_dir(dir_path).await.unwrap();
+        actual.sort();
+        assert_eq!(actual, expected);
+    }
 }
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -1,6 +1,8 @@
 //! `utils` is intended to be a place to put code that is shared
 //! between other crates in this repository.

+pub mod backoff;
+
 /// `Lsn` type implements common tasks on Log Sequence Numbers
 pub mod lsn;
 /// SeqWait allows waiting for a future sequence number to arrive
@@ -66,44 +68,6 @@ pub mod completion;
 /// Reporting utilities
 pub mod error;

-mod failpoint_macro_helpers {
-
-    /// use with fail::cfg("$name", "return(2000)")
-    ///
-    /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
-    /// specified time (in milliseconds). The main difference is that we use async
-    /// tokio sleep function. Another difference is that we print lines to the log,
-    /// which can be useful in tests to check that the failpoint was hit.
-    #[macro_export]
-    macro_rules! failpoint_sleep_millis_async {
-        ($name:literal) => {{
-            // If the failpoint is used with a "return" action, set should_sleep to the
-            // returned value (as string). Otherwise it's set to None.
-            let should_sleep = (|| {
-                ::fail::fail_point!($name, |x| x);
-                ::std::option::Option::None
-            })();
-
-            // Sleep if the action was a returned value
-            if let ::std::option::Option::Some(duration_str) = should_sleep {
-                $crate::failpoint_sleep_helper($name, duration_str).await
-            }
-        }};
-    }
-
-    // Helper function used by the macro. (A function has nicer scoping so we
-    // don't need to decorate everything with "::")
-    pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
-        let millis = duration_str.parse::<u64>().unwrap();
-        let d = std::time::Duration::from_millis(millis);
-
-        tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
-        tokio::time::sleep(d).await;
-        tracing::info!("failpoint {:?}: sleep done", name);
-    }
-}
-pub use failpoint_macro_helpers::failpoint_sleep_helper;
-
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -72,7 +72,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
        .await?;
    let cursor = BlockCursor::new(&file);
    for (k, v) in all {
-        let value = cursor.read_blob(v.pos())?;
+        let value = cursor.read_blob(v.pos()).await?;
        println!("key:{} value_len:{}", k, value.len());
    }
    // TODO(chi): special handling for last key?
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -6,11 +6,12 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr};

 use anyhow::{anyhow, Context};
 use clap::{Arg, ArgAction, Command};
-use fail::FailScenario;
+
 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::WALRECEIVER_RUNTIME;
+use pageserver::tenant::TenantSharedResources;
 use remote_storage::GenericRemoteStorage;
 use tokio::time::Instant;
 use tracing::*;
@@ -121,7 +122,7 @@ fn main() -> anyhow::Result<()> {
    }

    // Initialize up failpoints support
-    let scenario = FailScenario::setup();
+    let scenario = pageserver::failpoint_support::init();

    // Basic initialization of things that don't change after startup
    virtual_file::init(conf.max_file_descriptors);
@@ -373,7 +374,7 @@ fn start_pageserver(
    let order = pageserver::InitializationOrder {
        initial_tenant_load: Some(init_done_tx),
        initial_logical_size_can_start: init_done_rx.clone(),
-        initial_logical_size_attempt: init_logical_size_done_tx,
+        initial_logical_size_attempt: Some(init_logical_size_done_tx),
        background_jobs_can_start: background_jobs_barrier.clone(),
    };

@@ -382,8 +383,10 @@ fn start_pageserver(

    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
-        broker_client.clone(),
-        remote_storage.clone(),
+        TenantSharedResources {
+            broker_client: broker_client.clone(),
+            remote_storage: remote_storage.clone(),
+        },
        order,
    ))?;

--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -31,7 +31,9 @@ use utils::{
 use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
+use crate::tenant::{
+    TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
+};
 use crate::{
    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
    TIMELINE_UNINIT_MARK_SUFFIX,
@@ -613,6 +615,11 @@ impl PageServerConf {
        )
    }

+    pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
+        self.tenant_path(tenant_id)
+            .join(TENANT_DELETED_MARKER_FILE_NAME)
+    }
+
    pub fn traces_path(&self) -> PathBuf {
        self.workdir.join("traces")
    }
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -85,6 +85,7 @@
 //! The solution is that all code paths are infected with precisely one
 //! [`RequestContext`] argument. Functions in the middle of the call chain
 //! only need to pass it on.
+
 use crate::task_mgr::TaskKind;

 // The main structure of this module, see module-level comment.
@@ -92,6 +93,7 @@ use crate::task_mgr::TaskKind;
 pub struct RequestContext {
    task_kind: TaskKind,
    download_behavior: DownloadBehavior,
+    access_stats_behavior: AccessStatsBehavior,
 }

 /// Desired behavior if the operation requires an on-demand download
@@ -109,6 +111,67 @@ pub enum DownloadBehavior {
    Error,
 }

+/// Whether this request should update access times used in LRU eviction
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub(crate) enum AccessStatsBehavior {
+    /// Update access times: this request's access to data should be taken
+    /// as a hint that the accessed layer is likely to be accessed again
+    Update,
+
+    /// Do not update access times: this request is accessing the layer
+    /// but does not want to indicate that the layer should be retained in cache,
+    /// perhaps because the requestor is a compaction routine that will soon cover
+    /// this layer with another.
+    Skip,
+}
+
+pub struct RequestContextBuilder {
+    inner: RequestContext,
+}
+
+impl RequestContextBuilder {
+    /// A new builder with default settings
+    pub fn new(task_kind: TaskKind) -> Self {
+        Self {
+            inner: RequestContext {
+                task_kind,
+                download_behavior: DownloadBehavior::Download,
+                access_stats_behavior: AccessStatsBehavior::Update,
+            },
+        }
+    }
+
+    pub fn extend(original: &RequestContext) -> Self {
+        Self {
+            // This is like a Copy, but avoid implementing Copy because ordinary users of
+            // RequestContext should always move or ref it.
+            inner: RequestContext {
+                task_kind: original.task_kind,
+                download_behavior: original.download_behavior,
+                access_stats_behavior: original.access_stats_behavior,
+            },
+        }
+    }
+
+    /// Configure the DownloadBehavior of the context: whether to
+    /// download missing layers, and/or warn on the download.
+    pub fn download_behavior(mut self, b: DownloadBehavior) -> Self {
+        self.inner.download_behavior = b;
+        self
+    }
+
+    /// Configure the AccessStatsBehavior of the context: whether layer
+    /// accesses should update the access time of the layer.
+    pub(crate) fn access_stats_behavior(mut self, b: AccessStatsBehavior) -> Self {
+        self.inner.access_stats_behavior = b;
+        self
+    }
+
+    pub fn build(self) -> RequestContext {
+        self.inner
+    }
+}
+
 impl RequestContext {
    /// Create a new RequestContext that has no parent.
    ///
@@ -123,10 +186,9 @@ impl RequestContext {
    /// because someone explicitly canceled it.
    /// It has no parent, so it cannot inherit cancellation from there.
    pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
-        RequestContext {
-            task_kind,
-            download_behavior,
-        }
+        RequestContextBuilder::new(task_kind)
+            .download_behavior(download_behavior)
+            .build()
    }

    /// Create a detached child context for a task that may outlive `self`.
@@ -187,10 +249,7 @@ impl RequestContext {
    }

    fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
-        RequestContext {
-            task_kind,
-            download_behavior,
-        }
+        Self::new(task_kind, download_behavior)
    }

    pub fn task_kind(&self) -> TaskKind {
@@ -200,4 +259,8 @@ impl RequestContext {
    pub fn download_behavior(&self) -> DownloadBehavior {
        self.download_behavior
    }
+
+    pub(crate) fn access_stats_behavior(&self) -> AccessStatsBehavior {
+        self.access_stats_behavior
+    }
 }
--- a/pageserver/src/failpoint_support.rs
+++ b/pageserver/src/failpoint_support.rs
@@ -0,0 +1,86 @@
+/// use with fail::cfg("$name", "return(2000)")
+///
+/// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
+/// specified time (in milliseconds). The main difference is that we use async
+/// tokio sleep function. Another difference is that we print lines to the log,
+/// which can be useful in tests to check that the failpoint was hit.
+#[macro_export]
+macro_rules! __failpoint_sleep_millis_async {
+    ($name:literal) => {{
+        // If the failpoint is used with a "return" action, set should_sleep to the
+        // returned value (as string). Otherwise it's set to None.
+        let should_sleep = (|| {
+            ::fail::fail_point!($name, |x| x);
+            ::std::option::Option::None
+        })();
+
+        // Sleep if the action was a returned value
+        if let ::std::option::Option::Some(duration_str) = should_sleep {
+            $crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await
+        }
+    }};
+}
+pub use __failpoint_sleep_millis_async as sleep_millis_async;
+
+// Helper function used by the macro. (A function has nicer scoping so we
+// don't need to decorate everything with "::")
+#[doc(hidden)]
+pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
+    let millis = duration_str.parse::<u64>().unwrap();
+    let d = std::time::Duration::from_millis(millis);
+
+    tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
+    tokio::time::sleep(d).await;
+    tracing::info!("failpoint {:?}: sleep done", name);
+}
+
+/// Sets up `fail` and applies the failpoints from the `FAILPOINTS` env var ("name=actions;...").
+/// Panics if an entry lacks `=actions` or if `apply_failpoint` rejects it.
+pub fn init() -> fail::FailScenario<'static> {
+    // The failpoints lib provides support for parsing the `FAILPOINTS` env var.
+    // We want non-default behavior for `exit`, though, so, we handle it separately.
+    let actions = std::env::var("FAILPOINTS");
+    if actions.is_ok() {
+        // Remove it so that the library does not apply the same entries again in `setup`.
+        std::env::remove_var("FAILPOINTS");
+    } else {
+        // let the library handle non-utf8, or nothing for not present
+    }
+
+    let scenario = fail::FailScenario::setup();
+
+    if let Ok(val) = actions {
+        for (i, entry) in val.split(';').enumerate() {
+            // Tolerate surrounding whitespace and empty entries such as a trailing ';'.
+            let entry = entry.trim();
+            if entry.is_empty() {
+                continue;
+            }
+            let (name, actions) = match entry.split_once('=') {
+                Some(t) => t,
+                None => panic!(
+                    "startup failpoints: missing action on failpoint #{}; try `{entry}=return`",
+                    i + 1,
+                ),
+            };
+            if let Err(e) = apply_failpoint(name, actions) {
+                panic!("startup failpoints: failed to apply failpoint {name}={actions}: {e}");
+            }
+        }
+    }
+    scenario
+}
+
+pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
+    if actions == "exit" {
+        fail::cfg_callback(name, exit_failpoint)
+    } else {
+        fail::cfg(name, actions)
+    }
+}
+
+#[inline(never)]
+fn exit_failpoint() {
+    tracing::info!("Exit requested by failpoint");
+    std::process::exit(1);
+}
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -93,6 +93,47 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+    delete:
+      description: |
+        Attempts to delete the specified tenant. 500 and 409 errors should be retried until 404 is received.
+        404 means that the deletion has finished successfully.
+      responses:
+        "400":
+          description: Error when no tenant id found in path
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "404":
+          description: Tenant not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/NotFoundError"
+        "409":
+          description: Deletion is already in progress, continue polling
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ConflictError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"

  /v1/tenant/{tenant_id}/timeline:
    parameters:
@@ -820,6 +861,7 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+
  /v1/tenant/config:
    put:
      description: |
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -187,7 +187,7 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
                format!("Cannot delete timeline which has child timelines: {children:?}")
                    .into_boxed_str(),
            ),
-            a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
+            a @ AlreadyInProgress(_) => ApiError::Conflict(a.to_string()),
            Other(e) => ApiError::InternalServerError(e),
        }
    }
@@ -208,6 +208,19 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
    }
 }

+impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
+    fn from(value: crate::tenant::delete::DeleteTenantError) -> Self {
+        use crate::tenant::delete::DeleteTenantError::*;
+        match value {
+            Get(g) => ApiError::from(g),
+            e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
+            Timeline(t) => ApiError::from(t),
+            Other(o) => ApiError::InternalServerError(o),
+            e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
+        }
+    }
+}
+
 // Helper function to construct a TimelineInfo struct for a timeline
 async fn build_timeline_info(
    timeline: &Arc<Timeline>,
@@ -504,7 +517,6 @@ async fn timeline_delete_handler(
        .instrument(info_span!("timeline_delete", %tenant_id, %timeline_id))
        .await?;

-    // FIXME: needs to be an error for console to retry it. Ideally Accepted should be used and retried until 404.
    json_response(StatusCode::ACCEPTED, ())
 }

@@ -617,6 +629,23 @@ async fn tenant_status(
    json_response(StatusCode::OK, tenant_info)
 }

+async fn tenant_delete_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    // OpenAPI: see the tenant `delete` operation in openapi_spec.yml.
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let state = get_state(&request);
+
+    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id)
+        .instrument(info_span!("tenant_delete_handler", %tenant_id))
+        .await?;
+
+    json_response(StatusCode::ACCEPTED, ())
+}
+
 /// HTTP endpoint to query the current tenant_size of a tenant.
 ///
 /// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used
@@ -950,14 +979,7 @@ async fn failpoints_handler(

        // We recognize one extra "action" that's not natively recognized
        // by the failpoints crate: exit, to immediately kill the process
-        let cfg_result = if fp.actions == "exit" {
-            fail::cfg_callback(fp.name, || {
-                info!("Exit requested by failpoint");
-                std::process::exit(1);
-            })
-        } else {
-            fail::cfg(fp.name, &fp.actions)
-        };
+        let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions);

        if let Err(err_msg) = cfg_result {
            return Err(ApiError::BadRequest(anyhow!(
@@ -1345,6 +1367,9 @@ pub fn make_router(
        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
+        .delete("/v1/tenant/:tenant_id", |r| {
+            api_handler(r, tenant_delete_handler)
+        })
        .get("/v1/tenant/:tenant_id/synthetic_size", |r| {
            api_handler(r, tenant_size_handler)
        })
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -21,6 +21,8 @@ pub mod walingest;
 pub mod walrecord;
 pub mod walredo;

+pub mod failpoint_support;
+
 use std::path::Path;

 use crate::task_mgr::TaskKind;
@@ -95,28 +97,6 @@ pub async fn shutdown_pageserver(exit_code: i32) {
    std::process::exit(exit_code);
 }

-const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
-const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
-
-async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
-    let backoff_duration_seconds =
-        exponential_backoff_duration_seconds(n, base_increment, max_seconds);
-    if backoff_duration_seconds > 0.0 {
-        info!(
-            "Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
-        );
-        tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
-    }
-}
-
-pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
-    if n == 0 {
-        0.0
-    } else {
-        (1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
-    }
-}
-
 /// The name of the metadata file pageserver creates per timeline.
 /// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
 pub const METADATA_FILE_NAME: &str = "metadata";
@@ -190,7 +170,7 @@ pub struct InitializationOrder {

    /// Each timeline owns a clone of this to be consumed on the initial logical size calculation
    /// attempt. It is important to drop this once the attempt has completed.
-    pub initial_logical_size_attempt: utils::completion::Completion,
+    pub initial_logical_size_attempt: Option<utils::completion::Completion>,

    /// Barrier for when we can start any background jobs.
    ///
@@ -238,37 +218,6 @@ async fn timed<Fut: std::future::Future>(
    }
 }

-#[cfg(test)]
-mod backoff_defaults_tests {
-    use super::*;
-
-    #[test]
-    fn backoff_defaults_produce_growing_backoff_sequence() {
-        let mut current_backoff_value = None;
-
-        for i in 0..10_000 {
-            let new_backoff_value = exponential_backoff_duration_seconds(
-                i,
-                DEFAULT_BASE_BACKOFF_SECONDS,
-                DEFAULT_MAX_BACKOFF_SECONDS,
-            );
-
-            if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
-                assert!(
-                    old_backoff_value <= new_backoff_value,
-                    "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
-                )
-            }
-        }
-
-        assert_eq!(
-            current_backoff_value.expect("Should have produced backoff values to compare"),
-            DEFAULT_MAX_BACKOFF_SECONDS,
-            "Given big enough of retries, backoff should reach its allowed max value"
-        );
-    }
-}
-
 #[cfg(test)]
 mod timed_tests {
    use super::timed;
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -10,6 +10,42 @@
 //! PostgreSQL buffer size, and a Slot struct for each buffer to contain
 //! information about what's stored in the buffer.
 //!
+//! # Types Of Pages
+//!
+//! [`PageCache`] only supports immutable pages.
+//! Hence there is no need to worry about coherency.
+//!
+//! Two types of pages are supported:
+//!
+//! * **Materialized pages**, filled & used by page reconstruction
+//! * **Immutable File pages**, filled & used by [`crate::tenant::block_io`] and [`crate::tenant::ephemeral_file`].
+//!
+//! Note that [`crate::tenant::ephemeral_file::EphemeralFile`] is generally mutable, but, it's append-only.
+//! It uses the page cache only for the blocks that are already fully written and immutable.
+//!
+//! # Filling The Page Cache
+//!
+//! Page cache maps from a cache key to a buffer slot.
+//! The cache key uniquely identifies the piece of data that is being cached.
+//!
+//! The cache key for **materialized pages** is [`TenantId`], [`TimelineId`], [`Key`], and [`Lsn`].
+//! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access.
+//!
+//! The cache key for **immutable file** pages is [`FileId`] and a block number.
+//! Users of page cache that wish to page-cache an arbitrary (immutable!) on-disk file do the following:
+//! * Have a mechanism to deterministically associate the on-disk file with a [`FileId`].
+//! * Get a [`FileId`] using [`next_file_id`].
+//! * Use the mechanism to associate the on-disk file with the returned [`FileId`].
+//! * Use [`PageCache::read_immutable_buf`] to get a [`ReadBufResult`].
+//! * If the page was already cached, it'll be the [`ReadBufResult::Found`] variant that contains
+//!   a read guard for the page. Just use it.
+//! * If the page was not cached, it'll be the [`ReadBufResult::NotFound`] variant that contains
+//!   a write guard for the page. Fill the page with the contents of the on-disk file.
+//!   Then call [`PageWriteGuard::mark_valid`] to mark the page as valid.
+//!   Then try again to [`PageCache::read_immutable_buf`].
+//!   Unless there's high cache pressure, the page should now be cached.
+//!   (TODO: allow downgrading the write guard to a read guard to ensure forward progress.)
+//!
 //! # Locking
 //!
 //! There are two levels of locking involved: There's one lock for the "mapping"
@@ -40,20 +76,18 @@ use std::{
    collections::{hash_map::Entry, HashMap},
    convert::TryInto,
    sync::{
-        atomic::{AtomicU8, AtomicUsize, Ordering},
+        atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
        RwLock, RwLockReadGuard, RwLockWriteGuard, TryLockError,
    },
 };

 use anyhow::Context;
 use once_cell::sync::OnceCell;
-use tracing::error;
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
 };

-use crate::tenant::writeback_ephemeral_file;
 use crate::{metrics::PageCacheSizeMetrics, repository::Key};

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -87,6 +121,17 @@ pub fn get() -> &'static PageCache {
 pub const PAGE_SZ: usize = postgres_ffi::BLCKSZ as usize;
 const MAX_USAGE_COUNT: u8 = 5;

+/// See module-level comment.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
+pub struct FileId(u64);
+
+static NEXT_ID: AtomicU64 = AtomicU64::new(1);
+
+/// See module-level comment.
+pub fn next_file_id() -> FileId {
+    FileId(NEXT_ID.fetch_add(1, Ordering::Relaxed))
+}
+
 ///
 /// CacheKey uniquely identifies a "thing" to cache in the page cache.
 ///
@@ -97,12 +142,8 @@ enum CacheKey {
        hash_key: MaterializedPageHashKey,
        lsn: Lsn,
    },
-    EphemeralPage {
-        file_id: u64,
-        blkno: u32,
-    },
    ImmutableFilePage {
-        file_id: u64,
+        file_id: FileId,
        blkno: u32,
    },
 }
@@ -128,7 +169,6 @@ struct Slot {
 struct SlotInner {
    key: Option<CacheKey>,
    buf: &'static mut [u8; PAGE_SZ],
-    dirty: bool,
 }

 impl Slot {
@@ -177,9 +217,7 @@ pub struct PageCache {
    /// can have a separate mapping map, next to this field.
    materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,

-    ephemeral_page_map: RwLock<HashMap<(u64, u32), usize>>,
-
-    immutable_page_map: RwLock<HashMap<(u64, u32), usize>>,
+    immutable_page_map: RwLock<HashMap<(FileId, u32), usize>>,

    /// The actual buffers with their metadata.
    slots: Box<[Slot]>,
@@ -258,14 +296,6 @@ impl PageWriteGuard<'_> {
        );
        self.valid = true;
    }
-    pub fn mark_dirty(&mut self) {
-        // only ephemeral pages can be dirty ATM.
-        assert!(matches!(
-            self.inner.key,
-            Some(CacheKey::EphemeralPage { .. })
-        ));
-        self.inner.dirty = true;
-    }
 }

 impl Drop for PageWriteGuard<'_> {
@@ -280,7 +310,6 @@ impl Drop for PageWriteGuard<'_> {
            let self_key = self.inner.key.as_ref().unwrap();
            PAGE_CACHE.get().unwrap().remove_mapping(self_key);
            self.inner.key = None;
-            self.inner.dirty = false;
        }
    }
 }
@@ -388,50 +417,16 @@ impl PageCache {
        Ok(())
    }

-    // Section 1.2: Public interface functions for working with Ephemeral pages.
+    // Section 1.2: Public interface functions for working with immutable file pages.

-    pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
-        let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };
-
-        self.lock_for_read(&mut cache_key)
-    }
-
-    pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<WriteBufResult> {
-        let cache_key = CacheKey::EphemeralPage { file_id, blkno };
-
-        self.lock_for_write(&cache_key)
-    }
-
-    /// Immediately drop all buffers belonging to given file, without writeback
-    pub fn drop_buffers_for_ephemeral(&self, drop_file_id: u64) {
-        for slot_idx in 0..self.slots.len() {
-            let slot = &self.slots[slot_idx];
-
-            let mut inner = slot.inner.write().unwrap();
-            if let Some(key) = &inner.key {
-                match key {
-                    CacheKey::EphemeralPage { file_id, blkno: _ } if *file_id == drop_file_id => {
-                        // remove mapping for old buffer
-                        self.remove_mapping(key);
-                        inner.key = None;
-                        inner.dirty = false;
-                    }
-                    _ => {}
-                }
-            }
-        }
-    }
-
-    // Section 1.3: Public interface functions for working with immutable file pages.
-
-    pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
+    pub fn read_immutable_buf(&self, file_id: FileId, blkno: u32) -> anyhow::Result<ReadBufResult> {
        let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };

        self.lock_for_read(&mut cache_key)
    }

-    /// Immediately drop all buffers belonging to given file, without writeback
-    pub fn drop_buffers_for_immutable(&self, drop_file_id: u64) {
+    /// Immediately drop all buffers belonging to given file
+    pub fn drop_buffers_for_immutable(&self, drop_file_id: FileId) {
        for slot_idx in 0..self.slots.len() {
            let slot = &self.slots[slot_idx];

@@ -444,7 +439,6 @@ impl PageCache {
                        // remove mapping for old buffer
                        self.remove_mapping(key);
                        inner.key = None;
-                        inner.dirty = false;
                    }
                    _ => {}
                }
@@ -522,10 +516,6 @@ impl PageCache {
            CacheKey::MaterializedPage { .. } => {
                unreachable!("Materialized pages use lookup_materialized_page")
            }
-            CacheKey::EphemeralPage { .. } => (
-                &crate::metrics::PAGE_CACHE.read_accesses_ephemeral,
-                &crate::metrics::PAGE_CACHE.read_hits_ephemeral,
-            ),
            CacheKey::ImmutableFilePage { .. } => (
                &crate::metrics::PAGE_CACHE.read_accesses_immutable,
                &crate::metrics::PAGE_CACHE.read_hits_immutable,
@@ -566,7 +556,6 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
-            inner.dirty = false;
            slot.usage_count.store(1, Ordering::Relaxed);

            return Ok(ReadBufResult::NotFound(PageWriteGuard {
@@ -628,7 +617,6 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
-            inner.dirty = false;
            slot.usage_count.store(1, Ordering::Relaxed);

            return Ok(WriteBufResult::NotFound(PageWriteGuard {
@@ -667,10 +655,6 @@ impl PageCache {
                *lsn = version.lsn;
                Some(version.slot_idx)
            }
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                let map = self.ephemeral_page_map.read().unwrap();
-                Some(*map.get(&(*file_id, *blkno))?)
-            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let map = self.immutable_page_map.read().unwrap();
                Some(*map.get(&(*file_id, *blkno))?)
@@ -694,10 +678,6 @@ impl PageCache {
                    None
                }
            }
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                let map = self.ephemeral_page_map.read().unwrap();
-                Some(*map.get(&(*file_id, *blkno))?)
-            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let map = self.immutable_page_map.read().unwrap();
                Some(*map.get(&(*file_id, *blkno))?)
@@ -731,12 +711,6 @@ impl PageCache {
                    panic!("could not find old key in mapping")
                }
            }
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                let mut map = self.ephemeral_page_map.write().unwrap();
-                map.remove(&(*file_id, *blkno))
-                    .expect("could not find old key in mapping");
-                self.size_metrics.current_bytes_ephemeral.sub_page_sz(1);
-            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                map.remove(&(*file_id, *blkno))
@@ -776,17 +750,7 @@ impl PageCache {
                    }
                }
            }
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                let mut map = self.ephemeral_page_map.write().unwrap();
-                match map.entry((*file_id, *blkno)) {
-                    Entry::Occupied(entry) => Some(*entry.get()),
-                    Entry::Vacant(entry) => {
-                        entry.insert(slot_idx);
-                        self.size_metrics.current_bytes_ephemeral.add_page_sz(1);
-                        None
-                    }
-                }
-            }
+
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                match map.entry((*file_id, *blkno)) {
@@ -837,25 +801,8 @@ impl PageCache {
                    }
                };
                if let Some(old_key) = &inner.key {
-                    if inner.dirty {
-                        if let Err(err) = Self::writeback(old_key, inner.buf) {
-                            // Writing the page to disk failed.
-                            //
-                            // FIXME: What to do here, when? We could propagate the error to the
-                            // caller, but victim buffer is generally unrelated to the original
-                            // call. It can even belong to a different tenant. Currently, we
-                            // report the error to the log and continue the clock sweep to find
-                            // a different victim. But if the problem persists, the page cache
-                            // could fill up with dirty pages that we cannot evict, and we will
-                            // loop retrying the writebacks indefinitely.
-                            error!("writeback of buffer {:?} failed: {}", old_key, err);
-                            continue;
-                        }
-                    }
-
                    // remove mapping for old buffer
                    self.remove_mapping(old_key);
-                    inner.dirty = false;
                    inner.key = None;
                }
                return Ok((slot_idx, inner));
@@ -863,28 +810,6 @@ impl PageCache {
        }
    }

-    fn writeback(cache_key: &CacheKey, buf: &[u8]) -> Result<(), std::io::Error> {
-        match cache_key {
-            CacheKey::MaterializedPage {
-                hash_key: _,
-                lsn: _,
-            } => Err(std::io::Error::new(
-                std::io::ErrorKind::Other,
-                "unexpected dirty materialized page",
-            )),
-            CacheKey::EphemeralPage { file_id, blkno } => {
-                writeback_ephemeral_file(*file_id, *blkno, buf)
-            }
-            CacheKey::ImmutableFilePage {
-                file_id: _,
-                blkno: _,
-            } => Err(std::io::Error::new(
-                std::io::ErrorKind::Other,
-                "unexpected dirty immutable page",
-            )),
-        }
-    }
-
    /// Initialize a new page cache
    ///
    /// This should be called only once at page server startup.
@@ -895,7 +820,6 @@ impl PageCache {

        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
        size_metrics.max_bytes.set_page_sz(num_pages);
-        size_metrics.current_bytes_ephemeral.set_page_sz(0);
        size_metrics.current_bytes_immutable.set_page_sz(0);
        size_metrics.current_bytes_materialized_page.set_page_sz(0);

@@ -905,11 +829,7 @@ impl PageCache {
                let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();

                Slot {
-                    inner: RwLock::new(SlotInner {
-                        key: None,
-                        buf,
-                        dirty: false,
-                    }),
+                    inner: RwLock::new(SlotInner { key: None, buf }),
                    usage_count: AtomicU8::new(0),
                }
            })
@@ -917,7 +837,6 @@ impl PageCache {

        Self {
            materialized_page_map: Default::default(),
-            ephemeral_page_map: Default::default(),
            immutable_page_map: Default::default(),
            slots,
            next_evict_slot: AtomicUsize::new(0),
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -28,6 +28,8 @@ use std::cmp::min;
 use std::collections::hash_map::Entry;
 use std::collections::BTreeSet;
 use std::collections::HashMap;
+use std::fmt::Debug;
+use std::fmt::Display;
 use std::fs;
 use std::fs::File;
 use std::fs::OpenOptions;
@@ -46,12 +48,15 @@ use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};

 use self::config::TenantConf;
+use self::delete::DeleteTenantFlow;
 use self::metadata::LoadMetadataError;
 use self::metadata::TimelineMetadata;
+use self::mgr::TenantsMap;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineUninitMark;
 use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
+use self::timeline::TimelineResources;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir;
@@ -106,6 +111,7 @@ macro_rules! pausable_failpoint {

 pub mod blob_io;
 pub mod block_io;
+
 pub mod disk_btree;
 pub(crate) mod ephemeral_file;
 pub mod layer_map;
@@ -118,6 +124,7 @@ mod remote_timeline_client;
 pub mod storage_layer;

 pub mod config;
+pub mod delete;
 pub mod mgr;
 pub mod tasks;
 pub mod upload_queue;
@@ -131,9 +138,6 @@ pub use timeline::{
    LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
 };

-// re-export this function so that page_cache.rs can use it.
-pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
-
 // re-export for use in remote_timeline_client.rs
 pub use crate::tenant::metadata::save_metadata;

@@ -145,6 +149,16 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines";

 pub const TENANT_ATTACHING_MARKER_FILENAME: &str = "attaching";

+pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
+
+/// References to shared objects that are passed into each tenant, such
+/// as the shared remote storage client and process initialization state.
+#[derive(Clone)]
+pub struct TenantSharedResources {
+    /// Storage broker channel; `Tenant::spawn_load` hands it to `activate` after a successful load.
+    pub broker_client: storage_broker::BrokerClientChannel,
+    /// Optional remote storage handle; `Tenant::spawn_load` passes a clone of it to `Tenant::new`.
+    pub remote_storage: Option<GenericRemoteStorage>,
+}
+
 ///
 /// Tenant consists of multiple timelines. Keep them in a hash table.
 ///
@@ -183,6 +197,8 @@ pub struct Tenant {
    cached_synthetic_tenant_size: Arc<AtomicU64>,

    eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
+
+    pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
 }

 // We should not blindly overwrite local metadata with remote one.
@@ -274,7 +290,7 @@ pub enum LoadLocalTimelineError {
    ResumeDeletion(#[source] anyhow::Error),
 }

-#[derive(Debug, thiserror::Error)]
+#[derive(thiserror::Error)]
 pub enum DeleteTimelineError {
    #[error("NotFound")]
    NotFound,
@@ -283,17 +299,37 @@ pub enum DeleteTimelineError {
    HasChildren(Vec<TimelineId>),

    #[error("Timeline deletion is already in progress")]
-    AlreadyInProgress,
+    AlreadyInProgress(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>),

    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }

+impl Debug for DeleteTimelineError {
+    /// Hand-written `Debug` formatting: the payload of `AlreadyInProgress` is
+    /// skipped, so that variant is rendered as its bare name.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            // Variants printed without any payload.
+            Self::NotFound => f.write_str("NotFound"),
+            Self::AlreadyInProgress(_) => f.write_str("AlreadyInProgress"),
+            // Variants printed as a one-field tuple.
+            Self::HasChildren(ids) => f.debug_tuple("HasChildren").field(ids).finish(),
+            Self::Other(source) => f.debug_tuple("Other").field(source).finish(),
+        }
+    }
+}
+
 pub enum SetStoppingError {
    AlreadyStopping(completion::Barrier),
    Broken,
 }

+impl Debug for SetStoppingError {
+    /// Prints only the variant name; the barrier carried by `AlreadyStopping`
+    /// is not included in the output.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let variant_name = match self {
+            Self::Broken => "Broken",
+            Self::AlreadyStopping(_) => "AlreadyStopping",
+        };
+        f.write_str(variant_name)
+    }
+}
+
 struct RemoteStartupData {
    index_part: IndexPart,
    remote_metadata: TimelineMetadata,
@@ -362,7 +398,7 @@ impl Tenant {
    async fn timeline_init_and_sync(
        &self,
        timeline_id: TimelineId,
-        remote_client: Option<RemoteTimelineClient>,
+        resources: TimelineResources,
        remote_startup_data: Option<RemoteStartupData>,
        local_metadata: Option<TimelineMetadata>,
        ancestor: Option<Arc<Timeline>>,
@@ -383,7 +419,7 @@ impl Tenant {
            timeline_id,
            up_to_date_metadata,
            ancestor.clone(),
-            remote_client,
+            resources,
            init_order,
            CreateTimelineCause::Load,
        )?;
@@ -473,6 +509,7 @@ impl Tenant {
        conf: &'static PageServerConf,
        tenant_id: TenantId,
        broker_client: storage_broker::BrokerClientChannel,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        remote_storage: GenericRemoteStorage,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Tenant>> {
@@ -487,7 +524,7 @@ impl Tenant {
            tenant_conf,
            wal_redo_manager,
            tenant_id,
-            Some(remote_storage),
+            Some(remote_storage.clone()),
        ));

        // Do all the hard work in the background
@@ -502,17 +539,61 @@ impl Tenant {
            "attach tenant",
            false,
            async move {
+                // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
+                let make_broken = |t: &Tenant, err: anyhow::Error| {
+                    error!("attach failed, setting tenant state to Broken: {err:?}");
+                    t.state.send_modify(|state| {
+                        assert_eq!(
+                            *state,
+                            TenantState::Attaching,
+                            "the attach task owns the tenant state until activation is complete"
+                        );
+                        *state = TenantState::broken_from_reason(err.to_string());
+                    });
+                };
+
+                let pending_deletion = {
+                    match DeleteTenantFlow::should_resume_deletion(
+                        conf,
+                        Some(&remote_storage),
+                        &tenant_clone,
+                    )
+                    .await
+                    {
+                        Ok(should_resume_deletion) => should_resume_deletion,
+                        Err(err) => {
+                            make_broken(&tenant_clone, anyhow::anyhow!(err));
+                            return Ok(());
+                        }
+                    }
+                };
+
+                info!("pending_deletion {}", pending_deletion.is_some());
+
+                if let Some(deletion) = pending_deletion {
+                    match DeleteTenantFlow::resume_from_attach(
+                        deletion,
+                        &tenant_clone,
+                        tenants,
+                        &ctx,
+                    )
+                    .await
+                    {
+                        Err(err) => {
+                            make_broken(&tenant_clone, anyhow::anyhow!(err));
+                            return Ok(());
+                        }
+                        Ok(()) => return Ok(()),
+                    }
+                }
+
                match tenant_clone.attach(&ctx).await {
                    Ok(()) => {
                        info!("attach finished, activating");
                        tenant_clone.activate(broker_client, None, &ctx);
                    }
                    Err(e) => {
-                        error!("attach failed, setting tenant state to Broken: {:?}", e);
-                        tenant_clone.state.send_modify(|state| {
-                            assert_eq!(*state, TenantState::Attaching, "the attach task owns the tenant state until activation is complete");
-                            *state = TenantState::broken_from_reason(e.to_string());
-                        });
+                        make_broken(&tenant_clone, anyhow::anyhow!(e));
                    }
                }
                Ok(())
@@ -590,6 +671,9 @@ impl Tenant {
                .instrument(info_span!("download_index_part", %timeline_id)),
            );
        }
+
+        let mut timelines_to_resume_deletions = vec![];
+
        // Wait for all the download tasks to complete & collect results.
        let mut remote_index_and_client = HashMap::new();
        let mut timeline_ancestors = HashMap::new();
@@ -606,9 +690,12 @@ impl Tenant {
                    );
                    remote_index_and_client.insert(timeline_id, (index_part, client));
                }
-                MaybeDeletedIndexPart::Deleted(_) => {
-                    info!("timeline {} is deleted, skipping", timeline_id);
-                    continue;
+                MaybeDeletedIndexPart::Deleted(index_part) => {
+                    info!(
+                        "timeline {} is deleted, picking to resume deletion",
+                        timeline_id
+                    );
+                    timelines_to_resume_deletions.push((timeline_id, index_part, client));
                }
            }
        }
@@ -616,21 +703,48 @@ impl Tenant {
        // For every timeline, download the metadata file, scan the local directory,
        // and build a layer map that contains an entry for each remote and local
        // layer file.
-        let sorted_timelines = tree_sort_timelines(timeline_ancestors)?;
+        let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?;
        for (timeline_id, remote_metadata) in sorted_timelines {
            let (index_part, remote_client) = remote_index_and_client
                .remove(&timeline_id)
                .expect("just put it in above");

            // TODO again handle early failure
-            self.load_remote_timeline(timeline_id, index_part, remote_metadata, remote_client, ctx)
-                .await
-                .with_context(|| {
-                    format!(
-                        "failed to load remote timeline {} for tenant {}",
-                        timeline_id, self.tenant_id
-                    )
-                })?;
+            self.load_remote_timeline(
+                timeline_id,
+                index_part,
+                remote_metadata,
+                TimelineResources {
+                    remote_client: Some(remote_client),
+                },
+                ctx,
+            )
+            .await
+            .with_context(|| {
+                format!(
+                    "failed to load remote timeline {} for tenant {}",
+                    timeline_id, self.tenant_id
+                )
+            })?;
+        }
+
+        // Walk through deleted timelines, resume deletion
+        for (timeline_id, index_part, remote_timeline_client) in timelines_to_resume_deletions {
+            remote_timeline_client
+                .init_upload_queue_stopped_to_continue_deletion(&index_part)
+                .context("init queue stopped")
+                .map_err(LoadLocalTimelineError::ResumeDeletion)?;
+
+            DeleteTimelineFlow::resume_deletion(
+                Arc::clone(self),
+                timeline_id,
+                &index_part.parse_metadata().context("parse_metadata")?,
+                Some(remote_timeline_client),
+                None,
+            )
+            .await
+            .context("resume_deletion")
+            .map_err(LoadLocalTimelineError::ResumeDeletion)?;
        }

        std::fs::remove_file(&marker_file)
@@ -638,7 +752,7 @@ impl Tenant {
        crashsafe::fsync(marker_file.parent().expect("marker file has parent dir"))
            .context("fsync tenant directory after unlinking attach marker file")?;

-        utils::failpoint_sleep_millis_async!("attach-before-activate");
+        crate::failpoint_support::sleep_millis_async!("attach-before-activate");

        info!("Done");

@@ -666,7 +780,7 @@ impl Tenant {
        timeline_id: TimelineId,
        index_part: IndexPart,
        remote_metadata: TimelineMetadata,
-        remote_client: RemoteTimelineClient,
+        resources: TimelineResources,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        span::debug_assert_current_span_has_tenant_id();
@@ -696,7 +810,7 @@ impl Tenant {

        self.timeline_init_and_sync(
            timeline_id,
-            Some(remote_client),
+            resources,
            Some(RemoteStartupData {
                index_part,
                remote_metadata,
@@ -740,12 +854,12 @@ impl Tenant {
    /// If the loading fails for some reason, the Tenant will go into Broken
    /// state.
    #[instrument(skip_all, fields(tenant_id=%tenant_id))]
-    pub fn spawn_load(
+    pub(crate) fn spawn_load(
        conf: &'static PageServerConf,
        tenant_id: TenantId,
-        broker_client: storage_broker::BrokerClientChannel,
-        remote_storage: Option<GenericRemoteStorage>,
+        resources: TenantSharedResources,
        init_order: Option<InitializationOrder>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        ctx: &RequestContext,
    ) -> Arc<Tenant> {
        span::debug_assert_current_span_has_tenant_id();
@@ -758,6 +872,9 @@ impl Tenant {
            }
        };

+        let broker_client = resources.broker_client;
+        let remote_storage = resources.remote_storage;
+
        let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
        let tenant = Tenant::new(
            TenantState::Loading,
@@ -765,7 +882,7 @@ impl Tenant {
            tenant_conf,
            wal_redo_manager,
            tenant_id,
-            remote_storage,
+            remote_storage.clone(),
        );
        let tenant = Arc::new(tenant);

@@ -781,27 +898,84 @@ impl Tenant {
            "initial tenant load",
            false,
            async move {
+                // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
+                let make_broken = |t: &Tenant, err: anyhow::Error| {
+                    error!("load failed, setting tenant state to Broken: {err:?}");
+                    t.state.send_modify(|state| {
+                        assert!(
+                            matches!(*state, TenantState::Loading | TenantState::Stopping { .. }),
+                            "the loading task owns the tenant state until activation is complete"
+                        );
+                        *state = TenantState::broken_from_reason(err.to_string());
+                    });
+                };
+
                let mut init_order = init_order;

                // take the completion because initial tenant loading will complete when all of
                // these tasks complete.
-                let _completion = init_order.as_mut().and_then(|x| x.initial_tenant_load.take());
+                let _completion = init_order
+                    .as_mut()
+                    .and_then(|x| x.initial_tenant_load.take());
+
+                // Don't block pageserver startup on figuring out deletion status
+                let pending_deletion = {
+                    match DeleteTenantFlow::should_resume_deletion(
+                        conf,
+                        remote_storage.as_ref(),
+                        &tenant_clone,
+                    )
+                    .await
+                    {
+                        Ok(should_resume_deletion) => should_resume_deletion,
+                        Err(err) => {
+                            make_broken(&tenant_clone, anyhow::anyhow!(err));
+                            return Ok(());
+                        }
+                    }
+                };
+
+                info!("pending deletion {}", pending_deletion.is_some());
+
+                if let Some(deletion) = pending_deletion {
+                    // as we are no longer loading, signal completion by dropping
+                    // the completion while we resume deletion
+                    drop(_completion);
+                    // do not hold on to initial_logical_size_attempt, as that would prevent loading from proceeding without a timeout
+                    let _ = init_order
+                        .as_mut()
+                        .and_then(|x| x.initial_logical_size_attempt.take());
+
+                    match DeleteTenantFlow::resume_from_load(
+                        deletion,
+                        &tenant_clone,
+                        init_order.as_ref(),
+                        tenants,
+                        &ctx,
+                    )
+                    .await
+                    {
+                        Err(err) => {
+                            make_broken(&tenant_clone, anyhow::anyhow!(err));
+                            return Ok(());
+                        }
+                        Ok(()) => return Ok(()),
+                    }
+                }
+
+                let background_jobs_can_start =
+                    init_order.as_ref().map(|x| &x.background_jobs_can_start);

                match tenant_clone.load(init_order.as_ref(), &ctx).await {
                    Ok(()) => {
-                        debug!("load finished, activating");
-                        let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
+                        debug!("load finished");
+
                        tenant_clone.activate(broker_client, background_jobs_can_start, &ctx);
                    }
-                    Err(err) => {
-                        error!("load failed, setting tenant state to Broken: {err:?}");
-                        tenant_clone.state.send_modify(|state| {
-                            assert_eq!(*state, TenantState::Loading, "the loading task owns the tenant state until activation is complete");
-                            *state = TenantState::broken_from_reason(err.to_string());
-                        });
-                    }
+                    Err(err) => make_broken(&tenant_clone, err),
                }
-               Ok(())
+
+                Ok(())
            }
            .instrument({
                let span = tracing::info_span!(parent: None, "load", tenant_id=%tenant_id);
@@ -877,6 +1051,8 @@ impl Tenant {
                        )
                    })?;

+                info!("Found deletion mark for timeline {}", timeline_id);
+
                match load_metadata(self.conf, &self.tenant_id, &timeline_id) {
                    Ok(metadata) => {
                        timelines_to_resume_deletion.push((timeline_id, Some(metadata)))
@@ -966,9 +1142,11 @@ impl Tenant {

        // Sort the array of timeline IDs into tree-order, so that parent comes before
        // all its children.
-        tree_sort_timelines(timelines_to_load).map(|sorted_timelines| TenantDirectoryScan {
-            sorted_timelines_to_load: sorted_timelines,
-            timelines_to_resume_deletion,
+        tree_sort_timelines(timelines_to_load, |m| m.ancestor_timeline()).map(|sorted_timelines| {
+            TenantDirectoryScan {
+                sorted_timelines_to_load: sorted_timelines,
+                timelines_to_resume_deletion,
+            }
        })
    }

@@ -986,7 +1164,7 @@ impl Tenant {

        debug!("loading tenant task");

-        utils::failpoint_sleep_millis_async!("before-loading-tenant");
+        crate::failpoint_support::sleep_millis_async!("before-loading-tenant");

        // Load in-memory state to reflect the local files on disk
        //
@@ -1014,8 +1192,9 @@ impl Tenant {
            {
                match e {
                    LoadLocalTimelineError::Load(source) => {
-                        return Err(anyhow::anyhow!(source)
-                            .context("Failed to load local timeline: {timeline_id}"))
+                        return Err(anyhow::anyhow!(source)).with_context(|| {
+                            format!("Failed to load local timeline: {timeline_id}")
+                        })
                    }
                    LoadLocalTimelineError::ResumeDeletion(source) => {
                        // Make sure resumed deletion wont fail loading for entire tenant.
@@ -1081,16 +1260,9 @@ impl Tenant {
    ) -> Result<(), LoadLocalTimelineError> {
        span::debug_assert_current_span_has_tenant_id();

-        let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
-            RemoteTimelineClient::new(
-                remote_storage.clone(),
-                self.conf,
-                self.tenant_id,
-                timeline_id,
-            )
-        });
+        let mut resources = self.build_timeline_resources(timeline_id);

-        let (remote_startup_data, remote_client) = match remote_client {
+        let (remote_startup_data, remote_client) = match resources.remote_client {
            Some(remote_client) => match remote_client.download_index_file().await {
                Ok(index_part) => {
                    let index_part = match index_part {
@@ -1178,9 +1350,10 @@ impl Tenant {
                    return Ok(());
                }

-                (None, remote_client)
+                (None, resources.remote_client)
            }
        };
+        resources.remote_client = remote_client;

        let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
            let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
@@ -1193,7 +1366,7 @@ impl Tenant {

        self.timeline_init_and_sync(
            timeline_id,
-            remote_client,
+            resources,
            remote_startup_data,
            Some(local_metadata),
            ancestor,
@@ -1682,7 +1855,7 @@ impl Tenant {
        // It's mesed up.
        // we just ignore the failure to stop

-        match self.set_stopping(shutdown_progress).await {
+        match self.set_stopping(shutdown_progress, false, false).await {
            Ok(()) => {}
            Err(SetStoppingError::Broken) => {
                // assume that this is acceptable
@@ -1722,18 +1895,28 @@ impl Tenant {
    /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state.
    ///
    /// This function is not cancel-safe!
-    async fn set_stopping(&self, progress: completion::Barrier) -> Result<(), SetStoppingError> {
+    ///
+    /// `allow_transition_from_loading` is needed for the special case of loading task deleting the tenant.
+    /// `allow_transition_from_attaching` is needed for the special case of attaching deleted tenant.
+    async fn set_stopping(
+        &self,
+        progress: completion::Barrier,
+        allow_transition_from_loading: bool,
+        allow_transition_from_attaching: bool,
+    ) -> Result<(), SetStoppingError> {
        let mut rx = self.state.subscribe();

        // cannot stop before we're done activating, so wait out until we're done activating
        rx.wait_for(|state| match state {
-            TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
+            TenantState::Attaching if allow_transition_from_attaching => true,
+            TenantState::Activating(_) | TenantState::Attaching => {
                info!(
                    "waiting for {} to turn Active|Broken|Stopping",
                    <&'static str>::from(state)
                );
                false
            }
+            TenantState::Loading => allow_transition_from_loading,
            TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
        })
        .await
@@ -1742,8 +1925,22 @@ impl Tenant {
        // we now know we're done activating, let's see whether this task is the winner to transition into Stopping
        let mut err = None;
        let stopping = self.state.send_if_modified(|current_state| match current_state {
-            TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
-                unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
+            TenantState::Activating(_) => {
+                unreachable!("1we ensured above that we're done with activation, and, there is no re-activation")
+            }
+            TenantState::Attaching => {
+                if !allow_transition_from_attaching {
+                    unreachable!("2we ensured above that we're done with activation, and, there is no re-activation")
+                };
+                *current_state = TenantState::Stopping { progress };
+                true
+            }
+            TenantState::Loading => {
+                if !allow_transition_from_loading {
+                    unreachable!("3we ensured above that we're done with activation, and, there is no re-activation")
+                };
+                *current_state = TenantState::Stopping { progress };
+                true
            }
            TenantState::Active => {
                // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
@@ -1813,6 +2010,11 @@ impl Tenant {
        .expect("cannot drop self.state while on a &self method");

        // we now know we're done activating, let's see whether this task is the winner to transition into Broken
+        self.set_broken_no_wait(reason)
+    }
+
+    pub(crate) fn set_broken_no_wait(&self, reason: impl Display) {
+        let reason = reason.to_string();
        self.state.send_modify(|current_state| {
            match *current_state {
                TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
@@ -1878,22 +2080,28 @@ impl Tenant {
 /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
 /// perform a topological sort, so that the parent of each timeline comes
 /// before the children.
-fn tree_sort_timelines(
-    timelines: HashMap<TimelineId, TimelineMetadata>,
-) -> anyhow::Result<Vec<(TimelineId, TimelineMetadata)>> {
+/// E extracts the ancestor from T
+/// This allows for T to be different. It can be TimelineMetadata, can be Timeline itself, etc.
+fn tree_sort_timelines<T, E>(
+    timelines: HashMap<TimelineId, T>,
+    extractor: E,
+) -> anyhow::Result<Vec<(TimelineId, T)>>
+where
+    E: Fn(&T) -> Option<TimelineId>,
+{
    let mut result = Vec::with_capacity(timelines.len());

    let mut now = Vec::with_capacity(timelines.len());
    // (ancestor, children)
-    let mut later: HashMap<TimelineId, Vec<(TimelineId, TimelineMetadata)>> =
+    let mut later: HashMap<TimelineId, Vec<(TimelineId, T)>> =
        HashMap::with_capacity(timelines.len());

-    for (timeline_id, metadata) in timelines {
-        if let Some(ancestor_id) = metadata.ancestor_timeline() {
+    for (timeline_id, value) in timelines {
+        if let Some(ancestor_id) = extractor(&value) {
            let children = later.entry(ancestor_id).or_default();
-            children.push((timeline_id, metadata));
+            children.push((timeline_id, value));
        } else {
-            now.push((timeline_id, metadata));
+            now.push((timeline_id, value));
        }
    }

@@ -2030,7 +2238,7 @@ impl Tenant {
        new_timeline_id: TimelineId,
        new_metadata: &TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
-        remote_client: Option<RemoteTimelineClient>,
+        resources: TimelineResources,
        init_order: Option<&InitializationOrder>,
        cause: CreateTimelineCause,
    ) -> anyhow::Result<Arc<Timeline>> {
@@ -2059,10 +2267,10 @@ impl Tenant {
            new_timeline_id,
            self.tenant_id,
            Arc::clone(&self.walredo_mgr),
-            remote_client,
+            resources,
            pg_version,
            initial_logical_size_can_start.cloned(),
-            initial_logical_size_attempt.cloned(),
+            initial_logical_size_attempt.cloned().flatten(),
            state,
        );

@@ -2146,6 +2354,7 @@ impl Tenant {
            cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
            cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
            eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
+            delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
        }
    }

@@ -2162,6 +2371,7 @@ impl Tenant {
        // FIXME If the config file is not found, assume that we're attaching
        // a detached tenant and config is passed via attach command.
        // https://github.com/neondatabase/neon/issues/1555
+        // OR: we're loading after incomplete deletion that managed to remove config.
        if !target_config_path.exists() {
            info!("tenant config not found in {target_config_display}");
            return Ok(TenantConfOpt::default());
@@ -2299,7 +2509,9 @@ impl Tenant {
            .refresh_gc_info_internal(target_timeline_id, horizon, pitr, ctx)
            .await?;

-        utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
+        crate::failpoint_support::sleep_millis_async!(
+            "gc_iteration_internal_after_getting_gc_timelines"
+        );

        // If there is nothing to GC, we don't want any messages in the INFO log.
        if !gc_timelines.is_empty() {
@@ -2703,6 +2915,23 @@ impl Tenant {
        Ok(timeline)
    }

+    /// Call this before constructing a timeline, to build its required structures
+    fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources {
+        let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
+            let remote_client = RemoteTimelineClient::new(
+                remote_storage.clone(),
+                self.conf,
+                self.tenant_id,
+                timeline_id,
+            );
+            Some(remote_client)
+        } else {
+            None
+        };
+
+        TimelineResources { remote_client }
+    }
+
    /// Creates intermediate timeline structure and its files.
    ///
    /// An empty layer map is initialized, and new data and WAL can be imported starting
@@ -2719,25 +2948,17 @@ impl Tenant {
    ) -> anyhow::Result<UninitializedTimeline> {
        let tenant_id = self.tenant_id;

-        let remote_client = if let Some(remote_storage) = self.remote_storage.as_ref() {
-            let remote_client = RemoteTimelineClient::new(
-                remote_storage.clone(),
-                self.conf,
-                tenant_id,
-                new_timeline_id,
-            );
+        let resources = self.build_timeline_resources(new_timeline_id);
+        if let Some(remote_client) = &resources.remote_client {
            remote_client.init_upload_queue_for_empty_remote(new_metadata)?;
-            Some(remote_client)
-        } else {
-            None
-        };
+        }

        let timeline_struct = self
            .create_timeline_struct(
                new_timeline_id,
                new_metadata,
                ancestor,
-                remote_client,
+                resources,
                None,
                CreateTimelineCause::Load,
            )
@@ -3810,6 +4031,31 @@ mod tests {
        Ok(())
    }

+    #[tokio::test]
+    async fn delta_layer_dumping() -> anyhow::Result<()> {
+        let (tenant, ctx) = TenantHarness::create("test_layer_dumping")?.load().await;
+        let tline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+
+        let layer_map = tline.layers.read().await;
+        let level0_deltas = layer_map.layer_map().get_level0_deltas()?;
+
+        assert!(!level0_deltas.is_empty());
+
+        for delta in level0_deltas {
+            let delta = layer_map.get_from_desc(&delta);
+            // Ensure we are dumping a delta layer here
+            let delta = delta.downcast_delta_layer().unwrap();
+
+            delta.dump(false, &ctx).await.unwrap();
+            delta.dump(true, &ctx).await.unwrap();
+        }
+
+        Ok(())
+    }
+
    #[tokio::test]
    async fn corrupt_metadata() -> anyhow::Result<()> {
        const TEST_NAME: &str = "corrupt_metadata";
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -21,14 +21,14 @@ where
    R: BlockReader,
 {
    /// Read a blob into a new buffer.
-    pub fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
+    pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
        let mut buf = Vec::new();
-        self.read_blob_into_buf(offset, &mut buf)?;
+        self.read_blob_into_buf(offset, &mut buf).await?;
        Ok(buf)
    }
    /// Read blob into the given buffer. Any previous contents in the buffer
    /// are overwritten.
-    pub fn read_blob_into_buf(
+    pub async fn read_blob_into_buf(
        &self,
        offset: u64,
        dstbuf: &mut Vec<u8>,
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -2,12 +2,10 @@
 //! Low-level Block-oriented I/O functions
 //!

-use crate::page_cache;
-use crate::page_cache::{ReadBufResult, PAGE_SZ};
+use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
 use bytes::Bytes;
 use std::ops::{Deref, DerefMut};
 use std::os::unix::fs::FileExt;
-use std::sync::atomic::AtomicU64;

 /// This is implemented by anything that can read 8 kB (PAGE_SZ)
 /// blocks, using the page cache
@@ -15,14 +13,12 @@ use std::sync::atomic::AtomicU64;
 /// There are currently two implementations: EphemeralFile, and FileBlockReader
 /// below.
 pub trait BlockReader {
-    type BlockLease: Deref<Target = [u8; PAGE_SZ]> + 'static;
-
    ///
    /// Read a block. Returns a "lease" object that can be used to
    /// access to the contents of the page. (For the page cache, the
    /// lease object represents a lock on the buffer.)
    ///
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error>;
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error>;

    ///
    /// Create a new "cursor" for reading from this reader.
@@ -41,13 +37,45 @@ impl<B> BlockReader for &B
 where
    B: BlockReader,
 {
-    type BlockLease = B::BlockLease;
-
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        (*self).read_blk(blknum)
    }
 }

+/// Reference to an in-memory copy of an immutable on-disk block.
+pub enum BlockLease<'a> {
+    PageReadGuard(PageReadGuard<'static>),
+    EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
+    #[cfg(test)]
+    Rc(std::rc::Rc<[u8; PAGE_SZ]>),
+}
+
+impl From<PageReadGuard<'static>> for BlockLease<'static> {
+    fn from(value: PageReadGuard<'static>) -> BlockLease<'static> {
+        BlockLease::PageReadGuard(value)
+    }
+}
+
+#[cfg(test)]
+impl<'a> From<std::rc::Rc<[u8; PAGE_SZ]>> for BlockLease<'a> {
+    fn from(value: std::rc::Rc<[u8; PAGE_SZ]>) -> Self {
+        BlockLease::Rc(value)
+    }
+}
+
+impl<'a> Deref for BlockLease<'a> {
+    type Target = [u8; PAGE_SZ];
+
+    fn deref(&self) -> &Self::Target {
+        match self {
+            BlockLease::PageReadGuard(v) => v.deref(),
+            BlockLease::EphemeralFileMutableTail(v) => v,
+            #[cfg(test)]
+            BlockLease::Rc(v) => v.deref(),
+        }
+    }
+}
+
 ///
 /// A "cursor" for efficiently reading multiple pages from a BlockReader
 ///
@@ -80,11 +108,10 @@ where
        BlockCursor { reader }
    }

-    pub fn read_blk(&self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
+    pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        self.reader.read_blk(blknum)
    }
 }
-static NEXT_ID: AtomicU64 = AtomicU64::new(1);

 /// An adapter for reading a (virtual) file using the page cache.
 ///
@@ -94,7 +121,7 @@ pub struct FileBlockReader<F> {
    pub file: F,

    /// Unique ID of this file, used as key in the page cache.
-    file_id: u64,
+    file_id: page_cache::FileId,
 }

 impl<F> FileBlockReader<F>
@@ -102,7 +129,7 @@ where
    F: FileExt,
 {
    pub fn new(file: F) -> Self {
-        let file_id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        let file_id = page_cache::next_file_id();

        FileBlockReader { file_id, file }
    }
@@ -118,10 +145,7 @@ impl<F> BlockReader for FileBlockReader<F>
 where
    F: FileExt,
 {
-    type BlockLease = page_cache::PageReadGuard<'static>;
-
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
-        // Look up the right page
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        let cache = page_cache::get();
        loop {
            match cache
@@ -132,7 +156,7 @@ where
                        format!("Failed to read immutable buf: {e:#}"),
                    )
                })? {
-                ReadBufResult::Found(guard) => break Ok(guard),
+                ReadBufResult::Found(guard) => break Ok(guard.into()),
                ReadBufResult::NotFound(mut write_guard) => {
                    // Read the page from disk into the buffer
                    self.fill_buffer(write_guard.deref_mut(), blknum)?;
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -0,0 +1,610 @@
+use std::{
+    path::{Path, PathBuf},
+    sync::Arc,
+};
+
+use anyhow::Context;
+use pageserver_api::models::TenantState;
+use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
+use tokio::sync::OwnedMutexGuard;
+use tracing::{error, info, instrument, warn, Instrument, Span};
+
+use utils::{
+    backoff, completion, crashsafe, fs_ext,
+    id::{TenantId, TimelineId},
+};
+
+use crate::{
+    config::PageServerConf,
+    context::RequestContext,
+    task_mgr::{self, TaskKind},
+    InitializationOrder,
+};
+
+use super::{
+    mgr::{GetTenantError, TenantsMap},
+    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
+    span,
+    timeline::delete::DeleteTimelineFlow,
+    tree_sort_timelines, DeleteTimelineError, Tenant,
+};
+
+const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;
+
+#[derive(Debug, thiserror::Error)]
+pub enum DeleteTenantError {
+    #[error("GetTenant {0}")]
+    Get(#[from] GetTenantError),
+
+    #[error("Invalid state {0}. Expected Active or Broken")]
+    InvalidState(TenantState),
+
+    #[error("Tenant deletion is already in progress")]
+    AlreadyInProgress,
+
+    #[error("Timeline {0}")]
+    Timeline(#[from] DeleteTimelineError),
+
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
+
+fn remote_tenant_delete_mark_path(
+    conf: &PageServerConf,
+    tenant_id: &TenantId,
+) -> anyhow::Result<RemotePath> {
+    let tenant_remote_path = conf
+        .tenant_path(tenant_id)
+        .strip_prefix(&conf.workdir)
+        .context("Failed to strip workdir prefix")
+        .and_then(RemotePath::new)
+        .context("tenant path")?;
+    Ok(tenant_remote_path.join(Path::new("deleted")))
+}
+
+async fn create_remote_delete_mark(
+    conf: &PageServerConf,
+    remote_storage: &GenericRemoteStorage,
+    tenant_id: &TenantId,
+) -> Result<(), DeleteTenantError> {
+    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;
+
+    let data: &[u8] = &[];
+    backoff::retry(
+        || async {
+            remote_storage
+                .upload(data, 0, &remote_mark_path, None)
+                .await
+        },
+        |_e| false,
+        FAILED_UPLOAD_WARN_THRESHOLD,
+        FAILED_REMOTE_OP_RETRIES,
+        "mark_upload",
+    )
+    .await
+    .context("mark_upload")?;
+
+    Ok(())
+}
+
+async fn create_local_delete_mark(
+    conf: &PageServerConf,
+    tenant_id: &TenantId,
+) -> Result<(), DeleteTenantError> {
+    let marker_path = conf.tenant_deleted_mark_file_path(tenant_id);
+
+    // Note: an already existing marker is fine: `create(true)` just opens it (no truncate, no `create_new`).
+    let _ = std::fs::OpenOptions::new()
+        .write(true)
+        .create(true)
+        .open(&marker_path)
+        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
+
+    crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
+
+    Ok(())
+}
+
+async fn schedule_ordered_timeline_deletions(
+    tenant: &Arc<Tenant>,
+) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
+    // Tenant is stopping at this point. We know it will be deleted.
+    // No new timelines should be created.
+    // Tree-sort timelines so that they are deleted from the leaves towards the root.
+    // NOTE: cloning the map releases the mutex right away, which opens a race: a pending deletion
+    // can complete and remove its timeline from the map between our call to clone
+    // and `DeleteTimelineFlow::run`, so `run` won't find the timeline in the `timelines` map.
+    // `timelines.lock` is currently synchronous, so we can't hold it across an await point.
+    // So we just ignore a NotFound error if we get one from `run`.
+    // Beware: if the lock ever becomes async and is held here, `run` also takes it, which can deadlock.
+    let timelines = tenant.timelines.lock().unwrap().clone();
+    let sorted =
+        tree_sort_timelines(timelines, |t| t.get_ancestor_timeline_id()).context("tree sort")?;
+
+    let mut already_running_deletions = vec![];
+
+    for (timeline_id, _) in sorted.into_iter().rev() {
+        if let Err(e) = DeleteTimelineFlow::run(tenant, timeline_id, true).await {
+            match e {
+                DeleteTimelineError::NotFound => {
+                    // The timeline deletion finished after the call to clone above but before the
+                    // call to `DeleteTimelineFlow::run`, and removed the timeline from the map.
+                    continue;
+                }
+                DeleteTimelineError::AlreadyInProgress(guard) => {
+                    already_running_deletions.push((guard, timeline_id));
+                    continue;
+                }
+                e => return Err(DeleteTenantError::Timeline(e)),
+            }
+        }
+    }
+
+    Ok(already_running_deletions)
+}
+
+async fn ensure_timelines_dir_empty(timelines_path: &Path) -> Result<(), DeleteTenantError> {
+    // Sanity check: fail, naming at most 10 leftover entries, if the timelines dir is not empty.
+    if !fs_ext::is_directory_empty(timelines_path).await? {
+        let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?;
+        let list = &list[..list.len().min(10)]; // clamp: a bare `[..10]` panics on < 10 entries
+        return Err(DeleteTenantError::Other(anyhow::anyhow!(
+            "Timelines directory is not empty after all timelines deletion: {list:?}"
+        )));
+    }
+
+    Ok(())
+}
+
+async fn remove_tenant_remote_delete_mark(
+    conf: &PageServerConf,
+    remote_storage: Option<&GenericRemoteStorage>,
+    tenant_id: &TenantId,
+) -> Result<(), DeleteTenantError> {
+    if let Some(remote_storage) = remote_storage {
+        let path = remote_tenant_delete_mark_path(conf, tenant_id)?;
+        backoff::retry(
+            || async { remote_storage.delete(&path).await },
+            |_e| false,
+            FAILED_UPLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "remove_tenant_remote_delete_mark",
+        )
+        .await
+        .context("remove_tenant_remote_delete_mark")?;
+    }
+    Ok(())
+}
+
+// Clean up remaining fs traces, in this order: tenant config, timelines dir, local delete mark, tenant dir.
+async fn cleanup_remaining_fs_traces(
+    conf: &PageServerConf,
+    tenant_id: &TenantId,
+) -> Result<(), DeleteTenantError> {
+    let rm = |p: PathBuf, is_dir: bool| async move {
+        if is_dir {
+            tokio::fs::remove_dir(&p).await
+        } else {
+            tokio::fs::remove_file(&p).await
+        }
+        .or_else(fs_ext::ignore_not_found)
+        .with_context(|| {
+            let to_display = p.display();
+            format!("failed to delete {to_display}")
+        })
+    };
+
+    rm(conf.tenant_config_path(tenant_id), false).await?;
+
+    fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: tenant-delete-before-remove-timelines-dir"
+        ))?
+    });
+
+    rm(conf.timelines_path(tenant_id), true).await?;
+
+    fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: tenant-delete-before-remove-deleted-mark"
+        ))?
+    });
+
+    // Make sure the previous deletions are ordered before the mark removal.
+    // Otherwise there is no guarantee that they reach the disk before the mark deletion:
+    // it would be possible for the mark removal to reach the disk first and for the other
+    // deletions to be reordered after it, and thus be missed if a crash occurs.
+    // Note that we don't need to sync after the mark file is removed,
+    // because we can tolerate the mark file reappearing on startup.
+    let tenant_path = &conf.tenant_path(tenant_id);
+    if tenant_path.exists() {
+        crashsafe::fsync_async(&conf.tenant_path(tenant_id))
+            .await
+            .context("fsync_pre_mark_remove")?;
+    }
+
+    rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
+
+    fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
+        Err(anyhow::anyhow!(
+            "failpoint: tenant-delete-before-remove-tenant-dir"
+        ))?
+    });
+
+    rm(conf.tenant_path(tenant_id), true).await?;
+
+    Ok(())
+}
+
+pub(crate) async fn remote_delete_mark_exists(
+    conf: &PageServerConf,
+    tenant_id: &TenantId,
+    remote_storage: &GenericRemoteStorage,
+) -> anyhow::Result<bool> {
+    // Probe remote storage for the mark: a successful download => true, NotFound => false.
+    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id).context("path")?;
+
+    let result = backoff::retry(
+        || async { remote_storage.download(&remote_mark_path).await },
+        |e| matches!(e, DownloadError::NotFound),
+        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
+        SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
+        "fetch_tenant_deletion_mark",
+    )
+    .await;
+
+    match result {
+        Ok(_) => Ok(true),
+        Err(DownloadError::NotFound) => Ok(false),
+        Err(e) => Err(anyhow::anyhow!(e)).context("remote_delete_mark_exists")?,
+    }
+}
+
+/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
+/// and deletes its data from both disk and s3.
+/// The sequence of steps:
+/// 1. Upload remote deletion mark.
+/// 2. Create local mark file.
+/// 3. Shutdown tasks
+/// 4. Run ordered timeline deletions
+/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested
+/// 6. Remove remote mark
+/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
+/// It is resumable from any step in case a crash/restart occurs.
+/// There are three entrypoints to the process:
+/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
+/// 2. [`DeleteTenantFlow::resume_from_load`] is called during restarts when local or remote deletion marks are still there.
+/// 3. [`DeleteTenantFlow::resume_from_attach`] is called to resume deletion when the tenant is found to be deleted during the attach process.
+///  Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
+#[derive(Default)]
+pub enum DeleteTenantFlow {
+    #[default]
+    NotStarted,
+    InProgress,
+    Finished,
+}
+
+impl DeleteTenantFlow {
+    // These steps are run in the context of management api request handler.
+    // Long running steps are continued to run in the background.
+    // NB: If this fails half-way through, and is retried, the retry will go through
+    // all the same steps again. Make sure the code here is idempotent, and don't
+    // error out if some of the shutdown tasks have already been completed!
+    // NOTE: static needed for background part.
+    // We assume that calling code sets up the span with tenant_id.
+    #[instrument(skip_all)]
+    pub(crate) async fn run(
+        conf: &'static PageServerConf,
+        remote_storage: Option<GenericRemoteStorage>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenant_id: TenantId,
+    ) -> Result<(), DeleteTenantError> {
+        span::debug_assert_current_span_has_tenant_id();
+
+        let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;
+
+        if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
+            tenant.set_broken(format!("{e:#}")).await;
+            return Err(e);
+        }
+
+        Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
+
+        Ok(())
+    }
+
+    // Helper function needed to be able to match once on returned error and transition tenant into broken state.
+    // This is needed because tenant.shutdown is not idempotent. If tenant state is set to stopping another call to tenant.shutdown
+    // will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
+    // So the solution is to set tenant state to broken.
+    async fn run_inner(
+        guard: &mut OwnedMutexGuard<Self>,
+        conf: &'static PageServerConf,
+        remote_storage: Option<&GenericRemoteStorage>,
+        tenant: &Tenant,
+    ) -> Result<(), DeleteTenantError> {
+        guard.mark_in_progress()?;
+
+        fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-create-remote-mark"
+            ))?
+        });
+
+        // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend.
+        // Though sounds scary, different mark name?
+        // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
+        if let Some(remote_storage) = &remote_storage {
+            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id)
+                .await
+                .context("remote_mark")?
+        }
+
+        fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-create-local-mark"
+            ))?
+        });
+
+        create_local_delete_mark(conf, &tenant.tenant_id)
+            .await
+            .context("local delete mark")?;
+
+        fail::fail_point!("tenant-delete-before-background", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-background"
+            ))?
+        });
+
+        Ok(())
+    }
+
+    fn mark_in_progress(&mut self) -> anyhow::Result<()> {
+        match self {
+            Self::Finished => anyhow::bail!("Bug. Is in finished state"),
+            Self::InProgress { .. } => { /* We're in a retry */ }
+            Self::NotStarted => { /* Fresh start */ }
+        }
+
+        *self = Self::InProgress;
+
+        Ok(())
+    }
+
+    pub async fn should_resume_deletion(
+        conf: &'static PageServerConf,
+        remote_storage: Option<&GenericRemoteStorage>,
+        tenant: &Tenant,
+    ) -> Result<Option<DeletionGuard>, DeleteTenantError> {
+        let acquire = |t: &Tenant| {
+            Some(
+                Arc::clone(&t.delete_progress)
+                    .try_lock_owned()
+                    .expect("we're the only owner during init"),
+            )
+        };
+
+        let tenant_id = tenant.tenant_id;
+        // Check the local mark first: if it's there, there is no need to go to s3 to check whether the remote one exists.
+        if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
+            return Ok(acquire(tenant));
+        }
+
+        let remote_storage = match remote_storage {
+            Some(remote_storage) => remote_storage,
+            None => return Ok(None),
+        };
+
+        if remote_delete_mark_exists(conf, &tenant_id, remote_storage).await? {
+            Ok(acquire(tenant))
+        } else {
+            Ok(None)
+        }
+    }
+
+    pub(crate) async fn resume_from_load(
+        guard: DeletionGuard,
+        tenant: &Arc<Tenant>,
+        init_order: Option<&InitializationOrder>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        ctx: &RequestContext,
+    ) -> Result<(), DeleteTenantError> {
+        let (_, progress) = completion::channel();
+
+        tenant
+            .set_stopping(progress, true, false)
+            .await
+            .expect("cant be stopping or broken");
+
+        // Do not consume valuable resources during the load phase; continue deletion once the init phase is complete.
+        let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
+        if let Some(background) = background_jobs_can_start {
+            info!("waiting for backgound jobs barrier");
+            background.clone().wait().await;
+            info!("ready for backgound jobs barrier");
+        }
+
+        // The tenant may not be loadable if an earlier attempt failed late in `cleanup_remaining_fs_traces` (e.g. after removing the timelines dir)
+        let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
+        if timelines_path.exists() {
+            tenant.load(init_order, ctx).await.context("load")?;
+        }
+
+        Self::background(
+            guard,
+            tenant.conf,
+            tenant.remote_storage.clone(),
+            tenants,
+            tenant,
+        )
+        .await
+    }
+
+    pub(crate) async fn resume_from_attach(
+        guard: DeletionGuard,
+        tenant: &Arc<Tenant>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        ctx: &RequestContext,
+    ) -> Result<(), DeleteTenantError> {
+        let (_, progress) = completion::channel();
+
+        tenant
+            .set_stopping(progress, false, true)
+            .await
+            .expect("cant be stopping or broken");
+
+        tenant.attach(ctx).await.context("attach")?;
+
+        Self::background(
+            guard,
+            tenant.conf,
+            tenant.remote_storage.clone(),
+            tenants,
+            tenant,
+        )
+        .await
+    }
+
+    async fn prepare(
+        tenants: &tokio::sync::RwLock<TenantsMap>,
+        tenant_id: TenantId,
+    ) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
+        let m = tenants.read().await;
+
+        let tenant = m
+            .get(&tenant_id)
+            .ok_or(GetTenantError::NotFound(tenant_id))?;
+
+        // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
+        // so at least for now allow deletions only for active tenants. TODO recheck
+        // Broken is accepted for retries. NOTE(review): Stopping is rejected by the check below.
+        if !matches!(
+            tenant.current_state(),
+            TenantState::Active | TenantState::Broken { .. }
+        ) {
+            return Err(DeleteTenantError::InvalidState(tenant.current_state()));
+        }
+
+        let guard = Arc::clone(&tenant.delete_progress)
+            .try_lock_owned()
+            .map_err(|_| DeleteTenantError::AlreadyInProgress)?;
+
+        fail::fail_point!("tenant-delete-before-shutdown", |_| {
+            Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
+        });
+
+        // Make pageserver shutdown not wait for our completion.
+        let (_, progress) = completion::channel();
+
+        // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent,
+        // i.e. it is an error to do:
+        // tenant.set_stopping
+        // tenant.shutdown
+        // It's also bad that we're holding tenants.read here.
+        // TODO relax set_stopping to be idempotent?
+        if tenant.shutdown(progress, false).await.is_err() {
+            return Err(DeleteTenantError::Other(anyhow::anyhow!(
+                "tenant shutdown is already in progress"
+            )));
+        }
+
+        Ok((Arc::clone(tenant), guard))
+    }
+
+    fn schedule_background(
+        guard: OwnedMutexGuard<Self>,
+        conf: &'static PageServerConf,
+        remote_storage: Option<GenericRemoteStorage>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenant: Arc<Tenant>,
+    ) {
+        let tenant_id = tenant.tenant_id;
+
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            TaskKind::TimelineDeletionWorker,
+            Some(tenant_id),
+            None,
+            "tenant_delete",
+            false,
+            async move {
+                if let Err(err) =
+                    Self::background(guard, conf, remote_storage, tenants, &tenant).await
+                {
+                    error!("Error: {err:#}");
+                    tenant.set_broken(format!("{err:#}")).await;
+                };
+                Ok(())
+            }
+            .instrument({
+                let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id);
+                span.follows_from(Span::current());
+                span
+            }),
+        );
+    }
+
+    async fn background(
+        mut guard: OwnedMutexGuard<Self>,
+        conf: &PageServerConf,
+        remote_storage: Option<GenericRemoteStorage>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenant: &Arc<Tenant>,
+    ) -> Result<(), DeleteTenantError> {
+        // Tree sort timelines, schedule delete for them. Mention retries from the console side.
+        // Note that if deletion fails we don't mark timelines as broken;
+        // the whole tenant becomes broken instead, as per the `Self::schedule_background` logic.
+        let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
+            .await
+            .context("schedule_ordered_timeline_deletions")?;
+
+        fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-polling-ongoing-deletions"
+            ))?
+        });
+
+        // Wait for deletions that were already running at the moment when tenant deletion was requested.
+        // Locking a timeline's guard (it shadows the tenant `guard` inside this loop) means that deletion has ended.
+        for (guard, timeline_id) in already_running_timeline_deletions {
+            let flow = guard.lock().await;
+            if !flow.is_finished() {
+                return Err(DeleteTenantError::Other(anyhow::anyhow!(
+                    "already running timeline deletion failed: {timeline_id}"
+                )));
+            }
+        }
+
+        let timelines_path = conf.timelines_path(&tenant.tenant_id);
+        // May not exist if an earlier attempt failed in `cleanup_remaining_fs_traces` after removing it.
+        if timelines_path.exists() {
+            // Sanity check to guard against layout changes.
+            ensure_timelines_dir_empty(&timelines_path)
+                .await
+                .context("timelines dir not empty")?;
+        }
+
+        remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?;
+
+        fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
+            Err(anyhow::anyhow!(
+                "failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
+            ))?
+        });
+
+        cleanup_remaining_fs_traces(conf, &tenant.tenant_id)
+            .await
+            .context("cleanup_remaining_fs_traces")?;
+
+        let mut locked = tenants.write().await;
+        if locked.remove(&tenant.tenant_id).is_none() {
+            warn!("Tenant got removed from tenants map during deletion");
+        };
+
+        *guard = Self::Finished;
+
+        Ok(())
+    }
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -685,6 +685,7 @@ impl<const L: usize> BuildNode<L> {
 #[cfg(test)]
 mod tests {
    use super::*;
+    use crate::tenant::block_io::BlockLease;
    use rand::Rng;
    use std::collections::BTreeMap;
    use std::sync::atomic::{AtomicUsize, Ordering};
@@ -699,12 +700,10 @@ mod tests {
        }
    }
    impl BlockReader for TestDisk {
-        type BlockLease = std::rc::Rc<[u8; PAGE_SZ]>;
-
-        fn read_blk(&self, blknum: u32) -> io::Result<Self::BlockLease> {
+        fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
            let mut buf = [0u8; PAGE_SZ];
            buf.copy_from_slice(&self.blocks[blknum as usize]);
-            Ok(std::rc::Rc::new(buf))
+            Ok(std::rc::Rc::new(buf).into())
        }
    }
    impl BlockWriter for &mut TestDisk {
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -2,46 +2,31 @@
 //! used to keep in-memory layers spilled on disk.

 use crate::config::PageServerConf;
-use crate::page_cache::{self, ReadBufResult, WriteBufResult, PAGE_SZ};
+use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::blob_io::BlobWriter;
-use crate::tenant::block_io::BlockReader;
+use crate::tenant::block_io::{BlockLease, BlockReader};
 use crate::virtual_file::VirtualFile;
-use once_cell::sync::Lazy;
 use std::cmp::min;
-use std::collections::HashMap;
 use std::fs::OpenOptions;
 use std::io::{self, ErrorKind};
 use std::ops::DerefMut;
+use std::os::unix::prelude::FileExt;
 use std::path::PathBuf;
-use std::sync::{Arc, RwLock};
+use std::sync::atomic::AtomicU64;
 use tracing::*;
 use utils::id::{TenantId, TimelineId};

-use std::os::unix::fs::FileExt;
-
-///
-/// This is the global cache of file descriptors (File objects).
-///
-static EPHEMERAL_FILES: Lazy<RwLock<EphemeralFiles>> = Lazy::new(|| {
-    RwLock::new(EphemeralFiles {
-        next_file_id: 1,
-        files: HashMap::new(),
-    })
-});
-
-pub struct EphemeralFiles {
-    next_file_id: u64,
-
-    files: HashMap<u64, Arc<VirtualFile>>,
-}
-
 pub struct EphemeralFile {
-    file_id: u64,
+    page_cache_file_id: page_cache::FileId,
+
    _tenant_id: TenantId,
    _timeline_id: TimelineId,
-    file: Arc<VirtualFile>,
-
-    pub size: u64,
+    file: VirtualFile,
+    size: u64,
+    /// An ephemeral file is append-only.
+    /// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
+    /// The other pages, which can no longer be modified, are accessed through the page cache.
+    mutable_tail: [u8; PAGE_SZ],
 }

 impl EphemeralFile {
@@ -50,71 +35,31 @@ impl EphemeralFile {
        tenant_id: TenantId,
        timeline_id: TimelineId,
    ) -> Result<EphemeralFile, io::Error> {
-        let mut l = EPHEMERAL_FILES.write().unwrap();
-        let file_id = l.next_file_id;
-        l.next_file_id += 1;
+        static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
+        let filename_disambiguator =
+            NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed);

        let filename = conf
            .timeline_path(&tenant_id, &timeline_id)
-            .join(PathBuf::from(format!("ephemeral-{}", file_id)));
+            .join(PathBuf::from(format!("ephemeral-{filename_disambiguator}")));

        let file = VirtualFile::open_with_options(
            &filename,
            OpenOptions::new().read(true).write(true).create(true),
        )?;
-        let file_rc = Arc::new(file);
-        l.files.insert(file_id, file_rc.clone());

        Ok(EphemeralFile {
-            file_id,
+            page_cache_file_id: page_cache::next_file_id(),
            _tenant_id: tenant_id,
            _timeline_id: timeline_id,
-            file: file_rc,
+            file,
            size: 0,
+            mutable_tail: [0u8; PAGE_SZ],
        })
    }

-    fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), io::Error> {
-        let mut off = 0;
-        while off < PAGE_SZ {
-            let n = self
-                .file
-                .read_at(&mut buf[off..], blkno as u64 * PAGE_SZ as u64 + off as u64)?;
-
-            if n == 0 {
-                // Reached EOF. Fill the rest of the buffer with zeros.
-                const ZERO_BUF: [u8; PAGE_SZ] = [0u8; PAGE_SZ];
-
-                buf[off..].copy_from_slice(&ZERO_BUF[off..]);
-                break;
-            }
-
-            off += n;
-        }
-        Ok(())
-    }
-
-    fn get_buf_for_write(&self, blkno: u32) -> Result<page_cache::PageWriteGuard, io::Error> {
-        // Look up the right page
-        let cache = page_cache::get();
-        let mut write_guard = match cache
-            .write_ephemeral_buf(self.file_id, blkno)
-            .map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
-        {
-            WriteBufResult::Found(guard) => guard,
-            WriteBufResult::NotFound(mut guard) => {
-                // Read the page from disk into the buffer
-                // TODO: if we're overwriting the whole page, no need to read it in first
-                self.fill_buffer(guard.deref_mut(), blkno)?;
-                guard.mark_valid();
-
-                // And then fall through to modify it.
-                guard
-            }
-        };
-        write_guard.mark_dirty();
-
-        Ok(write_guard)
+    pub(crate) fn size(&self) -> u64 {
+        self.size
    }
 }

@@ -127,121 +72,104 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
    }
 }

-impl FileExt for EphemeralFile {
-    fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, io::Error> {
-        // Look up the right page
-        let blkno = (offset / PAGE_SZ as u64) as u32;
-        let off = offset as usize % PAGE_SZ;
-        let len = min(PAGE_SZ - off, dstbuf.len());
-
-        let read_guard;
-        let mut write_guard;
-
-        let cache = page_cache::get();
-        let buf = match cache
-            .read_ephemeral_buf(self.file_id, blkno)
-            .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
-        {
-            ReadBufResult::Found(guard) => {
-                read_guard = guard;
-                read_guard.as_ref()
-            }
-            ReadBufResult::NotFound(guard) => {
-                // Read the page from disk into the buffer
-                write_guard = guard;
-                self.fill_buffer(write_guard.deref_mut(), blkno)?;
-                write_guard.mark_valid();
-
-                // And then fall through to read the requested slice from the
-                // buffer.
-                write_guard.as_ref()
-            }
-        };
-
-        dstbuf[0..len].copy_from_slice(&buf[off..(off + len)]);
-        Ok(len)
-    }
-
-    fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result<usize, io::Error> {
-        // Look up the right page
-        let blkno = (offset / PAGE_SZ as u64) as u32;
-        let off = offset as usize % PAGE_SZ;
-        let len = min(PAGE_SZ - off, srcbuf.len());
-
-        let mut write_guard;
-        let cache = page_cache::get();
-        let buf = match cache
-            .write_ephemeral_buf(self.file_id, blkno)
-            .map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
-        {
-            WriteBufResult::Found(guard) => {
-                write_guard = guard;
-                write_guard.deref_mut()
-            }
-            WriteBufResult::NotFound(guard) => {
-                // Read the page from disk into the buffer
-                // TODO: if we're overwriting the whole page, no need to read it in first
-                write_guard = guard;
-                self.fill_buffer(write_guard.deref_mut(), blkno)?;
-                write_guard.mark_valid();
-
-                // And then fall through to modify it.
-                write_guard.deref_mut()
-            }
-        };
-
-        buf[off..(off + len)].copy_from_slice(&srcbuf[0..len]);
-        write_guard.mark_dirty();
-        Ok(len)
-    }
-}
-
 impl BlobWriter for EphemeralFile {
    fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
+        struct Writer<'a> {
+            ephemeral_file: &'a mut EphemeralFile,
+            /// The block to which the next [`push_bytes`] will write.
+            blknum: u32,
+            /// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write.
+            off: usize,
+        }
+        impl<'a> Writer<'a> {
+            fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
+                Ok(Writer {
+                    blknum: (ephemeral_file.size / PAGE_SZ as u64) as u32,
+                    off: (ephemeral_file.size % PAGE_SZ as u64) as usize,
+                    ephemeral_file,
+                })
+            }
+            #[inline(always)]
+            fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
+                let mut src_remaining = src;
+                while !src_remaining.is_empty() {
+                    let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..];
+                    let n = min(dst_remaining.len(), src_remaining.len());
+                    dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
+                    self.off += n;
+                    src_remaining = &src_remaining[n..];
+                    if self.off == PAGE_SZ {
+                        match self.ephemeral_file.file.write_all_at(
+                            &self.ephemeral_file.mutable_tail,
+                            self.blknum as u64 * PAGE_SZ as u64,
+                        ) {
+                            Ok(_) => {
+                                // Pre-warm the page cache with what we just wrote.
+                                // This isn't necessary for coherency/correctness, but it's how we've always done it.
+                                let cache = page_cache::get();
+                                match cache.read_immutable_buf(
+                                    self.ephemeral_file.page_cache_file_id,
+                                    self.blknum,
+                                ) {
+                                    Ok(page_cache::ReadBufResult::Found(_guard)) => {
+                                        // This function takes &mut self, so, it shouldn't be possible to reach this point.
+                                        unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum);
+                                    }
+                                    Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => {
+                                        let buf: &mut [u8] = write_guard.deref_mut();
+                                        debug_assert_eq!(buf.len(), PAGE_SZ);
+                                        buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
+                                        write_guard.mark_valid();
+                                        // pre-warm successful
+                                    }
+                                    Err(e) => {
+                                        error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
+                                        // fail gracefully, it's not the end of the world if we can't pre-warm the cache here
+                                    }
+                                }
+                                // Zero the buffer for re-use.
+                                // Zeroing is critical for correctness because the write_blob code below
+                                // and similarly read_blk expect zeroed pages.
+                                self.ephemeral_file.mutable_tail.fill(0);
+                                // This block is done, move to next one.
+                                self.blknum += 1;
+                                self.off = 0;
+                            }
+                            Err(e) => {
+                                return Err(std::io::Error::new(
+                                    ErrorKind::Other,
+                                    // order error before path because path is long and error is short
+                                    format!(
+                                        "ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}",
+                                        self.blknum,
+                                        e,
+                                        self.ephemeral_file.file.path.display(),
+                                    ),
+                                ));
+                            }
+                        }
+                    }
+                }
+                Ok(())
+            }
+        }
+
        let pos = self.size;
-
-        let mut blknum = (self.size / PAGE_SZ as u64) as u32;
-        let mut off = (pos % PAGE_SZ as u64) as usize;
-
-        let mut buf = self.get_buf_for_write(blknum)?;
+        let mut writer = Writer::new(self)?;

        // Write the length field
        if srcbuf.len() < 0x80 {
-            buf[off] = srcbuf.len() as u8;
-            off += 1;
+            // short one-byte length header
+            let len_buf = [srcbuf.len() as u8];
+            writer.push_bytes(&len_buf)?;
        } else {
            let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
            len_buf[0] |= 0x80;
-            let thislen = PAGE_SZ - off;
-            if thislen < 4 {
-                // it needs to be split across pages
-                buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]);
-                blknum += 1;
-                buf = self.get_buf_for_write(blknum)?;
-                buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]);
-                off = 4 - thislen;
-            } else {
-                buf[off..off + 4].copy_from_slice(&len_buf);
-                off += 4;
-            }
+            writer.push_bytes(&len_buf)?;
        }

        // Write the payload
-        let mut buf_remain = srcbuf;
-        while !buf_remain.is_empty() {
-            let mut page_remain = PAGE_SZ - off;
-            if page_remain == 0 {
-                blknum += 1;
-                buf = self.get_buf_for_write(blknum)?;
-                off = 0;
-                page_remain = PAGE_SZ;
-            }
-            let this_blk_len = min(page_remain, buf_remain.len());
-            buf[off..(off + this_blk_len)].copy_from_slice(&buf_remain[..this_blk_len]);
-            off += this_blk_len;
-            buf_remain = &buf_remain[this_blk_len..];
-        }
-        drop(buf);
+        writer.push_bytes(srcbuf)?;

        if srcbuf.len() < 0x80 {
            self.size += 1;
@@ -258,10 +186,7 @@ impl Drop for EphemeralFile {
    fn drop(&mut self) {
        // drop all pages from page cache
        let cache = page_cache::get();
-        cache.drop_buffers_for_ephemeral(self.file_id);
-
-        // remove entry from the hash map
-        EPHEMERAL_FILES.write().unwrap().files.remove(&self.file_id);
+        cache.drop_buffers_for_immutable(self.page_cache_file_id);

        // unlink the file
        let res = std::fs::remove_file(&self.file.path);
@@ -281,62 +206,54 @@ impl Drop for EphemeralFile {
    }
 }

-pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> {
-    if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
-        match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) {
-            Ok(_) => Ok(()),
-            Err(e) => Err(io::Error::new(
-                ErrorKind::Other,
-                format!(
-                    "failed to write back to ephemeral file at {} error: {}",
-                    file.path.display(),
-                    e
-                ),
-            )),
-        }
-    } else {
-        Err(io::Error::new(
-            ErrorKind::Other,
-            "could not write back page, not found in ephemeral files hash",
-        ))
-    }
-}
-
 impl BlockReader for EphemeralFile {
-    type BlockLease = page_cache::PageReadGuard<'static>;
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
+        let flushed_blknums = 0..self.size / PAGE_SZ as u64;
+        if flushed_blknums.contains(&(blknum as u64)) {
+            let cache = page_cache::get();
+            loop {
+                match cache
+                    .read_immutable_buf(self.page_cache_file_id, blknum)
+                    .map_err(|e| {
+                        std::io::Error::new(
+                            std::io::ErrorKind::Other,
+                            // order path before error because error is anyhow::Error => might have many contexts
+                            format!(
+                                "ephemeral file: read immutable page #{}: {}: {:#}",
+                                blknum,
+                                self.file.path.display(),
+                                e,
+                            ),
+                        )
+                    })? {
+                    page_cache::ReadBufResult::Found(guard) => {
+                        return Ok(BlockLease::PageReadGuard(guard))
+                    }
+                    page_cache::ReadBufResult::NotFound(mut write_guard) => {
+                        let buf: &mut [u8] = write_guard.deref_mut();
+                        debug_assert_eq!(buf.len(), PAGE_SZ);
+                        self.file
+                            .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
+                        write_guard.mark_valid();

-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, io::Error> {
-        // Look up the right page
-        let cache = page_cache::get();
-        loop {
-            match cache
-                .read_ephemeral_buf(self.file_id, blknum)
-                .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
-            {
-                ReadBufResult::Found(guard) => return Ok(guard),
-                ReadBufResult::NotFound(mut write_guard) => {
-                    // Read the page from disk into the buffer
-                    self.fill_buffer(write_guard.deref_mut(), blknum)?;
-                    write_guard.mark_valid();
-
-                    // Swap for read lock
-                    continue;
-                }
-            };
+                        // Swap for read lock
+                        continue;
+                    }
+                };
+            }
+        } else {
+            debug_assert_eq!(blknum as u64, self.size / PAGE_SZ as u64);
+            Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
        }
    }
 }

-fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
-    io::Error::new(ErrorKind::Other, format!("{context}: {e:#}"))
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
    use crate::tenant::blob_io::BlobWriter;
    use crate::tenant::block_io::BlockCursor;
-    use rand::{seq::SliceRandom, thread_rng, RngCore};
+    use rand::{thread_rng, RngCore};
    use std::fs;
    use std::str::FromStr;

@@ -357,61 +274,26 @@ mod tests {
        Ok((conf, tenant_id, timeline_id))
    }

-    // Helper function to slurp contents of a file, starting at the current position,
-    // into a string
-    fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result<String, io::Error> {
-        let mut buf = Vec::new();
-        buf.resize(len, 0u8);
-
-        efile.read_exact_at(&mut buf, offset)?;
-
-        Ok(String::from_utf8_lossy(&buf)
-            .trim_end_matches('\0')
-            .to_string())
-    }
-
-    #[test]
-    fn test_ephemeral_files() -> Result<(), io::Error> {
-        let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?;
-
-        let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?;
-
-        file_a.write_all_at(b"foo", 0)?;
-        assert_eq!("foo", read_string(&file_a, 0, 20)?);
-
-        file_a.write_all_at(b"bar", 3)?;
-        assert_eq!("foobar", read_string(&file_a, 0, 20)?);
-
-        // Open a lot of files, enough to cause some page evictions.
-        let mut efiles = Vec::new();
-        for fileno in 0..100 {
-            let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?;
-            efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?;
-            assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?);
-            efiles.push((fileno, efile));
-        }
-
-        // Check that all the files can still be read from. Use them in random order for
-        // good measure.
-        efiles.as_mut_slice().shuffle(&mut thread_rng());
-        for (fileno, efile) in efiles.iter_mut() {
-            assert_eq!(format!("file {}", fileno), read_string(efile, 0, 10)?);
-        }
-
-        Ok(())
-    }
-
-    #[test]
-    fn test_ephemeral_blobs() -> Result<(), io::Error> {
+    #[tokio::test]
+    async fn test_ephemeral_blobs() -> Result<(), io::Error> {
        let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;

        let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;

        let pos_foo = file.write_blob(b"foo")?;
-        assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
+        assert_eq!(
+            b"foo",
+            file.block_cursor().read_blob(pos_foo).await?.as_slice()
+        );
        let pos_bar = file.write_blob(b"bar")?;
-        assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
-        assert_eq!(b"bar", file.block_cursor().read_blob(pos_bar)?.as_slice());
+        assert_eq!(
+            b"foo",
+            file.block_cursor().read_blob(pos_foo).await?.as_slice()
+        );
+        assert_eq!(
+            b"bar",
+            file.block_cursor().read_blob(pos_bar).await?.as_slice()
+        );

        let mut blobs = Vec::new();
        for i in 0..10000 {
@@ -428,7 +310,7 @@ mod tests {

        let cursor = BlockCursor::new(&file);
        for (pos, expected) in blobs {
-            let actual = cursor.read_blob(pos)?;
+            let actual = cursor.read_blob(pos).await?;
            assert_eq!(actual, expected);
        }

@@ -437,7 +319,7 @@ mod tests {
        large_data.resize(20000, 0);
        thread_rng().fill_bytes(&mut large_data);
        let pos_large = file.write_blob(&large_data)?;
-        let result = file.block_cursor().read_blob(pos_large)?;
+        let result = file.block_cursor().read_blob(pos_large).await?;
        assert_eq!(result, large_data);

        Ok(())
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -20,17 +20,20 @@ use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
+use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};

 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};

+use super::delete::DeleteTenantError;
 use super::timeline::delete::DeleteTimelineFlow;
+use super::TenantSharedResources;

 /// The tenants known to the pageserver.
 /// The enum variants are used to distinguish the different states that the pageserver can be in.
-enum TenantsMap {
+pub(crate) enum TenantsMap {
    /// [`init_tenant_mgr`] is not done yet.
    Initializing,
    /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
@@ -42,13 +45,13 @@ enum TenantsMap {
 }

 impl TenantsMap {
-    fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
+    pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id),
        }
    }
-    fn remove(&mut self, tenant_id: &TenantId) -> Option<Arc<Tenant>> {
+    pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option<Arc<Tenant>> {
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id),
@@ -64,8 +67,7 @@ static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::
 #[instrument(skip_all)]
 pub async fn init_tenant_mgr(
    conf: &'static PageServerConf,
-    broker_client: storage_broker::BrokerClientChannel,
-    remote_storage: Option<GenericRemoteStorage>,
+    resources: TenantSharedResources,
    init_order: InitializationOrder,
 ) -> anyhow::Result<()> {
    // Scan local filesystem for attached tenants
@@ -97,7 +99,9 @@ pub async fn init_tenant_mgr(
                        );
                    }
                } else {
-                    // This case happens if we crash during attach before creating the attach marker file
+                    // This case happens if we:
+                    // * crash during attach before creating the attach marker file
+                    // * crash during tenant delete before removing tenant directory
                    let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
                        format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
                    })?;
@@ -121,9 +125,9 @@ pub async fn init_tenant_mgr(
                    match schedule_local_tenant_processing(
                        conf,
                        &tenant_dir_path,
-                        broker_client.clone(),
-                        remote_storage.clone(),
+                        resources.clone(),
                        Some(init_order.clone()),
+                        &TENANTS,
                        &ctx,
                    ) {
                        Ok(tenant) => {
@@ -154,12 +158,12 @@ pub async fn init_tenant_mgr(
    Ok(())
 }

-pub fn schedule_local_tenant_processing(
+pub(crate) fn schedule_local_tenant_processing(
    conf: &'static PageServerConf,
    tenant_path: &Path,
-    broker_client: storage_broker::BrokerClientChannel,
-    remote_storage: Option<GenericRemoteStorage>,
+    resources: TenantSharedResources,
    init_order: Option<InitializationOrder>,
+    tenants: &'static tokio::sync::RwLock<TenantsMap>,
    ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
    anyhow::ensure!(
@@ -194,8 +198,15 @@ pub fn schedule_local_tenant_processing(

    let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
        info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
-        if let Some(remote_storage) = remote_storage {
-            match Tenant::spawn_attach(conf, tenant_id, broker_client, remote_storage, ctx) {
+        if let Some(remote_storage) = resources.remote_storage {
+            match Tenant::spawn_attach(
+                conf,
+                tenant_id,
+                resources.broker_client,
+                tenants,
+                remote_storage,
+                ctx,
+            ) {
                Ok(tenant) => tenant,
                Err(e) => {
                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
@@ -213,14 +224,7 @@ pub fn schedule_local_tenant_processing(
    } else {
        info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
        // Start loading the tenant into memory. It will initially be in Loading state.
-        Tenant::spawn_load(
-            conf,
-            tenant_id,
-            broker_client,
-            remote_storage,
-            init_order,
-            ctx,
-        )
+        Tenant::spawn_load(conf, tenant_id, resources, init_order, tenants, ctx)
    };
    Ok(tenant)
 }
@@ -355,8 +359,12 @@ pub async fn create_tenant(
        // TODO: tenant directory remains on disk if we bail out from here on.
        //       See https://github.com/neondatabase/neon/issues/4233

+        let tenant_resources = TenantSharedResources {
+            broker_client,
+            remote_storage,
+        };
        let created_tenant =
-            schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?;
+            schedule_local_tenant_processing(conf, &tenant_directory, tenant_resources, None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

@@ -417,6 +425,14 @@ pub async fn get_tenant(
    }
 }

+pub async fn delete_tenant(
+    conf: &'static PageServerConf,
+    remote_storage: Option<GenericRemoteStorage>,
+    tenant_id: TenantId,
+) -> Result<(), DeleteTenantError> {
+    DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant_id).await
+}
+
 #[derive(Debug, thiserror::Error)]
 pub enum DeleteTimelineError {
    #[error("Tenant {0}")]
@@ -432,7 +448,7 @@ pub async fn delete_timeline(
    _ctx: &RequestContext,
 ) -> Result<(), DeleteTimelineError> {
    let tenant = get_tenant(tenant_id, true).await?;
-    DeleteTimelineFlow::run(&tenant, timeline_id).await?;
+    DeleteTimelineFlow::run(&tenant, timeline_id, false).await?;
    Ok(())
 }

@@ -507,7 +523,11 @@ pub async fn load_tenant(
                .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
        }

-        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx)
+        let resources = TenantSharedResources {
+            broker_client,
+            remote_storage,
+        };
+        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path,  resources, None,  &TENANTS, ctx)
            .with_context(|| {
                format!("Failed to schedule tenant processing in path {tenant_path:?}")
            })?;
@@ -588,7 +608,11 @@ pub async fn attach_tenant(
            .context("check for attach marker file existence")?;
        anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");

-        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?;
+        let resources = TenantSharedResources {
+            broker_client,
+            remote_storage: Some(remote_storage),
+        };
+        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, resources, None, &TENANTS, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -211,6 +211,9 @@ use chrono::{NaiveDateTime, Utc};
 // re-export these
 pub use download::{is_temp_download_file, list_remote_timelines};
 use scopeguard::ScopeGuard;
+use utils::backoff::{
+    self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
+};

 use std::collections::{HashMap, VecDeque};
 use std::path::Path;
@@ -219,7 +222,6 @@ use std::sync::{Arc, Mutex};

 use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
 use std::ops::DerefMut;
-use tokio::runtime::Runtime;
 use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;
@@ -241,7 +243,6 @@ use crate::{
    tenant::upload_queue::{
        UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
    },
-    {exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS},
 };

 use utils::id::{TenantId, TimelineId};
@@ -256,12 +257,12 @@ use super::upload_queue::SetDeletedFlagProgress;
 // But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
 // level instead, as repeated failures can mean a more serious problem. If it
 // fails more than FAILED_DOWNLOAD_RETRIES times, we give up
-const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
-const FAILED_DOWNLOAD_RETRIES: u32 = 10;
+pub(crate) const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
+pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10;

 // Similarly log failed uploads and deletions at WARN level, after this many
 // retries. Uploads and deletions are retried forever, though.
-const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
+pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;

 pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
@@ -309,7 +310,7 @@ pub enum PersistIndexPartWithDeletedFlagError {
 pub struct RemoteTimelineClient {
    conf: &'static PageServerConf,

-    runtime: &'static Runtime,
+    runtime: tokio::runtime::Handle,

    tenant_id: TenantId,
    timeline_id: TimelineId,
@@ -336,7 +337,7 @@ impl RemoteTimelineClient {
    ) -> RemoteTimelineClient {
        RemoteTimelineClient {
            conf,
-            runtime: &BACKGROUND_RUNTIME,
+            runtime: BACKGROUND_RUNTIME.handle().to_owned(),
            tenant_id,
            timeline_id,
            storage_impl: remote_storage,
@@ -752,12 +753,24 @@ impl RemoteTimelineClient {

        pausable_failpoint!("persist_deleted_index_part");

-        upload::upload_index_part(
-            self.conf,
-            &self.storage_impl,
-            &self.tenant_id,
-            &self.timeline_id,
-            &index_part_with_deleted_at,
+        backoff::retry(
+            || async {
+                upload::upload_index_part(
+                    self.conf,
+                    &self.storage_impl,
+                    &self.tenant_id,
+                    &self.timeline_id,
+                    &index_part_with_deleted_at,
+                )
+                .await
+            },
+            |_e| false,
+            1,
+            // have just a couple of attempts
+            // when executed as part of timeline deletion this happens in context of api call
+            // when executed as part of tenant deletion this happens in the background
+            2,
+            "persist_index_part_with_deleted_flag",
        )
        .await?;

@@ -834,10 +847,19 @@ impl RemoteTimelineClient {
        let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
        let timeline_storage_path = self.conf.remote_path(&timeline_path)?;

-        let remaining = self
-            .storage_impl
-            .list_prefixes(Some(&timeline_storage_path))
-            .await?;
+        let remaining = backoff::retry(
+            || async {
+                self.storage_impl
+                    .list_files(Some(&timeline_storage_path))
+                    .await
+            },
+            |_e| false,
+            FAILED_DOWNLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "list_prefixes",
+        )
+        .await
+        .context("list prefixes")?;

        let remaining: Vec<RemotePath> = remaining
            .into_iter()
@@ -852,7 +874,15 @@ impl RemoteTimelineClient {
            .collect();

        if !remaining.is_empty() {
-            self.storage_impl.delete_objects(&remaining).await?;
+            backoff::retry(
+                || async { self.storage_impl.delete_objects(&remaining).await },
+                |_e| false,
+                FAILED_UPLOAD_WARN_THRESHOLD,
+                FAILED_REMOTE_OP_RETRIES,
+                "delete_objects",
+            )
+            .await
+            .context("delete_objects")?;
        }

        fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -864,7 +894,16 @@ impl RemoteTimelineClient {
        let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));

        debug!("deleting index part");
-        self.storage_impl.delete(&index_file_path).await?;
+
+        backoff::retry(
+            || async { self.storage_impl.delete(&index_file_path).await },
+            |_e| false,
+            FAILED_UPLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "delete_index",
+        )
+        .await
+        .context("delete_index")?;

        fail::fail_point!("timeline-delete-after-index-delete", |_| {
            Err(anyhow::anyhow!(
@@ -954,7 +993,7 @@ impl RemoteTimelineClient {
            let tenant_id = self.tenant_id;
            let timeline_id = self.timeline_id;
            task_mgr::spawn(
-                self.runtime.handle(),
+                &self.runtime,
                TaskKind::RemoteUploadTask,
                Some(self.tenant_id),
                Some(self.timeline_id),
@@ -1307,7 +1346,7 @@ mod tests {
        context::RequestContext,
        tenant::{
            harness::{TenantHarness, TIMELINE_ID},
-            Tenant,
+            Tenant, Timeline,
        },
        DEFAULT_PG_VERSION,
    };
@@ -1316,7 +1355,6 @@ mod tests {
        collections::HashSet,
        path::{Path, PathBuf},
    };
-    use tokio::runtime::EnterGuard;
    use utils::lsn::Lsn;

    pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
@@ -1366,35 +1404,25 @@ mod tests {
    }

    struct TestSetup {
-        runtime: &'static tokio::runtime::Runtime,
-        entered_runtime: EnterGuard<'static>,
        harness: TenantHarness,
        tenant: Arc<Tenant>,
+        timeline: Arc<Timeline>,
        tenant_ctx: RequestContext,
        remote_fs_dir: PathBuf,
        client: Arc<RemoteTimelineClient>,
    }

    impl TestSetup {
-        fn new(test_name: &str) -> anyhow::Result<Self> {
+        async fn new(test_name: &str) -> anyhow::Result<Self> {
            // Use a current-thread runtime in the test
-            let runtime = Box::leak(Box::new(
-                tokio::runtime::Builder::new_current_thread()
-                    .enable_all()
-                    .build()?,
-            ));
-            let entered_runtime = runtime.enter();
-
            let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
            let harness = TenantHarness::create(test_name)?;
-            let (tenant, ctx) = runtime.block_on(harness.load());
+            let (tenant, ctx) = harness.load().await;
+
            // create an empty timeline directory
-            let _ = runtime.block_on(tenant.create_test_timeline(
-                TIMELINE_ID,
-                Lsn(8),
-                DEFAULT_PG_VERSION,
-                &ctx,
-            ))?;
+            let timeline = tenant
+                .create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
+                .await?;

            let remote_fs_dir = harness.conf.workdir.join("remote_fs");
            std::fs::create_dir_all(remote_fs_dir)?;
@@ -1416,7 +1444,7 @@ mod tests {

            let client = Arc::new(RemoteTimelineClient {
                conf: harness.conf,
-                runtime,
+                runtime: tokio::runtime::Handle::current(),
                tenant_id: harness.tenant_id,
                timeline_id: TIMELINE_ID,
                storage_impl: storage,
@@ -1428,10 +1456,9 @@ mod tests {
            });

            Ok(Self {
-                runtime,
-                entered_runtime,
                harness,
                tenant,
+                timeline,
                tenant_ctx: ctx,
                remote_fs_dir,
                client,
@@ -1440,8 +1467,8 @@ mod tests {
    }

    // Test scheduling
-    #[test]
-    fn upload_scheduling() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn upload_scheduling() {
        // Test outline:
        //
        // Schedule upload of a bunch of layers. Check that they are started immediately, not queued
@@ -1457,25 +1484,26 @@ mod tests {
        // Schedule index upload. Check that it's queued

        let TestSetup {
-            runtime,
-            entered_runtime: _entered_runtime,
            harness,
            tenant: _tenant,
+            timeline: _timeline,
            tenant_ctx: _tenant_ctx,
            remote_fs_dir,
            client,
-        } = TestSetup::new("upload_scheduling").unwrap();
+        } = TestSetup::new("upload_scheduling").await.unwrap();

        let timeline_path = harness.timeline_path(&TIMELINE_ID);

        println!("workdir: {}", harness.conf.workdir.display());

        let remote_timeline_dir =
-            remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir)?);
+            remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
        println!("remote_timeline_dir: {}", remote_timeline_dir.display());

        let metadata = dummy_metadata(Lsn(0x10));
-        client.init_upload_queue_for_empty_remote(&metadata)?;
+        client
+            .init_upload_queue_for_empty_remote(&metadata)
+            .unwrap();

        // Create a couple of dummy files,  schedule upload for them
        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
@@ -1484,26 +1512,32 @@ mod tests {
        let content_1 = dummy_contents("foo");
        let content_2 = dummy_contents("bar");
        let content_3 = dummy_contents("baz");
-        std::fs::write(
-            timeline_path.join(layer_file_name_1.file_name()),
-            &content_1,
-        )?;
-        std::fs::write(
-            timeline_path.join(layer_file_name_2.file_name()),
-            &content_2,
-        )?;
-        std::fs::write(timeline_path.join(layer_file_name_3.file_name()), content_3)?;

-        client.schedule_layer_file_upload(
-            &layer_file_name_1,
-            &LayerFileMetadata::new(content_1.len() as u64),
-        )?;
-        client.schedule_layer_file_upload(
-            &layer_file_name_2,
-            &LayerFileMetadata::new(content_2.len() as u64),
-        )?;
+        for (filename, content) in [
+            (&layer_file_name_1, &content_1),
+            (&layer_file_name_2, &content_2),
+            (&layer_file_name_3, &content_3),
+        ] {
+            std::fs::write(timeline_path.join(filename.file_name()), content).unwrap();
+        }
+
+        client
+            .schedule_layer_file_upload(
+                &layer_file_name_1,
+                &LayerFileMetadata::new(content_1.len() as u64),
+            )
+            .unwrap();
+        client
+            .schedule_layer_file_upload(
+                &layer_file_name_2,
+                &LayerFileMetadata::new(content_2.len() as u64),
+            )
+            .unwrap();

        // Check that they are started immediately, not queued
+        //
+        // this works because we are running within block_on, so any futures stay queued up
+        // until our next await point.
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1517,7 +1551,9 @@ mod tests {

        // Schedule upload of index. Check that it is queued
        let metadata = dummy_metadata(Lsn(0x20));
-        client.schedule_index_upload_for_metadata_update(&metadata)?;
+        client
+            .schedule_index_upload_for_metadata_update(&metadata)
+            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1526,7 +1562,7 @@ mod tests {
        }

        // Wait for the uploads to finish
-        runtime.block_on(client.wait_completion())?;
+        client.wait_completion().await.unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1536,29 +1572,35 @@ mod tests {
        }

        // Download back the index.json, and check that the list of files is correct
-        let index_part = match runtime.block_on(client.download_index_file())? {
+        let index_part = match client.download_index_file().await.unwrap() {
            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
        };

        assert_file_list(
-            &index_part.timeline_layers,
+            &index_part
+                .layer_metadata
+                .keys()
+                .map(|f| f.to_owned())
+                .collect(),
            &[
                &layer_file_name_1.file_name(),
                &layer_file_name_2.file_name(),
            ],
        );
-        let downloaded_metadata = index_part.parse_metadata()?;
+        let downloaded_metadata = index_part.parse_metadata().unwrap();
        assert_eq!(downloaded_metadata, metadata);

        // Schedule upload and then a deletion. Check that the deletion is queued
-        let content_baz = dummy_contents("baz");
-        std::fs::write(timeline_path.join("baz"), &content_baz)?;
-        client.schedule_layer_file_upload(
-            &layer_file_name_3,
-            &LayerFileMetadata::new(content_baz.len() as u64),
-        )?;
-        client.schedule_layer_file_deletion(&[layer_file_name_1.clone()])?;
+        client
+            .schedule_layer_file_upload(
+                &layer_file_name_3,
+                &LayerFileMetadata::new(content_3.len() as u64),
+            )
+            .unwrap();
+        client
+            .schedule_layer_file_deletion(&[layer_file_name_1.clone()])
+            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut().unwrap();
@@ -1580,7 +1622,7 @@ mod tests {
        );

        // Finish them
-        runtime.block_on(client.wait_completion())?;
+        client.wait_completion().await.unwrap();

        assert_remote_files(
            &[
@@ -1590,23 +1632,24 @@ mod tests {
            ],
            &remote_timeline_dir,
        );
-
-        Ok(())
    }

-    #[test]
-    fn bytes_unfinished_gauge_for_layer_file_uploads() -> anyhow::Result<()> {
+    #[tokio::test]
+    async fn bytes_unfinished_gauge_for_layer_file_uploads() {
        // Setup

        let TestSetup {
-            runtime,
            harness,
+            tenant: _tenant,
+            timeline: _timeline,
            client,
            ..
-        } = TestSetup::new("metrics")?;
+        } = TestSetup::new("metrics").await.unwrap();

        let metadata = dummy_metadata(Lsn(0x10));
-        client.init_upload_queue_for_empty_remote(&metadata)?;
+        client
+            .init_upload_queue_for_empty_remote(&metadata)
+            .unwrap();

        let timeline_path = harness.timeline_path(&TIMELINE_ID);

@@ -1615,7 +1658,8 @@ mod tests {
        std::fs::write(
            timeline_path.join(layer_file_name_1.file_name()),
            &content_1,
-        )?;
+        )
+        .unwrap();

        #[derive(Debug, PartialEq)]
        struct BytesStartedFinished {
@@ -1641,14 +1685,16 @@ mod tests {

        let init = get_bytes_started_stopped();

-        client.schedule_layer_file_upload(
-            &layer_file_name_1,
-            &LayerFileMetadata::new(content_1.len() as u64),
-        )?;
+        client
+            .schedule_layer_file_upload(
+                &layer_file_name_1,
+                &LayerFileMetadata::new(content_1.len() as u64),
+            )
+            .unwrap();

        let pre = get_bytes_started_stopped();

-        runtime.block_on(client.wait_completion())?;
+        client.wait_completion().await.unwrap();

        let post = get_bytes_started_stopped();

@@ -1676,7 +1722,5 @@ mod tests {
                finished: Some(content_1.len())
            }
        );
-
-        Ok(())
    }
 }
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -11,23 +11,17 @@ use std::time::Duration;
 use anyhow::{anyhow, Context};
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
-
-use tracing::{info, warn};
+use utils::{backoff, crashsafe};

 use crate::config::PageServerConf;
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::{TenantId, TimelineId};

 use super::index::{IndexPart, LayerFileMetadata};
-use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
-
-async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
-    fs::File::open(path).await?.sync_all().await
-}
+use super::{FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};

 static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);

@@ -152,7 +146,7 @@ pub async fn download_layer_file<'a>(
        })
        .map_err(DownloadError::Other)?;

-    fsync_path(&local_path)
+    crashsafe::fsync_async(&local_path)
        .await
        .with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
        .map_err(DownloadError::Other)?;
@@ -268,7 +262,6 @@ pub(super) async fn download_index_part(
    Ok(index_part)
 }

-///
 /// Helper function to handle retries for a download operation.
 ///
 /// Remote operations can fail due to rate limits (IAM, S3), spurious network
@@ -276,47 +269,17 @@ pub(super) async fn download_index_part(
 /// with backoff.
 ///
 /// (See similar logic for uploads in `perform_upload_task`)
-async fn download_retry<T, O, F>(mut op: O, description: &str) -> Result<T, DownloadError>
+async fn download_retry<T, O, F>(op: O, description: &str) -> Result<T, DownloadError>
 where
    O: FnMut() -> F,
    F: Future<Output = Result<T, DownloadError>>,
 {
-    let mut attempts = 0;
-    loop {
-        let result = op().await;
-        match result {
-            Ok(_) => {
-                if attempts > 0 {
-                    info!("{description} succeeded after {attempts} retries");
-                }
-                return result;
-            }
-
-            // These are "permanent" errors that should not be retried.
-            Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => {
-                return result;
-            }
-            // Assume that any other failure might be transient, and the operation might
-            // succeed if we just keep trying.
-            Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => {
-                info!("{description} failed, will retry (attempt {attempts}): {err:#}");
-            }
-            Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => {
-                warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
-            }
-            Err(DownloadError::Other(ref err)) => {
-                // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
-                warn!("{description} still failed after {attempts} retries, giving up: {err:?}");
-                return result;
-            }
-        }
-        // sleep and retry
-        exponential_backoff(
-            attempts,
-            DEFAULT_BASE_BACKOFF_SECONDS,
-            DEFAULT_MAX_BACKOFF_SECONDS,
-        )
-        .await;
-        attempts += 1;
-    }
+    backoff::retry(
+        op,
+        |e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
+        FAILED_DOWNLOAD_WARN_THRESHOLD,
+        FAILED_REMOTE_OP_RETRIES,
+        description,
+    )
+    .await
 }
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -62,10 +62,9 @@ pub struct IndexPart {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub deleted_at: Option<NaiveDateTime>,

-    /// Layer names, which are stored on the remote storage.
-    ///
-    /// Additional metadata can might exist in `layer_metadata`.
-    pub timeline_layers: HashSet<LayerFileName>,
+    /// Legacy field: equal to the keys of `layer_metadata`, only written out for forward compat
+    #[serde(default, skip_deserializing)]
+    timeline_layers: HashSet<LayerFileName>,

    /// Per layer file name metadata, which can be present for a present or missing layer file.
    ///
@@ -74,9 +73,10 @@ pub struct IndexPart {
    pub layer_metadata: HashMap<LayerFileName, IndexLayerMetadata>,

    // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
-    // It's duplicated here for convenience.
+    // It's duplicated for convenience when reading the serialized structure, but is
+    // private because internally we would read from metadata instead.
    #[serde_as(as = "DisplayFromStr")]
-    pub disk_consistent_lsn: Lsn,
+    disk_consistent_lsn: Lsn,
    metadata_bytes: Vec<u8>,
 }

@@ -85,7 +85,11 @@ impl IndexPart {
    /// used to understand later versions.
    ///
    /// Version is currently informative only.
-    const LATEST_VERSION: usize = 2;
+    /// Version history
+    /// - 2: added `deleted_at`
+    /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
+    ///      is always generated from the keys of `layer_metadata`)
+    const LATEST_VERSION: usize = 3;
    pub const FILE_NAME: &'static str = "index_part.json";

    pub fn new(
@@ -166,7 +170,7 @@ mod tests {
        let expected = IndexPart {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 1,
-            timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
+            timeline_layers: HashSet::new(),
            layer_metadata: HashMap::from([
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
@@ -203,7 +207,7 @@ mod tests {
        let expected = IndexPart {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 1,
-            timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
+            timeline_layers: HashSet::new(),
            layer_metadata: HashMap::from([
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
@@ -241,7 +245,7 @@ mod tests {
        let expected = IndexPart {
            // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
            version: 2,
-            timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
+            timeline_layers: HashSet::new(),
            layer_metadata: HashMap::from([
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -8,7 +8,7 @@ mod layer_desc;
 mod remote_layer;

 use crate::config::PageServerConf;
-use crate::context::RequestContext;
+use crate::context::{AccessStatsBehavior, RequestContext};
 use crate::repository::Key;
 use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
@@ -241,10 +241,14 @@ impl LayerAccessStats {
        });
    }

-    fn record_access(&self, access_kind: LayerAccessKind, task_kind: TaskKind) {
+    fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) {
+        if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
+            return;
+        }
+
        let this_access = LayerAccessStatFullDetails {
            when: SystemTime::now(),
-            task_kind,
+            task_kind: ctx.task_kind(),
            access_kind,
        };

@@ -252,7 +256,7 @@ impl LayerAccessStats {
        locked.iter_mut().for_each(|inner| {
            inner.first_access.get_or_insert(this_access);
            inner.count_by_access_kind[access_kind] += 1;
-            inner.task_kind_flag |= task_kind;
+            inner.task_kind_flag |= ctx.task_kind();
            inner.last_accesses.write(this_access);
        })
    }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -29,10 +29,10 @@
 //!
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
-use crate::page_cache::{PageReadGuard, PAGE_SZ};
+use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
-use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
+use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
    PersistentLayer, ValueReconstructResult, ValueReconstructState,
@@ -176,10 +176,6 @@ impl DeltaKey {
        Lsn(u64::from_be_bytes(self.0[KEY_SIZE..].try_into().unwrap()))
    }

-    fn extract_key_from_buf(buf: &[u8]) -> Key {
-        Key::from_slice(&buf[..KEY_SIZE])
-    }
-
    fn extract_lsn_from_buf(buf: &[u8]) -> Lsn {
        let mut lsn_buf = [0u8; 8];
        lsn_buf.copy_from_slice(&buf[KEY_SIZE..]);
@@ -276,47 +272,42 @@ impl Layer for DeltaLayer {

        tree_reader.dump().await?;

-        let cursor = file.block_cursor();
+        let keys = DeltaLayerInner::load_keys(&Ref(&**inner)).await?;

        // A subroutine to dump a single blob
-        let dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
-            let buf = cursor.read_blob(blob_ref.pos())?;
-            let val = Value::des(&buf)?;
-            let desc = match val {
-                Value::Image(img) => {
-                    format!(" img {} bytes", img.len())
-                }
-                Value::WalRecord(rec) => {
-                    let wal_desc = walrecord::describe_wal_record(&rec)?;
-                    format!(
-                        " rec {} bytes will_init: {} {}",
-                        buf.len(),
-                        rec.will_init(),
-                        wal_desc
-                    )
-                }
-            };
-            Ok(desc)
+        let dump_blob = |val: ValueRef<_>| -> _ {
+            async move {
+                let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
+                let val = Value::des(&buf)?;
+                let desc = match val {
+                    Value::Image(img) => {
+                        format!(" img {} bytes", img.len())
+                    }
+                    Value::WalRecord(rec) => {
+                        let wal_desc = walrecord::describe_wal_record(&rec)?;
+                        format!(
+                            " rec {} bytes will_init: {} {}",
+                            buf.len(),
+                            rec.will_init(),
+                            wal_desc
+                        )
+                    }
+                };
+                Ok(desc)
+            }
        };

-        tree_reader
-            .visit(
-                &[0u8; DELTA_KEY_SIZE],
-                VisitDirection::Forwards,
-                |delta_key, val| {
-                    let blob_ref = BlobRef(val);
-                    let key = DeltaKey::extract_key_from_buf(delta_key);
-                    let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
-
-                    let desc = match dump_blob(blob_ref) {
-                        Ok(desc) => desc,
-                        Err(err) => format!("ERROR: {}", err),
-                    };
-                    println!("  key {} at {}: {}", key, lsn, desc);
-                    true
-                },
-            )
-            .await?;
+        for entry in keys {
+            let DeltaEntry { key, lsn, val, .. } = entry;
+            let desc = match dump_blob(val).await {
+                Ok(desc) => desc,
+                Err(err) => {
+                    let err: anyhow::Error = err;
+                    format!("ERROR: {err}")
+                }
+            };
+            println!("  key {key} at {lsn}: {desc}");
+        }

        Ok(())
    }
@@ -335,7 +326,6 @@ impl Layer for DeltaLayer {
        let inner = self
            .load(LayerAccessKind::GetValueReconstructData, ctx)
            .await?;
-
        inner
            .get_value_reconstruct_data(key, lsn_range, reconstruct_state)
            .await
@@ -452,8 +442,7 @@ impl DeltaLayer {
        access_kind: LayerAccessKind,
        ctx: &RequestContext,
    ) -> Result<&Arc<DeltaLayerInner>> {
-        self.access_stats
-            .record_access(access_kind, ctx.task_kind());
+        self.access_stats.record_access(access_kind, ctx);
        // Quick exit if already loaded
        self.inner
            .get_or_try_init(|| self.load_inner())
@@ -549,30 +538,20 @@ impl DeltaLayer {
            &self.layer_name(),
        )
    }
-
-    /// Obtains all keys and value references stored in the layer
+    /// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
    ///
    /// The value can be obtained via the [`ValueRef::load`] function.
-    pub async fn load_val_refs(
+    pub(crate) async fn load_keys(
        &self,
        ctx: &RequestContext,
-    ) -> Result<Vec<(Key, Lsn, ValueRef<Arc<DeltaLayerInner>>)>> {
-        let inner = self
-            .load(LayerAccessKind::Iter, ctx)
-            .await
-            .context("load delta layer")?;
-        DeltaLayerInner::load_val_refs(inner)
-            .await
-            .context("Layer index is corrupted")
-    }
-
-    /// Loads all keys stored in the layer. Returns key, lsn and value size.
-    pub async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, u64)>> {
+    ) -> Result<Vec<DeltaEntry<Ref<&'_ DeltaLayerInner>>>> {
        let inner = self
            .load(LayerAccessKind::KeyIter, ctx)
            .await
            .context("load delta layer keys")?;
-        DeltaLayerInner::load_keys(inner)
+
+        let inner = Ref(&**inner);
+        DeltaLayerInner::load_keys(&inner)
            .await
            .context("Layer index is corrupted")
    }
@@ -711,6 +690,17 @@ impl DeltaLayerWriterInner {
            .metadata()
            .context("get file metadata to determine size")?;

+        // 5GB limit for objects without multipart upload (which we don't want to use)
+        // Make it a little bit below to account for differing GB units
+        // https://docs.aws.amazon.com/AmazonS3/latest/userguide/upload-objects.html
+        const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
+        ensure!(
+            metadata.len() <= S3_UPLOAD_LIMIT,
+            "Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
+            file.path.display(),
+            metadata.len()
+        );
+
        // Note: Because we opened the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
        // set inner.file here. The first read will have to re-open it.
@@ -913,12 +903,15 @@ impl DeltaLayerInner {
        let cursor = file.block_cursor();
        let mut buf = Vec::new();
        for (entry_lsn, pos) in offsets {
-            cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
-                format!(
-                    "Failed to read blob from virtual file {}",
-                    file.file.path.display()
-                )
-            })?;
+            cursor
+                .read_blob_into_buf(pos, &mut buf)
+                .await
+                .with_context(|| {
+                    format!(
+                        "Failed to read blob from virtual file {}",
+                        file.file.path.display()
+                    )
+                })?;
            let val = Value::des(&buf).with_context(|| {
                format!(
                    "Failed to deserialize file blob from virtual file {}",
@@ -952,15 +945,17 @@ impl DeltaLayerInner {
        }
    }

-    pub(super) async fn load_val_refs<T: AsRef<DeltaLayerInner> + Clone>(
+    pub(super) async fn load_keys<T: AsRef<DeltaLayerInner> + Clone>(
        this: &T,
-    ) -> Result<Vec<(Key, Lsn, ValueRef<T>)>> {
+    ) -> Result<Vec<DeltaEntry<T>>> {
        let dl = this.as_ref();
        let file = &dl.file;
+
        let tree_reader =
            DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);

-        let mut all_offsets = Vec::<(Key, Lsn, ValueRef<T>)>::new();
+        let mut all_keys: Vec<DeltaEntry<T>> = Vec::new();
+
        tree_reader
            .visit(
                &[0u8; DELTA_KEY_SIZE],
@@ -971,54 +966,63 @@ impl DeltaLayerInner {
                        blob_ref: BlobRef(value),
                        reader: BlockCursor::new(Adapter(this.clone())),
                    };
-                    all_offsets.push((delta_key.key(), delta_key.lsn(), val_ref));
-                    true
-                },
-            )
-            .await?;
-
-        Ok(all_offsets)
-    }
-
-    pub(super) async fn load_keys(&self) -> Result<Vec<(Key, Lsn, u64)>> {
-        let file = &self.file;
-        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
-            self.index_start_blk,
-            self.index_root_blk,
-            file,
-        );
-
-        let mut all_keys: Vec<(Key, Lsn, u64)> = Vec::new();
-        tree_reader
-            .visit(
-                &[0u8; DELTA_KEY_SIZE],
-                VisitDirection::Forwards,
-                |key, value| {
-                    let delta_key = DeltaKey::from_slice(key);
                    let pos = BlobRef(value).pos();
                    if let Some(last) = all_keys.last_mut() {
-                        if last.0 == delta_key.key() {
-                            return true;
-                        } else {
-                            // subtract offset of new key BLOB and first blob of this key
-                            // to get total size if values associated with this key
-                            let first_pos = last.2;
-                            last.2 = pos - first_pos;
-                        }
+                        // subtract offset of the current and last entries to get the size
+                        // of the value associated with this (key, lsn) tuple
+                        let first_pos = last.size;
+                        last.size = pos - first_pos;
                    }
-                    all_keys.push((delta_key.key(), delta_key.lsn(), pos));
+                    let entry = DeltaEntry {
+                        key: delta_key.key(),
+                        lsn: delta_key.lsn(),
+                        size: pos,
+                        val: val_ref,
+                    };
+                    all_keys.push(entry);
                    true
                },
            )
            .await?;
        if let Some(last) = all_keys.last_mut() {
-            // Last key occupies all space till end of layer
-            last.2 = std::fs::metadata(&file.file.path)?.len() - last.2;
+            // Last key occupies all space till end of value storage,
+            // which corresponds to beginning of the index
+            last.size = dl.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
        }
        Ok(all_keys)
    }
 }

+/// Cloneable borrow wrapper to make borrows behave like smart pointers.
+///
+/// Shared references are trivially copyable. This wrapper avoids the confusion of otherwise
+/// attempting to clone `DeltaLayerInner` itself.
+pub(crate) struct Ref<T>(T);
+
+impl<'a, T> AsRef<T> for Ref<&'a T> {
+    fn as_ref(&self) -> &T {
+        self.0
+    }
+}
+
+impl<'a, T> Clone for Ref<&'a T> {
+    fn clone(&self) -> Self {
+        *self
+    }
+}
+
+impl<'a, T> Copy for Ref<&'a T> {}
+
+/// A set of data associated with a delta layer key and its value
+pub struct DeltaEntry<T: AsRef<DeltaLayerInner>> {
+    pub key: Key,
+    pub lsn: Lsn,
+    /// Size of the stored value
+    pub size: u64,
+    /// Reference to the on-disk value
+    pub val: ValueRef<T>,
+}
+
 /// Reference to an on-disk value
 pub struct ValueRef<T: AsRef<DeltaLayerInner>> {
    blob_ref: BlobRef,
@@ -1027,9 +1031,9 @@ pub struct ValueRef<T: AsRef<DeltaLayerInner>> {

 impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
    /// Loads the value from disk
-    pub fn load(&self) -> Result<Value> {
+    pub async fn load(&self) -> Result<Value> {
        // theoretically we *could* record an access time for each, but it does not really matter
-        let buf = self.reader.read_blob(self.blob_ref.pos())?;
+        let buf = self.reader.read_blob(self.blob_ref.pos()).await?;
        let val = Value::des(&buf)?;
        Ok(val)
    }
@@ -1038,9 +1042,7 @@ impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
 struct Adapter<T: AsRef<DeltaLayerInner>>(T);

 impl<T: AsRef<DeltaLayerInner>> BlockReader for Adapter<T> {
-    type BlockLease = PageReadGuard<'static>;
-
-    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        self.0.as_ref().file.read_blk(blknum)
    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -323,8 +323,7 @@ impl ImageLayer {
        access_kind: LayerAccessKind,
        ctx: &RequestContext,
    ) -> Result<&ImageLayerInner> {
-        self.access_stats
-            .record_access(access_kind, ctx.task_kind());
+        self.access_stats.record_access(access_kind, ctx);
        self.inner
            .get_or_try_init(|| self.load_inner())
            .await
@@ -471,6 +470,7 @@ impl ImageLayerInner {
            let blob = file
                .block_cursor()
                .read_blob(offset)
+                .await
                .with_context(|| format!("failed to read value from offset {}", offset))?;
            let value = Bytes::from(blob);

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -16,6 +16,7 @@ use anyhow::{ensure, Result};
 use pageserver_api::models::InMemoryLayerInfo;
 use std::cell::RefCell;
 use std::collections::HashMap;
+use std::sync::OnceLock;
 use tracing::*;
 use utils::{
    bin_ser::BeSer,
@@ -27,7 +28,7 @@ use utils::{
 // while being able to use std::fmt::Write's methods
 use std::fmt::Write as _;
 use std::ops::Range;
-use std::sync::RwLock;
+use tokio::sync::RwLock;

 use super::{DeltaLayer, DeltaLayerWriter, Layer};

@@ -42,14 +43,16 @@ pub struct InMemoryLayer {
    tenant_id: TenantId,
    timeline_id: TimelineId,

-    ///
    /// This layer contains all the changes from 'start_lsn'. The
    /// start is inclusive.
-    ///
    start_lsn: Lsn,

-    /// The above fields never change. The parts that do change are in 'inner',
-    /// and protected by mutex.
+    /// Frozen layers have an exclusive end LSN.
+    /// Writes are only allowed when this is `None`.
+    end_lsn: OnceLock<Lsn>,
+
+    /// The above fields never change, except for `end_lsn`, which is only set once.
+    /// All other changing parts are in `inner`, and protected by an `RwLock`.
    inner: RwLock<InMemoryLayerInner>,
 }

@@ -57,21 +60,16 @@ impl std::fmt::Debug for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("InMemoryLayer")
            .field("start_lsn", &self.start_lsn)
+            .field("end_lsn", &self.end_lsn)
            .field("inner", &self.inner)
            .finish()
    }
 }

 pub struct InMemoryLayerInner {
-    /// Frozen layers have an exclusive end LSN.
-    /// Writes are only allowed when this is None
-    end_lsn: Option<Lsn>,
-
-    ///
    /// All versions of all pages in the layer are kept here.  Indexed
    /// by block number and LSN. The value is an offset into the
    /// ephemeral file where the page version is stored.
-    ///
    index: HashMap<Key, VecMap<Lsn, u64>>,

    /// The values are stored in a serialized format in this file.
@@ -82,15 +80,7 @@ pub struct InMemoryLayerInner {

 impl std::fmt::Debug for InMemoryLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("InMemoryLayerInner")
-            .field("end_lsn", &self.end_lsn)
-            .finish()
-    }
-}
-
-impl InMemoryLayerInner {
-    fn assert_writeable(&self) {
-        assert!(self.end_lsn.is_none());
+        f.debug_struct("InMemoryLayerInner").finish()
    }
 }

@@ -101,13 +91,21 @@ impl InMemoryLayer {

    pub fn info(&self) -> InMemoryLayerInfo {
        let lsn_start = self.start_lsn;
-        let lsn_end = self.inner.read().unwrap().end_lsn;

-        match lsn_end {
-            Some(lsn_end) => InMemoryLayerInfo::Frozen { lsn_start, lsn_end },
-            None => InMemoryLayerInfo::Open { lsn_start },
+        if let Some(&lsn_end) = self.end_lsn.get() {
+            InMemoryLayerInfo::Frozen { lsn_start, lsn_end }
+        } else {
+            InMemoryLayerInfo::Open { lsn_start }
        }
    }
+
+    fn assert_writable(&self) {
+        assert!(self.end_lsn.get().is_none());
+    }
+
+    fn end_lsn_or_max(&self) -> Lsn {
+        self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
+    }
 }

 #[async_trait::async_trait]
@@ -117,14 +115,7 @@ impl Layer for InMemoryLayer {
    }

    fn get_lsn_range(&self) -> Range<Lsn> {
-        let inner = self.inner.read().unwrap();
-
-        let end_lsn = if let Some(end_lsn) = inner.end_lsn {
-            end_lsn
-        } else {
-            Lsn(u64::MAX)
-        };
-        self.start_lsn..end_lsn
+        self.start_lsn..self.end_lsn_or_max()
    }

    fn is_incremental(&self) -> bool {
@@ -134,13 +125,9 @@ impl Layer for InMemoryLayer {

    /// debugging function to print out the contents of the layer
    async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
-        let inner = self.inner.read().unwrap();
+        let inner = self.inner.read().await;

-        let end_str = inner
-            .end_lsn
-            .as_ref()
-            .map(Lsn::to_string)
-            .unwrap_or_default();
+        let end_str = self.end_lsn_or_max();

        println!(
            "----- in-memory layer for tli {} LSNs {}-{} ----",
@@ -156,7 +143,7 @@ impl Layer for InMemoryLayer {
        for (key, vec_map) in inner.index.iter() {
            for (lsn, pos) in vec_map.as_slice() {
                let mut desc = String::new();
-                cursor.read_blob_into_buf(*pos, &mut buf)?;
+                cursor.read_blob_into_buf(*pos, &mut buf).await?;
                let val = Value::des(&buf);
                match val {
                    Ok(Value::Image(img)) => {
@@ -194,7 +181,7 @@ impl Layer for InMemoryLayer {
        ensure!(lsn_range.start >= self.start_lsn);
        let mut need_image = true;

-        let inner = self.inner.read().unwrap();
+        let inner = self.inner.read().await;

        let reader = inner.file.block_cursor();

@@ -202,7 +189,7 @@ impl Layer for InMemoryLayer {
        if let Some(vec_map) = inner.index.get(&key) {
            let slice = vec_map.slice_range(lsn_range);
            for (entry_lsn, pos) in slice.iter().rev() {
-                let buf = reader.read_blob(*pos)?;
+                let buf = reader.read_blob(*pos).await?;
                let value = Value::des(&buf)?;
                match value {
                    Value::Image(img) => {
@@ -236,20 +223,18 @@ impl Layer for InMemoryLayer {

 impl std::fmt::Display for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let inner = self.inner.read().unwrap();
-
-        let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
+        let end_lsn = self.end_lsn_or_max();
        write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
    }
 }

 impl InMemoryLayer {
    ///
-    /// Get layer size on the disk
+    /// Get layer size.
    ///
-    pub fn size(&self) -> Result<u64> {
-        let inner = self.inner.read().unwrap();
-        Ok(inner.file.size)
+    pub async fn size(&self) -> Result<u64> {
+        let inner = self.inner.read().await;
+        Ok(inner.file.size())
    }

    ///
@@ -270,8 +255,8 @@ impl InMemoryLayer {
            timeline_id,
            tenant_id,
            start_lsn,
+            end_lsn: OnceLock::new(),
            inner: RwLock::new(InMemoryLayerInner {
-                end_lsn: None,
                index: HashMap::new(),
                file,
            }),
@@ -282,10 +267,10 @@ impl InMemoryLayer {

    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
-    pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
+    pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
-        let mut inner = self.inner.write().unwrap();
-        inner.assert_writeable();
+        let mut inner = self.inner.write().await;
+        self.assert_writable();

        let off = {
            SER_BUFFER.with(|x| -> Result<_> {
@@ -316,11 +301,11 @@ impl InMemoryLayer {
    /// Make the layer non-writeable. Only call once.
    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is exclusive
-    pub fn freeze(&self, end_lsn: Lsn) {
-        let mut inner = self.inner.write().unwrap();
+    pub async fn freeze(&self, end_lsn: Lsn) {
+        let inner = self.inner.write().await;

        assert!(self.start_lsn < end_lsn);
-        inner.end_lsn = Some(end_lsn);
+        self.end_lsn.set(end_lsn).expect("end_lsn set only once");

        for vec_map in inner.index.values() {
            for (lsn, _pos) in vec_map.as_slice() {
@@ -332,7 +317,7 @@ impl InMemoryLayer {
    /// Write this frozen in-memory layer to disk.
    ///
    /// Returns a new delta layer with all the same data as this in-memory layer
-    pub fn write_to_disk(&self) -> Result<DeltaLayer> {
+    pub async fn write_to_disk(&self) -> Result<DeltaLayer> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
        // write lock on it, so we shouldn't block anyone. There's one exception
@@ -342,14 +327,16 @@ impl InMemoryLayer {
        // lock, it will see that it's not writeable anymore and retry, but it
        // would have to wait until we release it. That race condition is very
        // rare though, so we just accept the potential latency hit for now.
-        let inner = self.inner.read().unwrap();
+        let inner = self.inner.read().await;
+
+        let end_lsn = *self.end_lsn.get().unwrap();

        let mut delta_layer_writer = DeltaLayerWriter::new(
            self.conf,
            self.timeline_id,
            self.tenant_id,
            Key::MIN,
-            self.start_lsn..inner.end_lsn.unwrap(),
+            self.start_lsn..end_lsn,
        )?;

        let mut buf = Vec::new();
@@ -363,7 +350,7 @@ impl InMemoryLayer {
            let key = **key;
            // Write all page versions
            for (lsn, pos) in vec_map.as_slice() {
-                cursor.read_blob_into_buf(*pos, &mut buf)?;
+                cursor.read_blob_into_buf(*pos, &mut buf).await?;
                let will_init = Value::des(&buf)?.will_init();
                delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?;
            }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -35,8 +35,11 @@ use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};

-use crate::context::{DownloadBehavior, RequestContext};
+use crate::context::{
+    AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
+};
 use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
+use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::{
    DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer,
    LayerAccessStats, LayerFileName, RemoteLayer,
@@ -137,6 +140,12 @@ fn drop_rlock<T>(rlock: tokio::sync::OwnedRwLockReadGuard<T>) {
 fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
    drop(rlock)
 }
+
+/// The outward-facing resources required to build a Timeline
+pub struct TimelineResources {
+    pub remote_client: Option<RemoteTimelineClient>,
+}
+
 pub struct Timeline {
    conf: &'static PageServerConf,
    tenant_conf: Arc<RwLock<TenantConfOpt>>,
@@ -799,10 +808,15 @@ impl Timeline {
            .await
        {
            Ok((partitioning, lsn)) => {
+                // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
+                let image_ctx = RequestContextBuilder::extend(ctx)
+                    .access_stats_behavior(AccessStatsBehavior::Skip)
+                    .build();
+
                // 2. Create new image layers for partitions that have been modified
                // "enough".
                let layer_paths_to_upload = self
-                    .create_image_layers(&partitioning, lsn, false, ctx)
+                    .create_image_layers(&partitioning, lsn, false, &image_ctx)
                    .await
                    .map_err(anyhow::Error::from)?;
                if let Some(remote_client) = &self.remote_client {
@@ -875,7 +889,7 @@ impl Timeline {
            let Some(open_layer) = layers.open_layer.as_ref() else {
                return Ok(());
            };
-            open_layer.size()?
+            open_layer.size().await?
        };
        let last_freeze_at = self.last_freeze_at.load();
        let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
@@ -919,7 +933,7 @@ impl Timeline {
    pub fn set_state(&self, new_state: TimelineState) {
        match (self.current_state(), new_state) {
            (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
-                warn!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
+                info!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
            }
            (st, TimelineState::Loading) => {
                error!("ignoring transition from {st:?} into Loading state");
@@ -1366,7 +1380,7 @@ impl Timeline {
        timeline_id: TimelineId,
        tenant_id: TenantId,
        walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
-        remote_client: Option<RemoteTimelineClient>,
+        resources: TimelineResources,
        pg_version: u32,
        initial_logical_size_can_start: Option<completion::Barrier>,
        initial_logical_size_attempt: Option<completion::Completion>,
@@ -1401,7 +1415,7 @@ impl Timeline {
                walredo_mgr,
                walreceiver: Mutex::new(None),

-                remote_client: remote_client.map(Arc::new),
+                remote_client: resources.remote_client.map(Arc::new),

                // initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
                last_record_lsn: SeqWait::new(RecordLsn {
@@ -1722,7 +1736,7 @@ impl Timeline {

        let mut corrupted_local_layers = Vec::new();
        let mut added_remote_layers = Vec::new();
-        for remote_layer_name in &index_part.timeline_layers {
+        for remote_layer_name in index_part.layer_metadata.keys() {
            let local_layer = local_only_layers.remove(remote_layer_name);

            let remote_layer_metadata = index_part
@@ -1882,7 +1896,7 @@ impl Timeline {
            Some(index_part) => {
                info!(
                    "initializing upload queue from remote index with {} layer files",
-                    index_part.timeline_layers.len()
+                    index_part.layer_metadata.len()
                );
                remote_client.init_upload_queue(index_part)?;
                self.create_remote_layers(index_part, local_layers, disk_consistent_lsn)
@@ -2647,7 +2661,7 @@ impl Timeline {
    async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> {
        //info!("PUT: key {} at {}", key, lsn);
        let layer = self.get_layer_for_write(lsn).await?;
-        layer.put_value(key, lsn, val)?;
+        layer.put_value(key, lsn, val).await?;
        Ok(())
    }

@@ -2673,7 +2687,9 @@ impl Timeline {
            Some(self.write_lock.lock().await)
        };
        let mut guard = self.layers.write().await;
-        guard.try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at);
+        guard
+            .try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at)
+            .await;
    }

    /// Layer flusher task's main loop.
@@ -2955,7 +2971,11 @@ impl Timeline {
            let frozen_layer = Arc::clone(frozen_layer);
            move || {
                // Write it out
-                let new_delta = frozen_layer.write_to_disk()?;
+                // Keep this inside `spawn_blocking` and `Handle::current`
+                // as long as the write path is still sync and the read impl
+                // is still not fully async. Otherwise executor threads would
+                // be blocked.
+                let new_delta = Handle::current().block_on(frozen_layer.write_to_disk())?;
                let new_delta_path = new_delta.path();

                // Sync it to disk.
@@ -3299,10 +3319,10 @@ struct CompactLevel0Phase1StatsBuilder {
    timeline_id: Option<TimelineId>,
    read_lock_acquisition_micros: DurationRecorder,
    read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
+    read_lock_held_key_sort_micros: DurationRecorder,
    read_lock_held_prerequisites_micros: DurationRecorder,
    read_lock_held_compute_holes_micros: DurationRecorder,
    read_lock_drop_micros: DurationRecorder,
-    prepare_iterators_micros: DurationRecorder,
    write_layer_files_micros: DurationRecorder,
    level0_deltas_count: Option<usize>,
    new_deltas_count: Option<usize>,
@@ -3319,10 +3339,10 @@ struct CompactLevel0Phase1Stats {
    timeline_id: TimelineId,
    read_lock_acquisition_micros: RecordedDuration,
    read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
+    read_lock_held_key_sort_micros: RecordedDuration,
    read_lock_held_prerequisites_micros: RecordedDuration,
    read_lock_held_compute_holes_micros: RecordedDuration,
    read_lock_drop_micros: RecordedDuration,
-    prepare_iterators_micros: RecordedDuration,
    write_layer_files_micros: RecordedDuration,
    level0_deltas_count: usize,
    new_deltas_count: usize,
@@ -3349,6 +3369,10 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
                .read_lock_held_spawn_blocking_startup_micros
                .into_recorded()
                .ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?,
+            read_lock_held_key_sort_micros: value
+                .read_lock_held_key_sort_micros
+                .into_recorded()
+                .ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?,
            read_lock_held_prerequisites_micros: value
                .read_lock_held_prerequisites_micros
                .into_recorded()
@@ -3361,10 +3385,6 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
                .read_lock_drop_micros
                .into_recorded()
                .ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?,
-            prepare_iterators_micros: value
-                .prepare_iterators_micros
-                .into_recorded()
-                .ok_or_else(|| anyhow!("prepare_iterators_micros not set"))?,
            write_layer_files_micros: value
                .write_layer_files_micros
                .into_recorded()
@@ -3534,28 +3554,24 @@ impl Timeline {
        let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
        let mut prev: Option<Key> = None;

-        let mut all_value_refs = Vec::new();
        let mut all_keys = Vec::new();

-        for l in deltas_to_compact.iter() {
+        let downcast_deltas: Vec<_> = deltas_to_compact
+            .iter()
+            .map(|l| l.clone().downcast_delta_layer().expect("delta layer"))
+            .collect();
+        for dl in downcast_deltas.iter() {
            // TODO: replace this with an await once we fully go async
-            let delta = l.clone().downcast_delta_layer().expect("delta layer");
-            Handle::current().block_on(async {
-                all_value_refs.extend(delta.load_val_refs(ctx).await?);
-                all_keys.extend(delta.load_keys(ctx).await?);
-                anyhow::Ok(())
-            })?;
+            all_keys.extend(Handle::current().block_on(DeltaLayer::load_keys(dl, ctx))?);
        }

        // The current stdlib sorting implementation is designed in a way where it is
        // particularly fast where the slice is made up of sorted sub-ranges.
-        all_value_refs.sort_by_key(|(key, lsn, _value_ref)| (*key, *lsn));
+        all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));

-        // The current stdlib sorting implementation is designed in a way where it is
-        // particularly fast where the slice is made up of sorted sub-ranges.
-        all_keys.sort_by_key(|(key, lsn, _size)| (*key, *lsn));
+        stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now();

-        for (next_key, _next_lsn, _size) in all_keys.iter() {
+        for DeltaEntry { key: next_key, .. } in all_keys.iter() {
            let next_key = *next_key;
            if let Some(prev_key) = prev {
                // just first fast filter
@@ -3579,8 +3595,7 @@ impl Timeline {
            }
            prev = Some(next_key.next());
        }
-        stats.read_lock_held_compute_holes_micros =
-            stats.read_lock_held_prerequisites_micros.till_now();
+        stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
        drop_rlock(guard);
        stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
        let mut holes = heap.into_vec();
@@ -3589,12 +3604,26 @@ impl Timeline {

        // This iterator walks through all key-value pairs from all the layers
        // we're compacting, in key, LSN order.
-        let all_values_iter = all_value_refs.into_iter();
+        let all_values_iter = all_keys.iter();

        // This iterator walks through all keys and is needed to calculate size used by each key
-        let mut all_keys_iter = all_keys.into_iter();
-
-        stats.prepare_iterators_micros = stats.read_lock_drop_micros.till_now();
+        let mut all_keys_iter = all_keys
+            .iter()
+            .map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size))
+            .coalesce(|mut prev, cur| {
+                // Coalesce consecutive entries that share the same key.
+                // This ensures that compaction doesn't put them
+                // into different layer files.
+                // Still limit this by the target file size,
+                // so that we keep the size of the files in
+                // check.
+                if prev.0 == cur.0 && prev.2 < target_file_size {
+                    prev.2 += cur.2;
+                    Ok(prev)
+                } else {
+                    Err((prev, cur))
+                }
+            });

        // Merge the contents of all the input delta layers into a new set
        // of delta layers, based on the current partitioning.
@@ -3646,104 +3675,127 @@ impl Timeline {
        let mut key_values_total_size = 0u64;
        let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
        let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
-        for (key, lsn, value_ref) in all_values_iter {
-            let value = value_ref.load()?;
-            let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
-            // We need to check key boundaries once we reach next key or end of layer with the same key
-            if !same_key || lsn == dup_end_lsn {
-                let mut next_key_size = 0u64;
-                let is_dup_layer = dup_end_lsn.is_valid();
-                dup_start_lsn = Lsn::INVALID;
-                if !same_key {
-                    dup_end_lsn = Lsn::INVALID;
+
+        // TODO remove this block_on wrapper once we fully go async
+        Handle::current().block_on(async {
+            for &DeltaEntry {
+                key, lsn, ref val, ..
+            } in all_values_iter
+            {
+                let value = val.load().await?;
+                let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
+                // We need to check key boundaries once we reach next key or end of layer with the same key
+                if !same_key || lsn == dup_end_lsn {
+                    let mut next_key_size = 0u64;
+                    let is_dup_layer = dup_end_lsn.is_valid();
+                    dup_start_lsn = Lsn::INVALID;
+                    if !same_key {
+                        dup_end_lsn = Lsn::INVALID;
+                    }
+                    // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
+                    for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
+                        next_key_size = next_size;
+                        if key != next_key {
+                            if dup_end_lsn.is_valid() {
+                                // We are writing a segment with duplicates:
+                                // place all remaining values of this key in separate segment
+                                dup_start_lsn = dup_end_lsn; // new segments starts where old stops
+                                dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
+                            }
+                            break;
+                        }
+                        key_values_total_size += next_size;
+                        // Check if it is time to split segment: if total keys size is larger than target file size.
+                        // We need to avoid generation of empty segments if next_size > target_file_size.
+                        if key_values_total_size > target_file_size && lsn != next_lsn {
+                            // Split key between multiple layers: such layer can contain only single key
+                            dup_start_lsn = if dup_end_lsn.is_valid() {
+                                dup_end_lsn // new segment with duplicates starts where old one stops
+                            } else {
+                                lsn // start with the first LSN for this key
+                            };
+                            dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
+                            break;
+                        }
+                    }
+                    // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
+                    if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
+                        dup_start_lsn = dup_end_lsn;
+                        dup_end_lsn = lsn_range.end;
+                    }
+                    if writer.is_some() {
+                        let written_size = writer.as_mut().unwrap().size();
+                        let contains_hole =
+                            next_hole < holes.len() && key >= holes[next_hole].key_range.end;
+                        // check if key cause layer overflow or contains hole...
+                        if is_dup_layer
+                            || dup_end_lsn.is_valid()
+                            || written_size + key_values_total_size > target_file_size
+                            || contains_hole
+                        {
+                            // ... if so, flush previous layer and prepare to write new one
+                            new_layers.push(Arc::new(
+                                writer.take().unwrap().finish(prev_key.unwrap().next())?,
+                            ));
+                            writer = None;
+
+                            if contains_hole {
+                                // skip hole
+                                next_hole += 1;
+                            }
+                        }
+                    }
+                    // Remember size of key value because at next iteration we will access next item
+                    key_values_total_size = next_key_size;
                }
-                // Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
-                for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
-                    next_key_size = next_size;
-                    if key != next_key {
+                if writer.is_none() {
+                    // Create writer if not initialized yet
+                    writer = Some(DeltaLayerWriter::new(
+                        self.conf,
+                        self.timeline_id,
+                        self.tenant_id,
+                        key,
                        if dup_end_lsn.is_valid() {
-                            // We are writting segment with duplicates:
-                            // place all remaining values of this key in separate segment
-                            dup_start_lsn = dup_end_lsn; // new segments starts where old stops
-                            dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
-                        }
-                        break;
-                    }
-                    key_values_total_size += next_size;
-                    // Check if it is time to split segment: if total keys size is larger than target file size.
-                    // We need to avoid generation of empty segments if next_size > target_file_size.
-                    if key_values_total_size > target_file_size && lsn != next_lsn {
-                        // Split key between multiple layers: such layer can contain only single key
-                        dup_start_lsn = if dup_end_lsn.is_valid() {
-                            dup_end_lsn // new segment with duplicates starts where old one stops
+                            // this is a layer containing slice of values of the same key
+                            debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
+                            dup_start_lsn..dup_end_lsn
                        } else {
-                            lsn // start with the first LSN for this key
-                        };
-                        dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
-                        break;
-                    }
+                            debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
+                            lsn_range.clone()
+                        },
+                    )?);
                }
-                // handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
-                if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
-                    dup_start_lsn = dup_end_lsn;
-                    dup_end_lsn = lsn_range.end;
-                }
-                if writer.is_some() {
-                    let written_size = writer.as_mut().unwrap().size();
-                    let contains_hole =
-                        next_hole < holes.len() && key >= holes[next_hole].key_range.end;
-                    // check if key cause layer overflow or contains hole...
-                    if is_dup_layer
-                        || dup_end_lsn.is_valid()
-                        || written_size + key_values_total_size > target_file_size
-                        || contains_hole
-                    {
-                        // ... if so, flush previous layer and prepare to write new one
-                        new_layers.push(Arc::new(
-                            writer.take().unwrap().finish(prev_key.unwrap().next())?,
-                        ));
-                        writer = None;

-                        if contains_hole {
-                            // skip hole
-                            next_hole += 1;
-                        }
-                    }
-                }
-                // Remember size of key value because at next iteration we will access next item
-                key_values_total_size = next_key_size;
+                fail_point!("delta-layer-writer-fail-before-finish", |_| {
+                    Result::<_>::Err(anyhow::anyhow!(
+                        "failpoint delta-layer-writer-fail-before-finish"
+                    ))
+                });
+
+                writer.as_mut().unwrap().put_value(key, lsn, value)?;
+                prev_key = Some(key);
            }
-            if writer.is_none() {
-                // Create writer if not initiaized yet
-                writer = Some(DeltaLayerWriter::new(
-                    self.conf,
-                    self.timeline_id,
-                    self.tenant_id,
-                    key,
-                    if dup_end_lsn.is_valid() {
-                        // this is a layer containing slice of values of the same key
-                        debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
-                        dup_start_lsn..dup_end_lsn
-                    } else {
-                        debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
-                        lsn_range.clone()
-                    },
-                )?);
-            }
-
-            fail_point!("delta-layer-writer-fail-before-finish", |_| {
-                Err(anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into())
-            });
-
-            writer.as_mut().unwrap().put_value(key, lsn, value)?;
-            prev_key = Some(key);
-        }
+            Ok(())
+        })?;
        if let Some(writer) = writer {
            new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
        }

        // Sync layers
        if !new_layers.is_empty() {
+            // Print a warning if the created layer is larger than double the target size
+            // Add two pages for potential overhead. This should in theory be already
+            // accounted for in the target calculation, but for very small targets,
+            // we still might easily hit the limit otherwise.
+            let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2;
+            for layer in new_layers.iter() {
+                if layer.desc.file_size > warn_limit {
+                    warn!(
+                        %layer,
+                        "created delta file of size {} larger than double of target of {target_file_size}", layer.desc.file_size
+                    );
+                }
+            }
            let mut layer_paths: Vec<PathBuf> = new_layers.iter().map(|l| l.path()).collect();

            // Fsync all the layer files and directory using multiple threads to
@@ -3756,12 +3808,10 @@ impl Timeline {
            layer_paths.pop().unwrap();
        }

-        stats.write_layer_files_micros = stats.prepare_iterators_micros.till_now();
+        stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
        stats.new_deltas_count = Some(new_layers.len());
        stats.new_deltas_size = Some(new_layers.iter().map(|l| l.desc.file_size).sum());

-        drop(all_keys_iter); // So that deltas_to_compact is no longer borrowed
-
        match TryInto::<CompactLevel0Phase1Stats>::try_into(stats)
            .and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string"))
        {
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -25,7 +25,7 @@ use crate::{
    InitializationOrder,
 };

-use super::Timeline;
+use super::{Timeline, TimelineResources};

 /// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
 async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
@@ -219,27 +219,13 @@ async fn delete_local_layer_files(
            }
        };

-        let r = if metadata.is_dir() {
-            // There shouldnt be any directories inside timeline dir as of current layout.
+        if metadata.is_dir() {
+            warn!(path=%entry.path().display(), "unexpected directory under timeline dir");
            tokio::fs::remove_dir(entry.path()).await
        } else {
            tokio::fs::remove_file(entry.path()).await
-        };
-
-        if let Err(e) = r {
-            if e.kind() == std::io::ErrorKind::NotFound {
-                warn!(
-                    timeline_dir=?local_timeline_directory,
-                    path=?entry.path().display(),
-                    "got not found err while removing timeline dir, proceeding anyway"
-                );
-                continue;
-            }
-            anyhow::bail!(anyhow::anyhow!(
-                "Failed to remove: {}. Error: {e}",
-                entry.path().display()
-            ));
        }
+        .with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
    }

    info!("finished deleting layer files, releasing layer_removal_cs.lock()");
@@ -293,6 +279,17 @@ async fn cleanup_remaining_timeline_fs_traces(
        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
    });

+    // Make sure previous deletions are ordered before the mark removal.
+    // Otherwise there is no guarantee that they reach the disk before the mark deletion.
+    // So it's possible for the mark removal to reach disk first and for other deletions
+    // to be reordered later and thus missed if a crash occurs.
+    // Note that we don't need to sync after the mark file is removed
+    // because we can tolerate the case when the mark file reappears on startup.
+    let timeline_path = conf.timelines_path(&tenant_id);
+    crashsafe::fsync_async(timeline_path)
+        .await
+        .context("fsync_pre_mark_remove")?;
+
    // Remove delete mark
    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
        .await
@@ -359,10 +356,11 @@ impl DeleteTimelineFlow {
    // NB: If this fails half-way through, and is retried, the retry will go through
    // all the same steps again. Make sure the code here is idempotent, and don't
    // error out if some of the shutdown tasks have already been completed!
-    #[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))]
+    #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_id))]
    pub async fn run(
        tenant: &Arc<Tenant>,
        timeline_id: TimelineId,
+        inplace: bool,
    ) -> Result<(), DeleteTimelineError> {
        let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;

@@ -380,7 +378,11 @@ impl DeleteTimelineFlow {
            ))?
        });

-        Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        if inplace {
+            Self::background(guard, tenant.conf, tenant, &timeline).await?
+        } else {
+            Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
+        }

        Ok(())
    }
@@ -398,6 +400,8 @@ impl DeleteTimelineFlow {
    }

    /// Shortcut to create Timeline in stopping state and spawn deletion task.
+    /// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
+    #[instrument(skip_all, fields(%timeline_id))]
    pub async fn resume_deletion(
        tenant: Arc<Tenant>,
        timeline_id: TimelineId,
@@ -412,7 +416,7 @@ impl DeleteTimelineFlow {
                timeline_id,
                local_metadata,
                None, // Ancestor is not needed for deletion.
-                remote_client,
+                TimelineResources { remote_client },
                init_order,
                // Important. We dont pass ancestor above because it can be missing.
                // Thus we need to skip the validation here.
@@ -444,11 +448,15 @@ impl DeleteTimelineFlow {
        Ok(())
    }

+    #[instrument(skip_all, fields(%timeline_id))]
    pub async fn cleanup_remaining_timeline_fs_traces(
        tenant: &Tenant,
        timeline_id: TimelineId,
    ) -> anyhow::Result<()> {
-        cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await
+        let r =
+            cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await;
+        info!("Done");
+        r
    }

    fn prepare(
@@ -494,11 +502,17 @@ impl DeleteTimelineFlow {
        // At the end of the operation we're holding the guard and need to lock timelines map
        // to remove the timeline from it.
        // Always if you have two locks that are taken in different order this can result in a deadlock.
-        let delete_lock_guard = DeletionGuard(
-            Arc::clone(&timeline.delete_progress)
-                .try_lock_owned()
-                .map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
-        );
+
+        let delete_progress = Arc::clone(&timeline.delete_progress);
+        let delete_lock_guard = match delete_progress.try_lock_owned() {
+            Ok(guard) => DeletionGuard(guard),
+            Err(_) => {
+                // Unfortunately if lock fails arc is consumed.
+                return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
+                    &timeline.delete_progress,
+                )));
+            }
+        };

        timeline.set_state(TimelineState::Stopping);

@@ -553,10 +567,14 @@ impl DeleteTimelineFlow {

        remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;

-        *guard.0 = Self::Finished;
+        *guard = Self::Finished;

        Ok(())
    }
+
+    pub(crate) fn is_finished(&self) -> bool {
+        matches!(self, Self::Finished)
+    }
 }

 struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -163,7 +163,7 @@ impl LayerManager {
    }

    /// Called from `freeze_inmem_layer`, returns true if successfully frozen.
-    pub fn try_freeze_in_memory_layer(
+    pub async fn try_freeze_in_memory_layer(
        &mut self,
        Lsn(last_record_lsn): Lsn,
        last_freeze_at: &AtomicLsn,
@@ -173,7 +173,7 @@ impl LayerManager {
        if let Some(open_layer) = &self.layer_map.open_layer {
            let open_layer_rc = Arc::clone(open_layer);
            // Does this layer need freezing?
-            open_layer.freeze(end_lsn);
+            open_layer.freeze(end_lsn).await;

            // The layer is no longer open, update the layer map to reflect this.
            // We will replace it with on-disk historics below.
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -31,8 +31,10 @@ use storage_broker::Streaming;
 use tokio::select;
 use tracing::*;

-use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use postgres_connection::{parse_host_port, PgConnectionConfig};
+use utils::backoff::{
+    exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
+};
 use utils::{
    id::{NodeId, TenantTimelineId},
    lsn::Lsn,
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -140,23 +140,12 @@ impl UploadQueue {
            }
        }

-        let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
-        for layer_name in &index_part.timeline_layers {
-            match index_part
-                .layer_metadata
-                .get(layer_name)
-                .map(LayerFileMetadata::from)
-            {
-                Some(layer_metadata) => {
-                    files.insert(layer_name.to_owned(), layer_metadata);
-                }
-                None => {
-                    anyhow::bail!(
-                        "No remote layer metadata found for layer {}",
-                        layer_name.file_name()
-                    );
-                }
-            }
+        let mut files = HashMap::with_capacity(index_part.layer_metadata.len());
+        for (layer_name, layer_metadata) in &index_part.layer_metadata {
+            files.insert(
+                layer_name.to_owned(),
+                LayerFileMetadata::from(layer_metadata),
+            );
        }

        let index_part_metadata = index_part.parse_metadata()?;
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -312,7 +312,7 @@ impl<'a> WalIngest<'a> {
                // particular point in the WAL. For more fine-grained control,
                // we could peek into the message and only pause if it contains
                // a particular string, for example, but this is enough for now.
-                utils::failpoint_sleep_millis_async!("wal-ingest-logical-message-sleep");
+                crate::failpoint_support::sleep_millis_async!("wal-ingest-logical-message-sleep");
            }
        }

--- a/pgxn/neon/libpqwalproposer.c
+++ b/pgxn/neon/libpqwalproposer.c
@@ -74,7 +74,7 @@ walprop_connect_start(char *conninfo, char *password)
 	if (password)
 	{
 		keywords[n] = "password";
-		values[n] = neon_auth_token;
+		values[n] = password;
 		n++;
 	}
 	keywords[n] = "dbname";
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -1393,8 +1393,22 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec
 	char	   *err;
 	WalReceiverConn *wrconn;
 	WalRcvStreamOptions options;
+	char conninfo[MAXCONNINFO];

-	wrconn = walrcv_connect(safekeeper[donor].conninfo, false, "wal_proposer_recovery", &err);
+	if (!neon_auth_token)
+	{
+		memcpy(conninfo, safekeeper[donor].conninfo, MAXCONNINFO);
+	}
+	else
+	{
+		int written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, safekeeper[donor].conninfo);
+
+		/* snprintf returns the untruncated length (sans NUL), so >= size means the string was cut */
+		if (written >= MAXCONNINFO || written < 0)
+			elog(FATAL, "could not append password to the safekeeper connection string");
+	}
+
+	wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err);
 	if (!wrconn)
 	{
 		ereport(WARNING,
--- a/pgxn/neon/walproposer_utils.c
+++ b/pgxn/neon/walproposer_utils.c
@@ -37,68 +37,14 @@ static XLogSegNo walpropSegNo = 0;

 /* START cloned file-local variables and functions from walsender.c */

-/*
- * xlogreader used for replication.  Note that a WAL sender doing physical
- * replication does not need xlogreader to read WAL, but it needs one to
- * keep a state of its work.
- */
-static XLogReaderState *xlogreader = NULL;
-
-/*
- * These variables keep track of the state of the timeline we're currently
- * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric,
- * the timeline is not the latest timeline on this server, and the server's
- * history forked off from that timeline at sendTimeLineValidUpto.
- */
-static TimeLineID sendTimeLine = 0;
-static TimeLineID sendTimeLineNextTLI = 0;
-static bool sendTimeLineIsHistoric = false;
-static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr;
-
-/*
- * Timestamp of last ProcessRepliesIfAny() that saw a reply from the
- * standby. Set to 0 if wal_sender_timeout doesn't need to be active.
- */
-static TimestampTz last_reply_timestamp = 0;
-
-/* Have we sent a heartbeat message asking for reply, since last reply? */
-static bool waiting_for_ping_response = false;
-
-static bool streamingDoneSending;
-static bool streamingDoneReceiving;
-
-/* Are we there yet? */
-static bool WalSndCaughtUp = false;
-
-/* Flags set by signal handlers for later service in main loop */
-static volatile sig_atomic_t got_STOPPING = false;
-
 /*
 * How far have we sent WAL already? This is also advertised in
 * MyWalSnd->sentPtr.  (Actually, this is the next WAL location to send.)
 */
 static XLogRecPtr sentPtr = InvalidXLogRecPtr;

-/*
- * This is set while we are streaming. When not set
- * PROCSIG_WALSND_INIT_STOPPING signal will be handled like SIGTERM. When set,
- * the main loop is responsible for checking got_STOPPING and terminating when
- * it's set (after streaming any remaining WAL).
- */
-static volatile sig_atomic_t replication_active = false;
-
-typedef void (*WalSndSendDataCallback) (void);
-static void WalSndLoop(WalSndSendDataCallback send_data);
-static void XLogSendPhysical(void);
-#if PG_VERSION_NUM >= 150000
-static XLogRecPtr GetStandbyFlushRecPtr(TimeLineID *tli);
-#else
-static XLogRecPtr GetStandbyFlushRecPtr(void);
-#endif
-
-static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
-							  TimeLineID *tli_p);
-
+static void WalSndLoop(void);
+static void XLogBroadcastWalProposer(void);
 /* END cloned file-level variables and functions from walsender.c */

 int
@@ -506,7 +452,7 @@ XLogWalPropClose(XLogRecPtr recptr)
 /* START of cloned functions from walsender.c */

 /*
- * Handle START_REPLICATION command.
+ * Subscribe for new WAL and stream it in the loop to safekeepers.
 *
 * At the moment, this never returns, but an ereport(ERROR) will take us back
 * to the main loop.
@@ -524,18 +470,6 @@ StartProposerReplication(StartReplicationCmd *cmd)
 				 errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION")));
 #endif

-	/* create xlogreader for physical replication */
-	xlogreader =
-		XLogReaderAllocate(wal_segment_size, NULL,
-						   XL_ROUTINE(.segment_open = WalSndSegmentOpen,
-									  .segment_close = wal_segment_close),
-						   NULL);
-
-	if (!xlogreader)
-		ereport(ERROR,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("out of memory")));
-
 	/*
 	 * We assume here that we're logging enough information in the WAL for
 	 * log-shipping, since this is checked in PostmasterMain().
@@ -569,341 +503,61 @@ StartProposerReplication(StartReplicationCmd *cmd)
 	 * we keep this code around to lighten the load for when we need it.
 	 */
 #if PG_VERSION_NUM >= 150000
-	if (am_cascading_walsender)
-	{
-		/* this also updates ThisTimeLineID */
-		FlushPtr = GetStandbyFlushRecPtr(&currTLI);
-	}
-	else
-		FlushPtr = GetFlushRecPtr(&currTLI);
+	FlushPtr = GetFlushRecPtr(&currTLI);
 #else
-	if (am_cascading_walsender)
-	{
-		/* this also updates ThisTimeLineID */
-		FlushPtr = GetStandbyFlushRecPtr();
-	}
-	else
-		FlushPtr = GetFlushRecPtr();
-
+	FlushPtr = GetFlushRecPtr();
 	currTLI = ThisTimeLineID;
 #endif

+	/*
+	 * When we first start replication the standby will be behind the
+	 * primary. For some applications, for example synchronous
+	 * replication, it is important to have a clear state for this initial
+	 * catchup mode, so we can trigger actions when we change streaming
+	 * state later. We may stay in this state for a long time, which is
+	 * exactly why we want to be able to monitor whether or not we are
+	 * still here.
+	 */
+	WalSndSetState(WALSNDSTATE_CATCHUP);

-	if (cmd->timeline != 0)
+	/*
+	 * Don't allow a request to stream from a future point in WAL that
+	 * hasn't been flushed to disk in this server yet.
+	 */
+	if (FlushPtr < cmd->startpoint)
 	{
-		XLogRecPtr	switchpoint;
-
-		sendTimeLine = cmd->timeline;
-		if (sendTimeLine == currTLI)
-		{
-			sendTimeLineIsHistoric = false;
-			sendTimeLineValidUpto = InvalidXLogRecPtr;
-		}
-		else
-		{
-			List	   *timeLineHistory;
-
-			sendTimeLineIsHistoric = true;
-
-			/*
-			 * Check that the timeline the client requested exists, and the
-			 * requested start location is on that timeline.
-			 */
-			timeLineHistory = readTimeLineHistory(currTLI);
-			switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory,
-										 &sendTimeLineNextTLI);
-			list_free_deep(timeLineHistory);
-
-			/*
-			 * Found the requested timeline in the history. Check that
-			 * requested startpoint is on that timeline in our history.
-			 *
-			 * This is quite loose on purpose. We only check that we didn't
-			 * fork off the requested timeline before the switchpoint. We
-			 * don't check that we switched *to* it before the requested
-			 * starting point. This is because the client can legitimately
-			 * request to start replication from the beginning of the WAL
-			 * segment that contains switchpoint, but on the new timeline, so
-			 * that it doesn't end up with a partial segment. If you ask for
-			 * too old a starting point, you'll get an error later when we
-			 * fail to find the requested WAL segment in pg_wal.
-			 *
-			 * XXX: we could be more strict here and only allow a startpoint
-			 * that's older than the switchpoint, if it's still in the same
-			 * WAL segment.
-			 */
-			if (!XLogRecPtrIsInvalid(switchpoint) &&
-				switchpoint < cmd->startpoint)
-			{
-				ereport(ERROR,
-						(errmsg("requested starting point %X/%X on timeline %u is not in this server's history",
-								LSN_FORMAT_ARGS(cmd->startpoint),
-								cmd->timeline),
-						 errdetail("This server's history forked from timeline %u at %X/%X.",
-								   cmd->timeline,
-								   LSN_FORMAT_ARGS(switchpoint))));
-			}
-			sendTimeLineValidUpto = switchpoint;
-		}
-	}
-	else
-	{
-		sendTimeLine = currTLI;
-		sendTimeLineValidUpto = InvalidXLogRecPtr;
-		sendTimeLineIsHistoric = false;
+		ereport(ERROR,
+				(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
+						LSN_FORMAT_ARGS(cmd->startpoint),
+						LSN_FORMAT_ARGS(FlushPtr))));
 	}

-	streamingDoneSending = streamingDoneReceiving = false;
+	/* Start streaming from the requested point */
+	sentPtr = cmd->startpoint;

-	/* If there is nothing to stream, don't even enter COPY mode */
-	if (!sendTimeLineIsHistoric || cmd->startpoint < sendTimeLineValidUpto)
-	{
-		/*
-		 * When we first start replication the standby will be behind the
-		 * primary. For some applications, for example synchronous
-		 * replication, it is important to have a clear state for this initial
-		 * catchup mode, so we can trigger actions when we change streaming
-		 * state later. We may stay in this state for a long time, which is
-		 * exactly why we want to be able to monitor whether or not we are
-		 * still here.
-		 */
-		WalSndSetState(WALSNDSTATE_CATCHUP);
+	/* Initialize shared memory status, too */
+	SpinLockAcquire(&MyWalSnd->mutex);
+	MyWalSnd->sentPtr = sentPtr;
+	SpinLockRelease(&MyWalSnd->mutex);

-		/*
-		 * Don't allow a request to stream from a future point in WAL that
-		 * hasn't been flushed to disk in this server yet.
-		 */
-		if (FlushPtr < cmd->startpoint)
-		{
-			ereport(ERROR,
-					(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
-							LSN_FORMAT_ARGS(cmd->startpoint),
-							LSN_FORMAT_ARGS(FlushPtr))));
-		}
+	SyncRepInitConfig();

-		/* Start streaming from the requested point */
-		sentPtr = cmd->startpoint;
+	/* Infinite send loop, never returns */
+	WalSndLoop();

-		/* Initialize shared memory status, too */
-		SpinLockAcquire(&MyWalSnd->mutex);
-		MyWalSnd->sentPtr = sentPtr;
-		SpinLockRelease(&MyWalSnd->mutex);
-
-		SyncRepInitConfig();
-
-		/* Main loop of walsender */
-		replication_active = true;
-
-		WalSndLoop(XLogSendPhysical);
-
-		replication_active = false;
-		if (got_STOPPING)
-			proc_exit(0);
-		WalSndSetState(WALSNDSTATE_STARTUP);
-
-		Assert(streamingDoneSending && streamingDoneReceiving);
-	}
+	WalSndSetState(WALSNDSTATE_STARTUP);

 	if (cmd->slotname)
 		ReplicationSlotRelease();
-
-	/*
-	 * Copy is finished now. Send a single-row result set indicating the next
-	 * timeline.
-	 */
-	if (sendTimeLineIsHistoric)
-	{
-		char		startpos_str[8 + 1 + 8 + 1];
-		DestReceiver *dest;
-		TupOutputState *tstate;
-		TupleDesc	tupdesc;
-		Datum		values[2];
-		bool		nulls[2];
-
-		snprintf(startpos_str, sizeof(startpos_str), "%X/%X",
-				 LSN_FORMAT_ARGS(sendTimeLineValidUpto));
-
-		dest = CreateDestReceiver(DestRemoteSimple);
-		MemSet(nulls, false, sizeof(nulls));
-
-		/*
-		 * Need a tuple descriptor representing two columns. int8 may seem
-		 * like a surprising data type for this, but in theory int4 would not
-		 * be wide enough for this, as TimeLineID is unsigned.
-		 */
-		tupdesc = CreateTemplateTupleDesc(2);
-		TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "next_tli",
-								  INT8OID, -1, 0);
-		TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "next_tli_startpos",
-								  TEXTOID, -1, 0);
-
-		/* prepare for projection of tuple */
-		tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual);
-
-		values[0] = Int64GetDatum((int64) sendTimeLineNextTLI);
-		values[1] = CStringGetTextDatum(startpos_str);
-
-		/* send it to dest */
-		do_tup_output(tstate, values, nulls);
-
-		end_tup_output(tstate);
-	}
-
-	/* Send CommandComplete message */
-	EndReplicationCommand("START_STREAMING");
 }

-#if PG_VERSION_NUM >= 150000
-static XLogRecPtr
-GetStandbyFlushRecPtr(TimeLineID *tli)
-{
-	XLogRecPtr	replayPtr;
-	TimeLineID	replayTLI;
-	XLogRecPtr	receivePtr;
-	TimeLineID	receiveTLI;
-	XLogRecPtr	result;
-
-	/*
-	 * We can safely send what's already been replayed. Also, if walreceiver
-	 * is streaming WAL from the same timeline, we can send anything that it
-	 * has streamed, but hasn't been replayed yet.
-	 */
-
-	receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI);
-	replayPtr = GetXLogReplayRecPtr(&replayTLI);
-
-	*tli = replayTLI;
-
-	result = replayPtr;
-	if (receiveTLI == replayTLI && receivePtr > replayPtr)
-		result = receivePtr;
-
-	return result;
-}
-#else
 /*
- * Returns the latest point in WAL that has been safely flushed to disk, and
- * can be sent to the standby. This should only be called when in recovery,
- * ie. we're streaming to a cascaded standby.
- *
- * As a side-effect, ThisTimeLineID is updated to the TLI of the last
- * replayed WAL record.
+ * Main loop that waits for LSN updates and calls the walproposer.
+ * Synchronous replication sets the latch in WalSndWakeup() in walsender.c.
 */
-static XLogRecPtr
-GetStandbyFlushRecPtr(void)
-{
-	XLogRecPtr	replayPtr;
-	TimeLineID	replayTLI;
-	XLogRecPtr	receivePtr;
-	TimeLineID	receiveTLI;
-	XLogRecPtr	result;
-
-	/*
-	 * We can safely send what's already been replayed. Also, if walreceiver
-	 * is streaming WAL from the same timeline, we can send anything that it
-	 * has streamed, but hasn't been replayed yet.
-	 */
-
-	receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI);
-	replayPtr = GetXLogReplayRecPtr(&replayTLI);
-
-	ThisTimeLineID = replayTLI;
-
-	result = replayPtr;
-	if (receiveTLI == ThisTimeLineID && receivePtr > replayPtr)
-		result = receivePtr;
-
-	return result;
-}
-#endif
-
-
-
-/* XLogReaderRoutine->segment_open callback */
 static void
-WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
-				  TimeLineID *tli_p)
+WalSndLoop(void)
 {
-	char		path[MAXPGPATH];
-
-	/*-------
-	 * When reading from a historic timeline, and there is a timeline switch
-	 * within this segment, read from the WAL segment belonging to the new
-	 * timeline.
-	 *
-	 * For example, imagine that this server is currently on timeline 5, and
-	 * we're streaming timeline 4. The switch from timeline 4 to 5 happened at
-	 * 0/13002088. In pg_wal, we have these files:
-	 *
-	 * ...
-	 * 000000040000000000000012
-	 * 000000040000000000000013
-	 * 000000050000000000000013
-	 * 000000050000000000000014
-	 * ...
-	 *
-	 * In this situation, when requested to send the WAL from segment 0x13, on
-	 * timeline 4, we read the WAL from file 000000050000000000000013. Archive
-	 * recovery prefers files from newer timelines, so if the segment was
-	 * restored from the archive on this server, the file belonging to the old
-	 * timeline, 000000040000000000000013, might not exist. Their contents are
-	 * equal up to the switchpoint, because at a timeline switch, the used
-	 * portion of the old segment is copied to the new file.  -------
-	 */
-	*tli_p = sendTimeLine;
-	if (sendTimeLineIsHistoric)
-	{
-		XLogSegNo	endSegNo;
-
-		XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize);
-		if (nextSegNo == endSegNo)
-			*tli_p = sendTimeLineNextTLI;
-	}
-
-	XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize);
-	state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
-	if (state->seg.ws_file >= 0)
-		return;
-
-	/*
-	 * If the file is not found, assume it's because the standby asked for a
-	 * too old WAL segment that has already been removed or recycled.
-	 */
-	if (errno == ENOENT)
-	{
-		char		xlogfname[MAXFNAMELEN];
-		int			save_errno = errno;
-
-		XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size);
-		errno = save_errno;
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("requested WAL segment %s has already been removed",
-						xlogfname)));
-	}
-	else
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not open file \"%s\": %m",
-						path)));
-}
-
-
-/* Main loop of walsender process that streams the WAL over Copy messages. */
-static void
-WalSndLoop(WalSndSendDataCallback send_data)
-{
-	/*
-	 * Initialize the last reply timestamp. That enables timeout processing
-	 * from hereon.
-	 */
-	last_reply_timestamp = GetCurrentTimestamp();
-	waiting_for_ping_response = false;
-
-	/*
-	 * Loop until we reach the end of this timeline or the client requests to
-	 * stop streaming.
-	 */
 	for (;;)
 	{
 		/* Clear any already-pending wakeups */
@@ -911,153 +565,41 @@ WalSndLoop(WalSndSendDataCallback send_data)

 		CHECK_FOR_INTERRUPTS();

-		/* Process any requests or signals received recently */
-		if (ConfigReloadPending)
-		{
-			ConfigReloadPending = false;
-			ProcessConfigFile(PGC_SIGHUP);
-			SyncRepInitConfig();
-		}
+		XLogBroadcastWalProposer();

-		/* always true */
-		if (am_wal_proposer)
-		{
-			send_data();
-			if (WalSndCaughtUp)
-			{
-				if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
-					WalSndSetState(WALSNDSTATE_STREAMING);
-				WalProposerPoll();
-				WalSndCaughtUp = false;
-			}
-			continue;
-		}
+		if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
+			WalSndSetState(WALSNDSTATE_STREAMING);
+		WalProposerPoll();
 	}
 }

 /*
- * Send out the WAL in its normal physical/stored form.
- *
- * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
- * but not yet sent to the client, and buffer it in the libpq output
- * buffer.
- *
- * If there is no unsent WAL remaining, WalSndCaughtUp is set to true,
- * otherwise WalSndCaughtUp is set to false.
+ * Notify walproposer about the new WAL position.
 */
 static void
-XLogSendPhysical(void)
+XLogBroadcastWalProposer(void)
 {
-	XLogRecPtr	SendRqstPtr;
 	XLogRecPtr	startptr;
 	XLogRecPtr	endptr;
-	Size		nbytes PG_USED_FOR_ASSERTS_ONLY;
-	TimeLineID	currTLI;

-	/* If requested switch the WAL sender to the stopping state. */
-	if (got_STOPPING)
-		WalSndSetState(WALSNDSTATE_STOPPING);
+	/* Start from the last sent position */
+	startptr = sentPtr;

-	if (streamingDoneSending)
-	{
-		WalSndCaughtUp = true;
-		return;
-	}
-
-	/* Figure out how far we can safely send the WAL. */
-	if (sendTimeLineIsHistoric)
-	{
-		/*
-		 * Streaming an old timeline that's in this server's history, but is
-		 * not the one we're currently inserting or replaying. It can be
-		 * streamed up to the point where we switched off that timeline.
-		 */
-		SendRqstPtr = sendTimeLineValidUpto;
-	}
-	else if (am_cascading_walsender)
-	{
-		/*
-		 * Streaming the latest timeline on a standby.
-		 *
-		 * Attempt to send all WAL that has already been replayed, so that we
-		 * know it's valid. If we're receiving WAL through streaming
-		 * replication, it's also OK to send any WAL that has been received
-		 * but not replayed.
-		 *
-		 * The timeline we're recovering from can change, or we can be
-		 * promoted. In either case, the current timeline becomes historic. We
-		 * need to detect that so that we don't try to stream past the point
-		 * where we switched to another timeline. We check for promotion or
-		 * timeline switch after calculating FlushPtr, to avoid a race
-		 * condition: if the timeline becomes historic just after we checked
-		 * that it was still current, it's still be OK to stream it up to the
-		 * FlushPtr that was calculated before it became historic.
-		 */
-		bool		becameHistoric = false;
+	/*
+	 * Streaming the current timeline on a primary.
+	 *
+	 * Attempt to send all data that's already been written out and
+	 * fsync'd to disk.  We cannot go further than what's been written out
+	 * given the current implementation of WALRead().  And in any case
+	 * it's unsafe to send WAL that is not securely down to disk on the
+	 * primary: if the primary subsequently crashes and restarts, standbys
+	 * must not have applied any WAL that got lost on the primary.
+	 */
 #if PG_VERSION_NUM >= 150000
-		SendRqstPtr = GetStandbyFlushRecPtr(&currTLI);
+	endptr = GetFlushRecPtr(NULL);
 #else
-		SendRqstPtr = GetStandbyFlushRecPtr();
-		currTLI = ThisTimeLineID;
+	endptr = GetFlushRecPtr();
 #endif
-		if (!RecoveryInProgress())
-		{
-			/*
-			 * We have been promoted. RecoveryInProgress() updated
-			 * ThisTimeLineID to the new current timeline.
-			 */
-			am_cascading_walsender = false;
-			becameHistoric = true;
-		}
-		else
-		{
-			/*
-			 * Still a cascading standby. But is the timeline we're sending
-			 * still the one recovery is recovering from? currTLI was updated
-			 * by the GetStandbyFlushRecPtr() call above.
-			 */
-			if (sendTimeLine != currTLI)
-				becameHistoric = true;
-		}
-
-		if (becameHistoric)
-		{
-			/*
-			 * The timeline we were sending has become historic. Read the
-			 * timeline history file of the new timeline to see where exactly
-			 * we forked off from the timeline we were sending.
-			 */
-			List	   *history;
-
-			history = readTimeLineHistory(currTLI);
-			sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI);
-
-			Assert(sendTimeLine < sendTimeLineNextTLI);
-			list_free_deep(history);
-
-			sendTimeLineIsHistoric = true;
-
-			SendRqstPtr = sendTimeLineValidUpto;
-		}
-	}
-	else
-	{
-		/*
-		 * Streaming the current timeline on a primary.
-		 *
-		 * Attempt to send all data that's already been written out and
-		 * fsync'd to disk.  We cannot go further than what's been written out
-		 * given the current implementation of WALRead().  And in any case
-		 * it's unsafe to send WAL that is not securely down to disk on the
-		 * primary: if the primary subsequently crashes and restarts, standbys
-		 * must not have applied any WAL that got lost on the primary.
-		 */
-#if PG_VERSION_NUM >= 150000
-		SendRqstPtr = GetFlushRecPtr(NULL);
-#else
-		SendRqstPtr = GetFlushRecPtr();
-#endif
-	}

 	/*
 	 * Record the current system time as an approximation of the time at which
@@ -1083,91 +625,14 @@ XLogSendPhysical(void)
 	 * that arbitrary LSN is eventually reported as written, flushed and
 	 * applied, so that it can measure the elapsed time.
 	 */
-	LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp());
-
-	/*
-	 * If this is a historic timeline and we've reached the point where we
-	 * forked to the next timeline, stop streaming.
-	 *
-	 * Note: We might already have sent WAL > sendTimeLineValidUpto. The
-	 * startup process will normally replay all WAL that has been received
-	 * from the primary, before promoting, but if the WAL streaming is
-	 * terminated at a WAL page boundary, the valid portion of the timeline
-	 * might end in the middle of a WAL record. We might've already sent the
-	 * first half of that partial WAL record to the cascading standby, so that
-	 * sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby can't
-	 * replay the partial WAL record either, so it can still follow our
-	 * timeline switch.
-	 */
-	if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr)
-	{
-		/* close the current file. */
-		if (xlogreader->seg.ws_file >= 0)
-			wal_segment_close(xlogreader);
-
-		/* Send CopyDone */
-		pq_putmessage_noblock('c', NULL, 0);
-		streamingDoneSending = true;
-
-		WalSndCaughtUp = true;
-
-		elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)",
-			 LSN_FORMAT_ARGS(sendTimeLineValidUpto),
-			 LSN_FORMAT_ARGS(sentPtr));
-		return;
-	}
+	LagTrackerWrite(endptr, GetCurrentTimestamp());

 	/* Do we have any work to do? */
-	Assert(sentPtr <= SendRqstPtr);
-	if (SendRqstPtr <= sentPtr)
-	{
-		WalSndCaughtUp = true;
+	Assert(startptr <= endptr);
+	if (endptr <= startptr)
 		return;
-	}

-	/*
-	 * Figure out how much to send in one message. If there's no more than
-	 * MAX_SEND_SIZE bytes to send, send everything. Otherwise send
-	 * MAX_SEND_SIZE bytes, but round back to logfile or page boundary.
-	 *
-	 * The rounding is not only for performance reasons. Walreceiver relies on
-	 * the fact that we never split a WAL record across two messages. Since a
-	 * long WAL record is split at page boundary into continuation records,
-	 * page boundary is always a safe cut-off point. We also assume that
-	 * SendRqstPtr never points to the middle of a WAL record.
-	 */
-	startptr = sentPtr;
-	endptr = startptr;
-	endptr += MAX_SEND_SIZE;
-
-	/* if we went beyond SendRqstPtr, back off */
-	if (SendRqstPtr <= endptr)
-	{
-		endptr = SendRqstPtr;
-		if (sendTimeLineIsHistoric)
-			WalSndCaughtUp = false;
-		else
-			WalSndCaughtUp = true;
-	}
-	else
-	{
-		/* round down to page boundary. */
-		endptr -= (endptr % XLOG_BLCKSZ);
-		WalSndCaughtUp = false;
-	}
-
-	nbytes = endptr - startptr;
-	Assert(nbytes <= MAX_SEND_SIZE);
-
-	/* always true */
-	if (am_wal_proposer)
-	{
-		WalProposerBroadcast(startptr, endptr);
-	}
-	else
-	{
-		/* code removed for brevity */
-	}
+	WalProposerBroadcast(startptr, endptr);
 	sentPtr = endptr;

 	/* Update shared memory status */
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -13,6 +13,7 @@ bytes = { workspace = true, features = ["serde"] }
 chrono.workspace = true
 clap.workspace = true
 consumption_metrics.workspace = true
+dashmap.workspace = true
 futures.workspace = true
 git-version.workspace = true
 hashbrown.workspace = true
@@ -29,7 +30,7 @@ metrics.workspace = true
 once_cell.workspace = true
 opentelemetry.workspace = true
 parking_lot.workspace = true
-pbkdf2.workspace = true
+pbkdf2 = { workspace = true, features = ["simple", "std"] }
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
 pq_proto.workspace = true
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -36,7 +36,18 @@ pub(super) async fn authenticate(
        AuthInfo::Scram(secret) => {
            info!("auth endpoint chooses SCRAM");
            let scram = auth::Scram(&secret);
-            let client_key = match flow.begin(scram).await?.authenticate().await? {
+
+            let auth_flow = flow.begin(scram).await.map_err(|error| {
+                warn!(?error, "error sending scram acknowledgement");
+                error
+            })?;
+
+            let auth_outcome = auth_flow.authenticate().await.map_err(|error| {
+                warn!(?error, "error processing scram messages");
+                error
+            })?;
+
+            let client_key = match auth_outcome {
                sasl::Outcome::Success(key) => key,
                sasl::Outcome::Failure(reason) => {
                    info!("auth backend failed with an error: {reason}");
@@ -51,7 +62,6 @@ pub(super) async fn authenticate(
        }
    };

-    info!("compute node's state has likely changed; requesting a wake-up");
    let mut num_retries = 0;
    let mut node = loop {
        let wake_res = api.wake_compute(extra, creds).await;
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -8,6 +8,7 @@ use super::{
 use crate::{auth::ClientCredentials, compute, http, scram};
 use async_trait::async_trait;
 use futures::TryFutureExt;
+use tokio::time::Instant;
 use tokio_postgres::config::SslMode;
 use tracing::{error, info, info_span, warn, Instrument};

@@ -47,7 +48,9 @@ impl Api {
                .build()?;

            info!(url = request.url().as_str(), "sending http request");
+            let start = Instant::now();
            let response = self.endpoint.execute(request).await?;
+            info!(duration = ?start.elapsed(), "received http response");
            let body = match parse_body::<GetRoleSecret>(response).await {
                Ok(body) => body,
                // Error 404 is special: it's ok not to have a secret.
@@ -88,7 +91,9 @@ impl Api {
                .build()?;

            info!(url = request.url().as_str(), "sending http request");
+            let start = Instant::now();
            let response = self.endpoint.execute(request).await?;
+            info!(duration = ?start.elapsed(), "received http response");
            let body = parse_body::<WakeCompute>(response).await?;

            // Unfortunately, ownership won't let us use `Option::ok_or` here.
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -7,11 +7,14 @@ pub mod server;
 pub mod sql_over_http;
 pub mod websocket;

-use std::time::Duration;
+use std::{sync::Arc, time::Duration};

+use futures::FutureExt;
 pub use reqwest::{Request, Response, StatusCode};
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
+use tokio::time::Instant;
+use tracing::trace;

 use crate::url::ApiUrl;
 use reqwest_middleware::RequestBuilder;
@@ -20,13 +23,21 @@ use reqwest_middleware::RequestBuilder;
 /// because it takes care of observability (OpenTelemetry).
 /// We deliberately don't want to replace this with a public static.
 pub fn new_client() -> ClientWithMiddleware {
-    reqwest_middleware::ClientBuilder::new(reqwest::Client::new())
+    let client = reqwest::ClientBuilder::new()
+        .dns_resolver(Arc::new(GaiResolver::default()))
+        .connection_verbose(true)
+        .build()
+        .expect("Failed to create http client");
+
+    reqwest_middleware::ClientBuilder::new(client)
        .with(reqwest_tracing::TracingMiddleware::default())
        .build()
 }

 pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
    let timeout_client = reqwest::ClientBuilder::new()
+        .dns_resolver(Arc::new(GaiResolver::default()))
+        .connection_verbose(true)
        .timeout(default_timout)
        .build()
        .expect("Failed to create http client with timeout");
@@ -39,6 +50,10 @@ pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware
        // As per docs, "This middleware always errors when given requests with streaming bodies".
        // That's all right because we only use this client to send `serde_json::RawValue`, which
        // is not a stream.
+        //
+        // ex-maintainer note:
+        // this limitation can be fixed if streaming is necessary.
+        // retries will still not be performed, but it won't error immediately
        .with(RetryTransientMiddleware::new_with_policy(retry_policy))
        .build()
 }
@@ -81,6 +96,37 @@ impl Endpoint {
    }
 }

+// Reference implementation (reqwest's GAI resolver): https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html
+use hyper::{
+    client::connect::dns::{GaiResolver as HyperGaiResolver, Name},
+    service::Service,
+};
+use reqwest::dns::{Addrs, Resolve, Resolving};
+#[derive(Debug)]
+pub struct GaiResolver(HyperGaiResolver);
+
+impl Default for GaiResolver {
+    fn default() -> Self {
+        Self(HyperGaiResolver::new())
+    }
+}
+
+impl Resolve for GaiResolver {
+    fn resolve(&self, name: Name) -> Resolving {
+        let this = &mut self.0.clone();
+        let start = Instant::now();
+        Box::pin(
+            Service::<Name>::call(this, name.clone()).map(move |result| {
+                let resolve_duration = start.elapsed();
+                trace!(duration = ?resolve_duration, addr = %name, "resolve host complete");
+                result
+                    .map(|addrs| -> Addrs { Box::new(addrs) })
+                    .map_err(|err| -> Box<dyn std::error::Error + Send + Sync> { Box::new(err) })
+            }),
+        )
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/proxy/src/http/conn_pool.rs
+++ b/proxy/src/http/conn_pool.rs
@@ -1,10 +1,21 @@
 use anyhow::Context;
 use async_trait::async_trait;
-use parking_lot::Mutex;
+use dashmap::DashMap;
+use futures::future::poll_fn;
+use parking_lot::RwLock;
+use pbkdf2::{
+    password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString},
+    Params, Pbkdf2,
+};
 use pq_proto::StartupMessageParams;
-use std::fmt;
+use std::sync::atomic::{self, AtomicUsize};
 use std::{collections::HashMap, sync::Arc};
+use std::{
+    fmt,
+    task::{ready, Poll},
+};
 use tokio::time;
+use tokio_postgres::AsyncMessage;

 use crate::{auth, console};
 use crate::{compute, config};
@@ -13,8 +24,8 @@ use super::sql_over_http::MAX_RESPONSE_SIZE;

 use crate::proxy::ConnectMechanism;

-use tracing::error;
-use tracing::info;
+use tracing::{error, warn};
+use tracing::{info, info_span, Instrument};

 pub const APP_NAME: &str = "sql_over_http";
 const MAX_CONNS_PER_ENDPOINT: usize = 20;
@@ -42,23 +53,44 @@ impl fmt::Display for ConnInfo {
 }

 struct ConnPoolEntry {
-    conn: tokio_postgres::Client,
+    conn: Client,
    _last_access: std::time::Instant,
 }

-// Per-endpoint connection pool, (dbname, username) -> Vec<ConnPoolEntry>
+// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
 // Number of open connections is limited by the `max_conns_per_endpoint`.
 pub struct EndpointConnPool {
-    pools: HashMap<(String, String), Vec<ConnPoolEntry>>,
+    pools: HashMap<(String, String), DbUserConnPool>,
    total_conns: usize,
 }

+/// 4096 is the number of rounds that SCRAM-SHA-256 recommends.
+/// It's not the 600,000 that OWASP recommends... but our passwords are high entropy anyway.
+///
+/// Still takes 1.4ms to hash on my hardware.
+/// We don't want to ruin the latency improvements of using the pool by making password verification take too long
+const PARAMS: Params = Params {
+    rounds: 4096,
+    output_length: 32,
+};
+
+#[derive(Default)]
+pub struct DbUserConnPool {
+    conns: Vec<ConnPoolEntry>,
+    password_hash: Option<PasswordHashString>,
+}
+
 pub struct GlobalConnPool {
    // endpoint -> per-endpoint connection pool
    //
    // That should be a fairly conteded map, so return reference to the per-endpoint
    // pool as early as possible and release the lock.
-    global_pool: Mutex<HashMap<String, Arc<Mutex<EndpointConnPool>>>>,
+    global_pool: DashMap<String, Arc<RwLock<EndpointConnPool>>>,
+
+    /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
+    /// That seems like far too much effort, so we're using a relaxed increment counter instead.
+    /// It's only used for diagnostics.
+    global_pool_size: AtomicUsize,

    // Maximum number of connections per one endpoint.
    // Can mix different (dbname, username) connections.
@@ -67,85 +99,173 @@ pub struct GlobalConnPool {
    max_conns_per_endpoint: usize,

    proxy_config: &'static crate::config::ProxyConfig,
+
+    // Using a lock to remove any race conditions.
+    // Eg cleaning up connections while a new connection is returned
+    closed: RwLock<bool>,
 }

 impl GlobalConnPool {
    pub fn new(config: &'static crate::config::ProxyConfig) -> Arc<Self> {
        Arc::new(Self {
-            global_pool: Mutex::new(HashMap::new()),
+            global_pool: DashMap::new(),
+            global_pool_size: AtomicUsize::new(0),
            max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT,
            proxy_config: config,
+            closed: RwLock::new(false),
        })
    }

+    pub fn shutdown(&self) {
+        *self.closed.write() = true;
+
+        self.global_pool.retain(|_, endpoint_pool| {
+            let mut pool = endpoint_pool.write();
+            // by clearing this hashmap, we remove the slots that a connection can be returned to.
+            // when returning, it drops the connection if the slot doesn't exist
+            pool.pools.clear();
+            pool.total_conns = 0;
+
+            false
+        });
+    }
+
    pub async fn get(
        &self,
        conn_info: &ConnInfo,
        force_new: bool,
-    ) -> anyhow::Result<tokio_postgres::Client> {
-        let mut client: Option<tokio_postgres::Client> = None;
+        session_id: uuid::Uuid,
+    ) -> anyhow::Result<Client> {
+        let mut client: Option<Client> = None;

+        let mut hash_valid = false;
        if !force_new {
-            let pool = self.get_endpoint_pool(&conn_info.hostname).await;
+            let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
+            let mut hash = None;

            // find a pool entry by (dbname, username) if exists
-            let mut pool = pool.lock();
-            let pool_entries = pool.pools.get_mut(&conn_info.db_and_user());
-            if let Some(pool_entries) = pool_entries {
-                if let Some(entry) = pool_entries.pop() {
-                    client = Some(entry.conn);
-                    pool.total_conns -= 1;
+            {
+                let pool = pool.read();
+                if let Some(pool_entries) = pool.pools.get(&conn_info.db_and_user()) {
+                    if !pool_entries.conns.is_empty() {
+                        hash = pool_entries.password_hash.clone();
+                    }
+                }
+            }
+
+            // a connection exists in the pool, verify the password hash
+            if let Some(hash) = hash {
+                let pw = conn_info.password.clone();
+                let validate = tokio::task::spawn_blocking(move || {
+                    Pbkdf2.verify_password(pw.as_bytes(), &hash.password_hash())
+                })
+                .await?;
+
+                // if the hash is invalid, don't error
+                // we will continue with the regular connection flow
+                if validate.is_ok() {
+                    hash_valid = true;
+                    let mut pool = pool.write();
+                    if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
+                        if let Some(entry) = pool_entries.conns.pop() {
+                            client = Some(entry.conn);
+                            pool.total_conns -= 1;
+                        }
+                    }
                }
            }
        }

        // ok return cached connection if found and establish a new one otherwise
-        if let Some(client) = client {
-            if client.is_closed() {
+        let new_client = if let Some(client) = client {
+            if client.inner.is_closed() {
                info!("pool: cached connection '{conn_info}' is closed, opening a new one");
-                connect_to_compute(self.proxy_config, conn_info).await
+                connect_to_compute(self.proxy_config, conn_info, session_id).await
            } else {
                info!("pool: reusing connection '{conn_info}'");
-                Ok(client)
+                client.session.send(session_id)?;
+                return Ok(client);
            }
        } else {
            info!("pool: opening a new connection '{conn_info}'");
-            connect_to_compute(self.proxy_config, conn_info).await
+            connect_to_compute(self.proxy_config, conn_info, session_id).await
+        };
+
+        match &new_client {
+            // clear the hash. it's no longer valid
+            // TODO: update tokio-postgres fork to allow access to this error kind directly
+            Err(err)
+                if hash_valid && err.to_string().contains("password authentication failed") =>
+            {
+                let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
+                let mut pool = pool.write();
+                if let Some(entry) = pool.pools.get_mut(&conn_info.db_and_user()) {
+                    entry.password_hash = None;
+                }
+            }
+            // new password is valid and we should insert/update it
+            Ok(_) if !force_new && !hash_valid => {
+                let pw = conn_info.password.clone();
+                let new_hash = tokio::task::spawn_blocking(move || {
+                    let salt = SaltString::generate(rand::rngs::OsRng);
+                    Pbkdf2
+                        .hash_password_customized(pw.as_bytes(), None, None, PARAMS, &salt)
+                        .map(|s| s.serialize())
+                })
+                .await??;
+
+                let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
+                let mut pool = pool.write();
+                pool.pools
+                    .entry(conn_info.db_and_user())
+                    .or_default()
+                    .password_hash = Some(new_hash);
+            }
+            _ => {}
        }
+
+        new_client
    }

-    pub async fn put(
-        &self,
-        conn_info: &ConnInfo,
-        client: tokio_postgres::Client,
-    ) -> anyhow::Result<()> {
-        let pool = self.get_endpoint_pool(&conn_info.hostname).await;
+    pub fn put(&self, conn_info: &ConnInfo, client: Client) -> anyhow::Result<()> {
+        // We want to hold this open while we return. This ensures that the pool can't close
+        // while we are in the middle of returning the connection.
+        let closed = self.closed.read();
+        if *closed {
+            info!("pool: throwing away connection '{conn_info}' because pool is closed");
+            return Ok(());
+        }
+
+        if client.inner.is_closed() {
+            info!("pool: throwing away connection '{conn_info}' because connection is closed");
+            return Ok(());
+        }
+
+        let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);

        // return connection to the pool
-        let mut total_conns;
        let mut returned = false;
        let mut per_db_size = 0;
-        {
-            let mut pool = pool.lock();
-            total_conns = pool.total_conns;
+        let total_conns = {
+            let mut pool = pool.write();

-            let pool_entries: &mut Vec<ConnPoolEntry> = pool
-                .pools
-                .entry(conn_info.db_and_user())
-                .or_insert_with(|| Vec::with_capacity(1));
-            if total_conns < self.max_conns_per_endpoint {
-                pool_entries.push(ConnPoolEntry {
-                    conn: client,
-                    _last_access: std::time::Instant::now(),
-                });
+            if pool.total_conns < self.max_conns_per_endpoint {
+                // `get` normally creates this db-user entry when it stores the password hash; if it is missing, the connection is dropped
+                if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
+                    pool_entries.conns.push(ConnPoolEntry {
+                        conn: client,
+                        _last_access: std::time::Instant::now(),
+                    });

-                total_conns += 1;
-                returned = true;
-                per_db_size = pool_entries.len();
+                    returned = true;
+                    per_db_size = pool_entries.conns.len();

-                pool.total_conns += 1;
+                    pool.total_conns += 1;
+                }
            }
-        }
+
+            pool.total_conns
+        };

        // do logging outside of the mutex
        if returned {
@@ -157,25 +277,35 @@ impl GlobalConnPool {
        Ok(())
    }

-    async fn get_endpoint_pool(&self, endpoint: &String) -> Arc<Mutex<EndpointConnPool>> {
+    fn get_or_create_endpoint_pool(&self, endpoint: &String) -> Arc<RwLock<EndpointConnPool>> {
+        // fast path
+        if let Some(pool) = self.global_pool.get(endpoint) {
+            return pool.clone();
+        }
+
+        // slow path
+        let new_pool = Arc::new(RwLock::new(EndpointConnPool {
+            pools: HashMap::new(),
+            total_conns: 0,
+        }));
+
        // find or create a pool for this endpoint
        let mut created = false;
-        let mut global_pool = self.global_pool.lock();
-        let pool = global_pool
+        let pool = self
+            .global_pool
            .entry(endpoint.clone())
            .or_insert_with(|| {
                created = true;
-                Arc::new(Mutex::new(EndpointConnPool {
-                    pools: HashMap::new(),
-                    total_conns: 0,
-                }))
+                new_pool
            })
            .clone();
-        let global_pool_size = global_pool.len();
-        drop(global_pool);

        // log new global pool size
        if created {
+            let global_pool_size = self
+                .global_pool_size
+                .fetch_add(1, atomic::Ordering::Relaxed)
+                + 1;
            info!(
                "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}"
            );
@@ -187,11 +317,12 @@ impl GlobalConnPool {

 struct TokioMechanism<'a> {
    conn_info: &'a ConnInfo,
+    session_id: uuid::Uuid,
 }

 #[async_trait]
 impl ConnectMechanism for TokioMechanism<'_> {
-    type Connection = tokio_postgres::Client;
+    type Connection = Client;
    type ConnectError = tokio_postgres::Error;
    type Error = anyhow::Error;

@@ -200,7 +331,7 @@ impl ConnectMechanism for TokioMechanism<'_> {
        node_info: &console::CachedNodeInfo,
        timeout: time::Duration,
    ) -> Result<Self::Connection, Self::ConnectError> {
-        connect_to_compute_once(node_info, self.conn_info, timeout).await
+        connect_to_compute_once(node_info, self.conn_info, timeout, self.session_id).await
    }

    fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
@@ -213,7 +344,8 @@ impl ConnectMechanism for TokioMechanism<'_> {
 async fn connect_to_compute(
    config: &config::ProxyConfig,
    conn_info: &ConnInfo,
-) -> anyhow::Result<tokio_postgres::Client> {
+    session_id: uuid::Uuid,
+) -> anyhow::Result<Client> {
    let tls = config.tls_config.as_ref();
    let common_names = tls.and_then(|tls| tls.common_names.clone());

@@ -244,17 +376,27 @@ async fn connect_to_compute(
        .await?
        .context("missing cache entry from wake_compute")?;

-    crate::proxy::connect_to_compute(&TokioMechanism { conn_info }, node_info, &extra, &creds).await
+    crate::proxy::connect_to_compute(
+        &TokioMechanism {
+            conn_info,
+            session_id,
+        },
+        node_info,
+        &extra,
+        &creds,
+    )
+    .await
 }

 async fn connect_to_compute_once(
    node_info: &console::CachedNodeInfo,
    conn_info: &ConnInfo,
    timeout: time::Duration,
-) -> Result<tokio_postgres::Client, tokio_postgres::Error> {
+    mut session: uuid::Uuid,
+) -> Result<Client, tokio_postgres::Error> {
    let mut config = (*node_info.config).clone();

-    let (client, connection) = config
+    let (client, mut connection) = config
        .user(&conn_info.username)
        .password(&conn_info.password)
        .dbname(&conn_info.dbname)
@@ -263,11 +405,55 @@ async fn connect_to_compute_once(
        .connect(tokio_postgres::NoTls)
        .await?;

-    tokio::spawn(async move {
-        if let Err(e) = connection.await {
-            error!("connection error: {}", e);
-        }
+    let (tx, mut rx) = tokio::sync::watch::channel(session);
+
+    let conn_id = uuid::Uuid::new_v4();
+    let span = info_span!(parent: None, "connection", %conn_id);
+    span.in_scope(|| {
+        info!(%conn_info, %session, "new connection");
    });

-    Ok(client)
+    tokio::spawn(
+        poll_fn(move |cx| {
+            if matches!(rx.has_changed(), Ok(true)) {
+                session = *rx.borrow_and_update();
+                info!(%session, "changed session");
+            }
+
+            loop {
+                let message = ready!(connection.poll_message(cx));
+
+                match message {
+                    Some(Ok(AsyncMessage::Notice(notice))) => {
+                        info!(%session, "notice: {}", notice);
+                    }
+                    Some(Ok(AsyncMessage::Notification(notif))) => {
+                        warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
+                    }
+                    Some(Ok(_)) => {
+                        warn!(%session, "unknown message");
+                    }
+                    Some(Err(e)) => {
+                        error!(%session, "connection error: {}", e);
+                        return Poll::Ready(())
+                    }
+                    None => {
+                        info!("connection closed");
+                        return Poll::Ready(())
+                    }
+                }
+            }
+        })
+        .instrument(span)
+    );
+
+    Ok(Client {
+        inner: client,
+        session: tx,
+    })
+}
+
+pub struct Client {
+    pub inner: tokio_postgres::Client,
+    session: tokio::sync::watch::Sender<uuid::Uuid>,
 }
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -1,7 +1,6 @@
 use std::sync::Arc;

 use anyhow::bail;
-use anyhow::Context;
 use futures::pin_mut;
 use futures::StreamExt;
 use hashbrown::HashMap;
@@ -28,14 +27,19 @@ struct QueryData {
    params: Vec<serde_json::Value>,
 }

+#[derive(serde::Deserialize)]
+struct BatchQueryData {
+    queries: Vec<QueryData>,
+}
+
 #[derive(serde::Deserialize)]
 #[serde(untagged)]
 enum Payload {
    Single(QueryData),
-    Batch(Vec<QueryData>),
+    Batch(BatchQueryData),
 }

-pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
+pub const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MB
 const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB

 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
@@ -43,6 +47,7 @@ static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
 static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in");
 static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level");
 static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only");
+static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrable");

 static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");

@@ -50,7 +55,7 @@ static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");
 // Convert json non-string types to strings, so that they can be passed to Postgres
 // as parameters.
 //
-fn json_to_pg_text(json: &[Value]) -> Result<Vec<Option<String>>, serde_json::Error> {
+fn json_to_pg_text(json: Vec<Value>) -> Result<Vec<Option<String>>, serde_json::Error> {
    json.iter()
        .map(|value| {
            match value {
@@ -176,6 +181,7 @@ pub async fn handle(
    request: Request<Body>,
    sni_hostname: Option<String>,
    conn_pool: Arc<GlobalConnPool>,
+    session_id: uuid::Uuid,
 ) -> anyhow::Result<(Value, HashMap<HeaderName, HeaderValue>)> {
    //
    // Determine the destination and connection params
@@ -191,7 +197,7 @@ pub async fn handle(
    // Allow connection pooling only if explicitly requested
    let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);

-    // isolation level and read only
+    // isolation level, read only and deferrable

    let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned();
    let txn_isolation_level = match txn_isolation_level_raw {
@@ -205,8 +211,8 @@ pub async fn handle(
        None => None,
    };

-    let txn_read_only_raw = headers.get(&TXN_READ_ONLY).cloned();
-    let txn_read_only = txn_read_only_raw.as_ref() == Some(&HEADER_VALUE_TRUE);
+    let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
+    let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);

    let request_content_length = match request.body().size_hint().upper() {
        Some(v) => v,
@@ -215,7 +221,7 @@ pub async fn handle(

    if request_content_length > MAX_REQUEST_SIZE {
        return Err(anyhow::anyhow!(
-            "request is too large (max {MAX_REQUEST_SIZE} bytes)"
+            "request is too large (max is {MAX_REQUEST_SIZE} bytes)"
        ));
    }

@@ -225,58 +231,65 @@ pub async fn handle(
    let body = hyper::body::to_bytes(request.into_body()).await?;
    let payload: Payload = serde_json::from_slice(&body)?;

-    let mut client = conn_pool.get(&conn_info, !allow_pool).await?;
+    let mut client = conn_pool.get(&conn_info, !allow_pool, session_id).await?;

    //
    // Now execute the query and return the result
    //
    let result = match payload {
-        Payload::Single(query) => query_to_json(&client, &query, raw_output, array_mode)
+        Payload::Single(query) => query_to_json(&client.inner, query, raw_output, array_mode)
            .await
            .map(|x| (x, HashMap::default())),
-        Payload::Batch(queries) => {
+        Payload::Batch(batch_query) => {
            let mut results = Vec::new();
-            let mut builder = client.build_transaction();
+            let mut builder = client.inner.build_transaction();
            if let Some(isolation_level) = txn_isolation_level {
                builder = builder.isolation_level(isolation_level);
            }
            if txn_read_only {
                builder = builder.read_only(true);
            }
+            if txn_deferrable {
+                builder = builder.deferrable(true);
+            }
            let transaction = builder.start().await?;
-            for (idx, query) in queries.into_iter().enumerate() {
-                let result = query_to_json(&transaction, &query, raw_output, array_mode)
-                    .await
-                    .with_context(|| {
-                        format!("error when executing queries[{}] \"{}\"", idx, query.query)
-                    });
+            for query in batch_query.queries {
+                let result = query_to_json(&transaction, query, raw_output, array_mode).await;
                match result {
                    Ok(r) => results.push(r),
                    Err(e) => {
-                        transaction.rollback().await.with_context(|| {
-                            format!("error when rollback queries[{}] \"{}\"", idx, query.query)
-                        })?;
+                        transaction.rollback().await?;
                        return Err(e);
                    }
                }
            }
            transaction.commit().await?;
            let mut headers = HashMap::default();
-            headers.insert(
-                TXN_READ_ONLY.clone(),
-                HeaderValue::try_from(txn_read_only.to_string())?,
-            );
-            if let Some(txn_isolation_level_raw) = txn_isolation_level_raw {
-                headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level_raw);
+            if txn_read_only {
+                headers.insert(
+                    TXN_READ_ONLY.clone(),
+                    HeaderValue::try_from(txn_read_only.to_string())?,
+                );
+            }
+            if txn_deferrable {
+                headers.insert(
+                    TXN_DEFERRABLE.clone(),
+                    HeaderValue::try_from(txn_deferrable.to_string())?,
+                );
+            }
+            if let Some(txn_isolation_level) = txn_isolation_level_raw {
+                headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
            }
            Ok((json!({ "results": results }), headers))
        }
    };

    if allow_pool {
+        let current_span = tracing::Span::current();
        // return connection to the pool
-        tokio::task::spawn(async move {
-            let _ = conn_pool.put(&conn_info, client).await;
+        tokio::task::spawn_blocking(move || {
+            let _span = current_span.enter();
+            let _ = conn_pool.put(&conn_info, client);
        });
    }

@@ -285,13 +298,13 @@ pub async fn handle(

 async fn query_to_json<T: GenericClient>(
    client: &T,
-    data: &QueryData,
+    data: QueryData,
    raw_output: bool,
    array_mode: bool,
 ) -> anyhow::Result<Value> {
-    let query_params = json_to_pg_text(&data.params)?;
+    let query_params = json_to_pg_text(data.params)?;
    let row_stream = client
-        .query_raw_txt(&data.query, query_params)
+        .query_raw_txt::<String, _>(data.query, query_params)
        .await?;

    // Manually drain the stream into a vector to leave row_stream hanging
@@ -299,13 +312,15 @@ async fn query_to_json<T: GenericClient>(
    // big.
    pin_mut!(row_stream);
    let mut rows: Vec<tokio_postgres::Row> = Vec::new();
-    let mut curret_size = 0;
+    let mut current_size = 0;
    while let Some(row) = row_stream.next().await {
        let row = row?;
-        curret_size += row.body_len();
+        current_size += row.body_len();
        rows.push(row);
-        if curret_size > MAX_RESPONSE_SIZE {
-            return Err(anyhow::anyhow!("response too large"));
+        if current_size > MAX_RESPONSE_SIZE {
+            return Err(anyhow::anyhow!(
+                "response is too large (max is {MAX_RESPONSE_SIZE} bytes)"
+            ));
        }
    }

@@ -540,22 +555,22 @@ mod tests {
    #[test]
    fn test_atomic_types_to_pg_params() {
        let json = vec![Value::Bool(true), Value::Bool(false)];
-        let pg_params = json_to_pg_text(&json).unwrap();
+        let pg_params = json_to_pg_text(json).unwrap();
        assert_eq!(
            pg_params,
            vec![Some("true".to_owned()), Some("false".to_owned())]
        );

        let json = vec![Value::Number(serde_json::Number::from(42))];
-        let pg_params = json_to_pg_text(&json).unwrap();
+        let pg_params = json_to_pg_text(json).unwrap();
        assert_eq!(pg_params, vec![Some("42".to_owned())]);

        let json = vec![Value::String("foo\"".to_string())];
-        let pg_params = json_to_pg_text(&json).unwrap();
+        let pg_params = json_to_pg_text(json).unwrap();
        assert_eq!(pg_params, vec![Some("foo\"".to_owned())]);

        let json = vec![Value::Null];
-        let pg_params = json_to_pg_text(&json).unwrap();
+        let pg_params = json_to_pg_text(json).unwrap();
        assert_eq!(pg_params, vec![None]);
    }

@@ -564,7 +579,7 @@ mod tests {
        // atoms and escaping
        let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]";
        let json: Value = serde_json::from_str(json).unwrap();
-        let pg_params = json_to_pg_text(&[json]).unwrap();
+        let pg_params = json_to_pg_text(vec![json]).unwrap();
        assert_eq!(
            pg_params,
            vec![Some(
@@ -575,7 +590,7 @@ mod tests {
        // nested arrays
        let json = "[[true, false], [null, 42], [\"foo\", \"bar\\\"-\\\\\"]]";
        let json: Value = serde_json::from_str(json).unwrap();
-        let pg_params = json_to_pg_text(&[json]).unwrap();
+        let pg_params = json_to_pg_text(vec![json]).unwrap();
        assert_eq!(
            pg_params,
            vec![Some(
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -187,19 +187,23 @@ async fn ws_handler(
        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
            .map_err(|e| ApiError::BadRequest(e.into()))?;

-        tokio::spawn(async move {
-            if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
-            {
-                error!(session_id = ?session_id, "error in websocket connection: {e:?}");
+        tokio::spawn(
+            async move {
+                if let Err(e) =
+                    serve_websocket(websocket, config, &cancel_map, session_id, host).await
+                {
+                    error!(session_id = ?session_id, "error in websocket connection: {e:#}");
+                }
            }
-        });
+            .in_current_span(),
+        );

        // Return the response so the spawned future can continue.
        Ok(response)
    // TODO: that deserves a refactor as now this function also handles http json client besides websockets.
    // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
    } else if request.uri().path() == "/sql" && request.method() == Method::POST {
-        let result = sql_over_http::handle(request, sni_hostname, conn_pool)
+        let result = sql_over_http::handle(request, sni_hostname, conn_pool, session_id)
            .instrument(info_span!("sql-over-http"))
            .await;
        let status_code = match result {
@@ -217,6 +221,10 @@ async fn ws_handler(
                    },
                    None => Value::Null,
                };
+                error!(
+                    ?code,
+                    "sql-over-http per-client task finished with an error: {e:#}"
+                );
                (
                    json!({ "message": message, "code": code }),
                    HashMap::default(),
@@ -261,6 +269,18 @@ pub async fn task_main(

    let conn_pool: Arc<GlobalConnPool> = GlobalConnPool::new(config);

+    // shut down the connection pool once the cancellation token is cancelled
+    tokio::spawn({
+        let cancellation_token = cancellation_token.clone();
+        let conn_pool = conn_pool.clone();
+        async move {
+            cancellation_token.cancelled().await;
+            tokio::task::spawn_blocking(move || conn_pool.shutdown())
+                .await
+                .unwrap();
+        }
+    });
+
    let tls_config = config.tls_config.as_ref().map(|cfg| cfg.to_server_config());
    let tls_acceptor: tokio_rustls::TlsAcceptor = match tls_config {
        Some(config) => config.into(),
@@ -299,7 +319,7 @@ pub async fn task_main(
                        ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name)
                            .instrument(info_span!(
                                "ws-client",
-                                session = format_args!("{session_id}")
+                                session = %session_id
                            ))
                            .await
                    }
--- a/proxy/src/sasl/stream.rs
+++ b/proxy/src/sasl/stream.rs
@@ -4,6 +4,7 @@ use super::{messages::ServerMessage, Mechanism};
 use crate::stream::PqStream;
 use std::io;
 use tokio::io::{AsyncRead, AsyncWrite};
+use tracing::info;

 /// Abstracts away all peculiarities of the libpq's protocol.
 pub struct SaslStream<'a, S> {
@@ -68,7 +69,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> SaslStream<'_, S> {
    ) -> super::Result<Outcome<M::Output>> {
        loop {
            let input = self.recv().await?;
-            let step = mechanism.exchange(input)?;
+            let step = mechanism.exchange(input).map_err(|error| {
+                info!(?error, "error during SASL exchange");
+                error
+            })?;

            use super::Step;
            return Ok(match step {
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -15,6 +15,7 @@ use toml_edit::Document;
 use std::fs::{self, File};
 use std::io::{ErrorKind, Write};
 use std::path::{Path, PathBuf};
+use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
 use storage_broker::Uri;
@@ -122,9 +123,24 @@ struct Args {
    /// WAL backup horizon.
    #[arg(long)]
    disable_wal_backup: bool,
-    /// Path to a .pem public key which is used to check JWT tokens.
-    #[arg(long)]
-    auth_validation_public_key_path: Option<PathBuf>,
+    /// If given, enables auth on incoming connections to WAL service endpoint
+    /// (--listen-pg). Value specifies path to a .pem public key used for
+    /// validations of JWT tokens. Empty string is allowed and means disabling
+    /// auth.
+    #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
+    pg_auth_public_key_path: Option<PathBuf>,
+    /// If given, enables auth on incoming connections to tenant only WAL
+    /// service endpoint (--listen-pg-tenant-only). Value specifies path to a
+    /// .pem public key used for validations of JWT tokens. Empty string is
+    /// allowed and means disabling auth.
+    #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
+    pg_tenant_only_auth_public_key_path: Option<PathBuf>,
+    /// If given, enables auth on incoming connections to http management
+    /// service endpoint (--listen-http). Value specifies path to a .pem public
+    /// key used for validations of JWT tokens. Empty string is allowed and
+    /// means disabling auth.
+    #[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
+    http_auth_public_key_path: Option<PathBuf>,
    /// Format for logging, either 'plain' or 'json'.
    #[arg(long, default_value = "plain")]
    log_format: String,
@@ -134,9 +150,39 @@ struct Args {
    current_thread_runtime: bool,
 }

+// Like PathBufValueParser, but allows empty string.
+fn opt_pathbuf_parser(s: &str) -> Result<PathBuf, String> {
+    Ok(PathBuf::from_str(s).unwrap())
+}
+
 #[tokio::main(flavor = "current_thread")]
 async fn main() -> anyhow::Result<()> {
-    let args = Args::parse();
+    // We want to allow multiple occurrences of the same arg (taking the last) so
+    // that neon_local could generate command with defaults + overrides without
+    // getting 'argument cannot be used multiple times' error. This seems to be
+    // impossible with pure Derive API, so convert struct to Command, modify it,
+    // parse arguments, and then fill the struct back.
+    let cmd = <Args as clap::CommandFactory>::command().args_override_self(true);
+    let mut matches = cmd.get_matches();
+    let mut args = <Args as clap::FromArgMatches>::from_arg_matches_mut(&mut matches)?;
+
+    // I failed to modify opt_pathbuf_parser to return Option<PathBuf> in a
+    // reasonable time, so turn an empty path into None after parsing instead.
+    if let Some(pb) = &args.pg_auth_public_key_path {
+        if pb.as_os_str().is_empty() {
+            args.pg_auth_public_key_path = None;
+        }
+    }
+    if let Some(pb) = &args.pg_tenant_only_auth_public_key_path {
+        if pb.as_os_str().is_empty() {
+            args.pg_tenant_only_auth_public_key_path = None;
+        }
+    }
+    if let Some(pb) = &args.http_auth_public_key_path {
+        if pb.as_os_str().is_empty() {
+            args.http_auth_public_key_path = None;
+        }
+    }

    if let Some(addr) = args.dump_control_file {
        let state = control_file::FileStorage::load_control_file(addr)?;
@@ -170,13 +216,40 @@ async fn main() -> anyhow::Result<()> {
        return Ok(());
    }

-    let auth = match args.auth_validation_public_key_path.as_ref() {
+    let pg_auth = match args.pg_auth_public_key_path.as_ref() {
        None => {
-            info!("auth is disabled");
+            info!("pg auth is disabled");
            None
        }
        Some(path) => {
-            info!("loading JWT auth key from {}", path.display());
+            info!("loading pg auth JWT key from {}", path.display());
+            Some(Arc::new(
+                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
+            ))
+        }
+    };
+    let pg_tenant_only_auth = match args.pg_tenant_only_auth_public_key_path.as_ref() {
+        None => {
+            info!("pg tenant only auth is disabled");
+            None
+        }
+        Some(path) => {
+            info!(
+                "loading pg tenant only auth JWT key from {}",
+                path.display()
+            );
+            Some(Arc::new(
+                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
+            ))
+        }
+    };
+    let http_auth = match args.http_auth_public_key_path.as_ref() {
+        None => {
+            info!("http auth is disabled");
+            None
+        }
+        Some(path) => {
+            info!("loading http auth JWT key from {}", path.display());
            Some(Arc::new(
                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
            ))
@@ -199,7 +272,9 @@ async fn main() -> anyhow::Result<()> {
        max_offloader_lag_bytes: args.max_offloader_lag,
        wal_backup_enabled: !args.disable_wal_backup,
        backup_parallel_jobs: args.wal_backup_parallel_jobs,
-        auth,
+        pg_auth,
+        pg_tenant_only_auth,
+        http_auth,
        current_thread_runtime: args.current_thread_runtime,
    };

@@ -288,7 +363,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
        .spawn(wal_service::task_main(
            conf_,
            pg_listener,
-            Some(Scope::SafekeeperData),
+            Scope::SafekeeperData,
        ))
        // wrap with task name for error reporting
        .map(|res| ("WAL service main".to_owned(), res));
@@ -302,7 +377,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
            .spawn(wal_service::task_main(
                conf_,
                pg_listener_tenant_only,
-                Some(Scope::Tenant),
+                Scope::Tenant,
            ))
            // wrap with task name for error reporting
            .map(|res| ("WAL service tenant only main".to_owned(), res));
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -2,8 +2,9 @@
 //! protocol commands.

 use anyhow::Context;
-use std::str;
 use std::str::FromStr;
+use std::str::{self};
+use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, info_span, Instrument};

@@ -11,6 +12,7 @@ use crate::auth::check_permission;
 use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};

 use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED};
+use crate::safekeeper::Term;
 use crate::timeline::TimelineError;
 use crate::wal_service::ConnectionId;
 use crate::{GlobalTimelines, SafeKeeperConf};
@@ -19,7 +21,7 @@ use postgres_backend::{self, PostgresBackend};
 use postgres_ffi::PG_TLI;
 use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
 use regex::Regex;
-use utils::auth::{Claims, Scope};
+use utils::auth::{Claims, JwtAuth, Scope};
 use utils::{
    id::{TenantId, TenantTimelineId, TimelineId},
    lsn::Lsn,
@@ -35,8 +37,8 @@ pub struct SafekeeperPostgresHandler {
    pub ttid: TenantTimelineId,
    /// Unique connection id is logged in spans for observability.
    pub conn_id: ConnectionId,
-    /// Auth scope allowed on the connections. None if auth is not configured.
-    allowed_auth_scope: Option<Scope>,
+    /// Auth scope allowed on the connections and public key used to check auth tokens. None if auth is not configured.
+    auth: Option<(Scope, Arc<JwtAuth>)>,
    claims: Option<Claims>,
    io_metrics: Option<TrafficMetrics>,
 }
@@ -44,7 +46,7 @@ pub struct SafekeeperPostgresHandler {
 /// Parsed Postgres command.
 enum SafekeeperPostgresCommand {
    StartWalPush,
-    StartReplication { start_lsn: Lsn },
+    StartReplication { start_lsn: Lsn, term: Option<Term> },
    IdentifySystem,
    TimelineStatus,
    JSONCtrl { cmd: AppendLogicalMessage },
@@ -55,15 +57,21 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
        Ok(SafekeeperPostgresCommand::StartWalPush)
    } else if cmd.starts_with("START_REPLICATION") {
        let re = Regex::new(
-            r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)",
+            // The optional term is passed using the option syntax of postgres
+            // START_REPLICATION ... LOGICAL, i.e. a trailing " (term='<digits>')".
+            r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)(?: \(term='(\d+)'\))?",
        )
        .unwrap();
-        let mut caps = re.captures_iter(cmd);
-        let start_lsn = caps
-            .next()
-            .map(|cap| Lsn::from_str(&cap[1]))
-            .context("parse start LSN from START_REPLICATION command")??;
-        Ok(SafekeeperPostgresCommand::StartReplication { start_lsn })
+        let caps = re
+            .captures(cmd)
+            .context(format!("failed to parse START_REPLICATION command {}", cmd))?;
+        let start_lsn =
+            Lsn::from_str(&caps[1]).context("parse start LSN from START_REPLICATION command")?;
+        let term = if let Some(m) = caps.get(2) {
+            Some(m.as_str().parse::<u64>().context("invalid term")?)
+        } else {
+            None
+        };
+        Ok(SafekeeperPostgresCommand::StartReplication { start_lsn, term })
    } else if cmd.starts_with("IDENTIFY_SYSTEM") {
        Ok(SafekeeperPostgresCommand::IdentifySystem)
    } else if cmd.starts_with("TIMELINE_STATUS") {
@@ -147,18 +155,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
    ) -> Result<(), QueryError> {
        // this unwrap is never triggered, because check_auth_jwt is only called when auth_type is NeonJWT,
        // which requires auth to be present
-        let data = self
-            .conf
+        let (allowed_auth_scope, auth) = self
            .auth
            .as_ref()
-            .unwrap()
-            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
+            .expect("auth_type is configured but .auth of handler is missing");
+        let data =
+            auth.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;

-        let scope = self
-            .allowed_auth_scope
-            .expect("auth is enabled but scope is not configured");
        // The handler might be configured to allow only tenant scope tokens.
-        if matches!(scope, Scope::Tenant) && !matches!(data.claims.scope, Scope::Tenant) {
+        if matches!(allowed_auth_scope, Scope::Tenant)
+            && !matches!(data.claims.scope, Scope::Tenant)
+        {
            return Err(QueryError::Other(anyhow::anyhow!(
                "passed JWT token is for full access, but only tenant scope is allowed"
            )));
@@ -218,8 +225,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
                    .instrument(info_span!("WAL receiver", ttid = %span_ttid))
                    .await
            }
-            SafekeeperPostgresCommand::StartReplication { start_lsn } => {
-                self.handle_start_replication(pgb, start_lsn)
+            SafekeeperPostgresCommand::StartReplication { start_lsn, term } => {
+                self.handle_start_replication(pgb, start_lsn, term)
                    .instrument(info_span!("WAL sender", ttid = %span_ttid))
                    .await
            }
@@ -237,7 +244,7 @@ impl SafekeeperPostgresHandler {
        conf: SafeKeeperConf,
        conn_id: u32,
        io_metrics: Option<TrafficMetrics>,
-        allowed_auth_scope: Option<Scope>,
+        auth: Option<(Scope, Arc<JwtAuth>)>,
    ) -> Self {
        SafekeeperPostgresHandler {
            conf,
@@ -247,7 +254,7 @@ impl SafekeeperPostgresHandler {
            ttid: TenantTimelineId::empty(),
            conn_id,
            claims: None,
-            allowed_auth_scope,
+            auth,
            io_metrics,
        }
    }
@@ -255,7 +262,7 @@ impl SafekeeperPostgresHandler {
    // when accessing management api supply None as an argument
    // when using to authorize tenant pass corresponding tenant id
    fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
-        if self.conf.auth.is_none() {
+        if self.auth.is_none() {
            // auth is set to Trust, nothing to check so just return ok
            return Ok(());
        }
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -359,7 +359,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
 /// Safekeeper http router.
 pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
    let mut router = endpoint::make_router();
-    if conf.auth.is_some() {
+    if conf.http_auth.is_some() {
        router = router.middleware(auth_middleware(|request| {
            #[allow(clippy::mutable_key_type)]
            static ALLOWLIST_ROUTES: Lazy<HashSet<Uri>> =
@@ -375,7 +375,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>

    // NB: on any changes do not forget to update the OpenAPI spec
    // located nearby (/safekeeper/src/http/openapi_spec.yaml).
-    let auth = conf.auth.clone();
+    let auth = conf.http_auth.clone();
    router
        .data(Arc::new(conf))
        .data(auth)
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -65,7 +65,9 @@ pub struct SafeKeeperConf {
    pub max_offloader_lag_bytes: u64,
    pub backup_parallel_jobs: usize,
    pub wal_backup_enabled: bool,
-    pub auth: Option<Arc<JwtAuth>>,
+    pub pg_auth: Option<Arc<JwtAuth>>,
+    pub pg_tenant_only_auth: Option<Arc<JwtAuth>>,
+    pub http_auth: Option<Arc<JwtAuth>>,
    pub current_thread_runtime: bool,
 }

@@ -99,7 +101,9 @@ impl SafeKeeperConf {
            broker_keepalive_interval: Duration::from_secs(5),
            wal_backup_enabled: true,
            backup_parallel_jobs: 1,
-            auth: None,
+            pg_auth: None,
+            pg_tenant_only_auth: None,
+            http_auth: None,
            heartbeat_timeout: Duration::new(5, 0),
            max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
            current_thread_runtime: false,
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -2,6 +2,7 @@
 //! with the "START_REPLICATION" message, and registry of walsenders.

 use crate::handler::SafekeeperPostgresHandler;
+use crate::safekeeper::Term;
 use crate::timeline::Timeline;
 use crate::wal_service::ConnectionId;
 use crate::wal_storage::WalReader;
@@ -359,8 +360,12 @@ impl SafekeeperPostgresHandler {
        &mut self,
        pgb: &mut PostgresBackend<IO>,
        start_pos: Lsn,
+        term: Option<Term>,
    ) -> Result<(), QueryError> {
-        if let Err(end) = self.handle_start_replication_guts(pgb, start_pos).await {
+        if let Err(end) = self
+            .handle_start_replication_guts(pgb, start_pos, term)
+            .await
+        {
            // Log the result and probably send it to the client, closing the stream.
            pgb.handle_copy_stream_end(end).await;
        }
@@ -371,6 +376,7 @@ impl SafekeeperPostgresHandler {
        &mut self,
        pgb: &mut PostgresBackend<IO>,
        start_pos: Lsn,
+        term: Option<Term>,
    ) -> Result<(), CopyStreamHandlerEnd> {
        let appname = self.appname.clone();
        let tli =
@@ -440,6 +446,7 @@ impl SafekeeperPostgresHandler {
            start_pos,
            end_pos,
            stop_pos,
+            term,
            commit_lsn_watch_rx,
            ws_guard: ws_guard.clone(),
            wal_reader,
@@ -476,6 +483,10 @@ struct WalSender<'a, IO> {
    // If present, terminate after reaching this position; used by walproposer
    // in recovery.
    stop_pos: Option<Lsn>,
+    /// When streaming uncommitted part, the term the client acts as the leader
+    /// in. Streaming is stopped if local term changes to a different (higher)
+    /// value.
+    term: Option<Term>,
    commit_lsn_watch_rx: Receiver<Lsn>,
    ws_guard: Arc<WalSenderGuard>,
    wal_reader: WalReader,
@@ -518,8 +529,18 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
                .0 as usize;
            send_size = min(send_size, self.send_buf.len());
            let send_buf = &mut self.send_buf[..send_size];
-            // read wal into buffer
-            send_size = self.wal_reader.read(send_buf).await?;
+            let send_size: usize;
+            {
+                // If uncommitted part is being pulled, check that the term is
+                // still the expected one.
+                let _term_guard = if let Some(t) = self.term {
+                    Some(self.tli.acquire_term(t).await?)
+                } else {
+                    None
+                };
+                // read wal into buffer
+                send_size = self.wal_reader.read(send_buf).await?
+            };
            let send_buf = &send_buf[..send_size];

            // and send it
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -499,6 +499,19 @@ impl Timeline {
        false
    }

+    /// Ensure that the current term is t, erroring otherwise, and lock the state.
+    pub async fn acquire_term(&self, t: Term) -> Result<MutexGuard<SharedState>> {
+        let ss = self.write_shared_state().await;
+        if ss.sk.state.acceptor_state.term != t {
+            bail!(
+                "failed to acquire term {}, current term {}",
+                t,
+                ss.sk.state.acceptor_state.term
+            );
+        }
+        Ok(ss)
+    }
+
    /// Returns whether s3 offloading is required and sets current status as
    /// matching it.
    pub async fn wal_backup_attend(&self) -> bool {
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -16,10 +16,13 @@ use crate::SafeKeeperConf;
 use postgres_backend::{AuthType, PostgresBackend};

 /// Accept incoming TCP connections and spawn them into a background thread.
+/// allowed_auth_scope is either SafekeeperData (wide JWT tokens giving access
+/// to any tenant are allowed) or Tenant (only tokens giving access to specific
+/// tenant are allowed). Doesn't matter if auth is disabled in conf.
 pub async fn task_main(
    conf: SafeKeeperConf,
    pg_listener: std::net::TcpListener,
-    allowed_auth_scope: Option<Scope>,
+    allowed_auth_scope: Scope,
 ) -> anyhow::Result<()> {
    // Tokio's from_std won't do this for us, per its comment.
    pg_listener.set_nonblocking(true)?;
@@ -50,7 +53,7 @@ async fn handle_socket(
    socket: TcpStream,
    conf: SafeKeeperConf,
    conn_id: ConnectionId,
-    allowed_auth_scope: Option<Scope>,
+    allowed_auth_scope: Scope,
 ) -> Result<(), QueryError> {
    socket.set_nodelay(true)?;
    let peer_addr = socket.peer_addr()?;
@@ -82,16 +85,17 @@ async fn handle_socket(
        },
    );

-    let auth_type = match conf.auth {
+    let auth_key = match allowed_auth_scope {
+        Scope::Tenant => conf.pg_tenant_only_auth.clone(),
+        _ => conf.pg_auth.clone(),
+    };
+    let auth_type = match auth_key {
        None => AuthType::Trust,
        Some(_) => AuthType::NeonJWT,
    };
-    let mut conn_handler = SafekeeperPostgresHandler::new(
-        conf,
-        conn_id,
-        Some(traffic_metrics.clone()),
-        allowed_auth_scope,
-    );
+    let auth_pair = auth_key.map(|key| (allowed_auth_scope, key));
+    let mut conn_handler =
+        SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()), auth_pair);
    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
    // libpq protocol between safekeeper and walproposer / pageserver
    // We don't use shutdown.
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -32,6 +32,7 @@ import requests
 from _pytest.config import Config
 from _pytest.config.argparsing import Parser
 from _pytest.fixtures import FixtureRequest
+from mypy_boto3_s3 import S3Client

 # Type-related stuff
 from psycopg2.extensions import connection as PgConnection
@@ -426,13 +427,14 @@ class NeonEnvBuilder:
        default_branch_name: str = DEFAULT_BRANCH_NAME,
        preserve_database_files: bool = False,
        initial_tenant: Optional[TenantId] = None,
+        initial_timeline: Optional[TimelineId] = None,
    ):
        self.repo_dir = repo_dir
        self.rust_log_override = rust_log_override
        self.port_distributor = port_distributor
        self.remote_storage = remote_storage
        self.ext_remote_storage: Optional[S3Storage] = None
-        self.remote_storage_client: Optional[Any] = None
+        self.remote_storage_client: Optional[S3Client] = None
        self.remote_storage_users = remote_storage_users
        self.broker = broker
        self.run_id = run_id
@@ -451,6 +453,7 @@ class NeonEnvBuilder:
        self.pg_version = pg_version
        self.preserve_database_files = preserve_database_files
        self.initial_tenant = initial_tenant or TenantId.generate()
+        self.initial_timeline = initial_timeline or TimelineId.generate()

    def init_configs(self) -> NeonEnv:
        # Cannot create more than one environment from one builder
@@ -472,9 +475,10 @@ class NeonEnvBuilder:
            f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
        )
        initial_tenant, initial_timeline = env.neon_cli.create_tenant(
-            tenant_id=env.initial_tenant, conf=initial_tenant_conf
+            tenant_id=env.initial_tenant, conf=initial_tenant_conf, timeline_id=env.initial_timeline
        )
-        env.initial_timeline = initial_timeline
+        assert env.initial_tenant == initial_tenant
+        assert env.initial_timeline == initial_timeline
        log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully")

        return env
@@ -783,7 +787,7 @@ class NeonEnv:
        # generate initial tenant ID here instead of letting 'neon init' generate it,
        # so that we don't need to dig it out of the config file afterwards.
        self.initial_tenant = config.initial_tenant
-        self.initial_timeline: Optional[TimelineId] = None
+        self.initial_timeline = config.initial_timeline

        # Create a config file corresponding to the options
        toml = textwrap.dedent(
@@ -875,7 +879,14 @@ class NeonEnv:

    def timeline_dir(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
        """Get a timeline directory's path based on the repo directory of the test environment"""
-        return self.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
+        return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id)
+
+    def tenant_dir(
+        self,
+        tenant_id: TenantId,
+    ) -> Path:
+        """Get a tenant directory's path based on the repo directory of the test environment"""
+        return self.repo_dir / "tenants" / str(tenant_id)

    def get_pageserver_version(self) -> str:
        bin_pageserver = str(self.neon_binpath / "pageserver")
@@ -1305,12 +1316,20 @@ class NeonCli(AbstractNeonCli):
        log.info(f"Stopping pageserver with {cmd}")
        return self.raw_cli(cmd)

-    def safekeeper_start(self, id: int) -> "subprocess.CompletedProcess[str]":
+    def safekeeper_start(
+        self, id: int, extra_opts: Optional[List[str]] = None
+    ) -> "subprocess.CompletedProcess[str]":
        s3_env_vars = None
        if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage):
            s3_env_vars = self.env.remote_storage.access_env_vars()

-        return self.raw_cli(["safekeeper", "start", str(id)], extra_env_vars=s3_env_vars)
+        if extra_opts is not None:
+            extra_opts = [f"-e={opt}" for opt in extra_opts]
+        else:
+            extra_opts = []
+        return self.raw_cli(
+            ["safekeeper", "start", str(id), *extra_opts], extra_env_vars=s3_env_vars
+        )

    def safekeeper_stop(
        self, id: Optional[int] = None, immediate=False
@@ -1753,6 +1772,15 @@ class VanillaPostgres(PgProtocol):
        with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file:
            conf_file.write("\n".join(options))

+    def edit_hba(self, hba: List[str]):
+        """Prepend hba lines into pg_hba.conf file."""
+        assert not self.running
+        with open(os.path.join(self.pgdatadir, "pg_hba.conf"), "r+") as conf_file:
+            data = conf_file.read()
+            conf_file.seek(0)
+            conf_file.write("\n".join(hba) + "\n")
+            conf_file.write(data)
+
    def start(self, log_path: Optional[str] = None):
        assert not self.running
        self.running = True
@@ -2150,15 +2178,18 @@ def static_proxy(
 ) -> Iterator[NeonProxy]:
    """Neon proxy that routes directly to vanilla postgres."""

-    # For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql`
-    vanilla_pg.start()
-    vanilla_pg.safe_psql("create user proxy with login superuser password 'password'")
-
    port = vanilla_pg.default_options["port"]
    host = vanilla_pg.default_options["host"]
    dbname = vanilla_pg.default_options["dbname"]
    auth_endpoint = f"postgres://proxy:password@{host}:{port}/{dbname}"

+    # require password for 'http_auth' user
+    vanilla_pg.edit_hba([f"host {dbname} http_auth {host} password"])
+
+    # For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql`
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("create user proxy with login superuser password 'password'")
+
    proxy_port = port_distributor.get_port()
    mgmt_port = port_distributor.get_port()
    http_port = port_distributor.get_port()
@@ -2499,9 +2530,9 @@ class Safekeeper:
    id: int
    running: bool = False

-    def start(self) -> "Safekeeper":
+    def start(self, extra_opts: Optional[List[str]] = None) -> "Safekeeper":
        assert self.running is False
-        self.env.neon_cli.safekeeper_start(self.id)
+        self.env.neon_cli.safekeeper_start(self.id, extra_opts=extra_opts)
        self.running = True
        # wait for wal acceptor start by checking its status
        started_at = time.time()
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -210,6 +210,10 @@ class PageserverHttpClient(requests.Session):
        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
        self.verbose_error(res)

+    def tenant_delete(self, tenant_id: TenantId):
+        res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
+        self.verbose_error(res)
+
    def tenant_load(self, tenant_id: TenantId):
        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load")
        self.verbose_error(res)
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -1,9 +1,13 @@
 import time
-from typing import Any, Dict, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
+
+from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef

 from fixtures.log_helper import log
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
+from fixtures.remote_storage import RemoteStorageKind, S3Storage
 from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.utils import wait_until


 def assert_tenant_state(
@@ -17,15 +21,6 @@ def assert_tenant_state(
    assert tenant_status["state"]["slug"] == expected_state, message or tenant_status


-def tenant_exists(pageserver_http: PageserverHttpClient, tenant_id: TenantId):
-    tenants = pageserver_http.tenant_list()
-    matching = [t for t in tenants if TenantId(t["id"]) == tenant_id]
-    assert len(matching) < 2
-    if len(matching) == 0:
-        return None
-    return matching[0]
-
-
 def remote_consistent_lsn(
    pageserver_http: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
 ) -> Lsn:
@@ -198,21 +193,24 @@ def wait_timeline_detail_404(
    tenant_id: TenantId,
    timeline_id: TimelineId,
    iterations: int,
+    interval: Optional[float] = None,
 ):
-    last_exc = None
-    for _ in range(iterations):
-        time.sleep(0.250)
+    if interval is None:
+        interval = 0.25
+
+    def timeline_is_missing():
+        data = {}
        try:
            data = pageserver_http.timeline_detail(tenant_id, timeline_id)
-            log.info(f"detail {data}")
+            log.info(f"timeline detail {data}")
        except PageserverApiException as e:
            log.debug(e)
            if e.status_code == 404:
                return

-            last_exc = e
+        raise RuntimeError(f"Timeline exists state {data.get('state')}")

-    raise last_exc or RuntimeError(f"Timeline wasnt deleted in time, state: {data['state']}")
+    wait_until(iterations, interval, func=timeline_is_missing)


 def timeline_delete_wait_completed(
@@ -220,7 +218,101 @@ def timeline_delete_wait_completed(
    tenant_id: TenantId,
    timeline_id: TimelineId,
    iterations: int = 20,
+    interval: Optional[float] = None,
    **delete_args,
 ):
    pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
-    wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations)
+    wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval)
+
+
+if TYPE_CHECKING:
+    # TODO avoid by combining remote storage related stuff in single type
+    # and just passing in this type instead of whole builder
+    from fixtures.neon_fixtures import NeonEnvBuilder
+
+
+def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
+    response = list_prefix(neon_env_builder, prefix)
+    objects = response.get("Contents")
+    assert (
+        response["KeyCount"] == 0
+    ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
+
+
+def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
+    response = list_prefix(neon_env_builder, prefix)
+    assert response["KeyCount"] != 0, f"remote dir with prefix {prefix} is empty: {response}"
+
+
+def list_prefix(
+    neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None
+) -> ListObjectsV2OutputTypeDef:
+    """
+    Lists under prefix_in_bucket + "/" + prefix; an empty part is skipped (no leading "/").
+    """
+    # For local_fs we need to properly handle empty directories, which we currently don't, so for simplicity stick to s3 api.
+    assert neon_env_builder.remote_storage_kind in (
+        RemoteStorageKind.MOCK_S3,
+        RemoteStorageKind.REAL_S3,
+    )
+    # For mypy
+    assert isinstance(neon_env_builder.remote_storage, S3Storage)
+    assert neon_env_builder.remote_storage_client is not None
+
+    prefix_in_bucket = neon_env_builder.remote_storage.prefix_in_bucket or ""
+    if not prefix:
+        prefix = prefix_in_bucket
+    else:
+        # real s3 tests have a unique per-test prefix
+        # mock_s3 tests use special pageserver prefix for pageserver stuff
+        prefix = "/".join(part for part in (prefix_in_bucket, prefix) if part)
+
+    # Note that this doesn't use pagination, so list is not guaranteed to be exhaustive.
+    response = neon_env_builder.remote_storage_client.list_objects_v2(
+        Delimiter="/",
+        Bucket=neon_env_builder.remote_storage.bucket_name,
+        Prefix=prefix,
+    )
+    return response
+
+
+def wait_tenant_status_404(
+    pageserver_http: PageserverHttpClient,
+    tenant_id: TenantId,
+    iterations: int,
+    interval: float = 0.250,
+):
+    def tenant_is_missing():
+        data = {}
+        try:
+            data = pageserver_http.tenant_status(tenant_id)
+            log.info(f"tenant status {data}")
+        except PageserverApiException as e:
+            log.debug(e)
+            if e.status_code == 404:
+                return
+        # Anything other than a 404 fails this attempt, so wait_until polls again.
+        raise RuntimeError(f"Tenant exists state {data.get('state')}")
+
+    wait_until(iterations, interval=interval, func=tenant_is_missing)
+
+
+def tenant_delete_wait_completed(
+    pageserver_http: PageserverHttpClient,
+    tenant_id: TenantId,
+    iterations: int,
+):
+    pageserver_http.tenant_delete(tenant_id=tenant_id)
+    wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations)
+
+
+MANY_SMALL_LAYERS_TENANT_CONFIG = {
+    "gc_period": "0s",
+    "compaction_period": "0s",
+    "checkpoint_distance": f"{1024**2}",
+    "image_creation_threshold": "100",
+}
+
+
+def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int:
+    return 40 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 15
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -7,6 +7,9 @@ from pathlib import Path
 from typing import Dict, List, Optional, Union

 from fixtures.log_helper import log
+from fixtures.types import TenantId, TimelineId
+
+TIMELINE_INDEX_PART_FILE_NAME = "index_part.json"


 class MockS3Server:
@@ -89,6 +92,19 @@ def available_s3_storages() -> List[RemoteStorageKind]:
 class LocalFsStorage:
    root: Path

+    def tenant_path(self, tenant_id: TenantId) -> Path:
+        return self.root / "tenants" / str(tenant_id)
+
+    def timeline_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
+        return self.tenant_path(tenant_id) / "timelines" / str(timeline_id)
+
+    def index_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
+        return self.timeline_path(tenant_id, timeline_id) / TIMELINE_INDEX_PART_FILE_NAME
+
+    def index_content(self, tenant_id: TenantId, timeline_id: TimelineId):
+        with self.index_path(tenant_id, timeline_id).open("r") as f:
+            return json.load(f)
+

@dataclass
 class S3Storage:
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -6,13 +6,16 @@ import subprocess
 import tarfile
 import time
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Tuple, TypeVar
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, TypeVar
 from urllib.parse import urlencode

 import allure
 from psycopg2.extensions import cursor

 from fixtures.log_helper import log
+
+if TYPE_CHECKING:
+    from fixtures.neon_fixtures import PgBin
 from fixtures.types import TimelineId

 Fn = TypeVar("Fn", bound=Callable[..., Any])
@@ -300,17 +303,13 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn):
    raise Exception("timed out while waiting for %s" % func) from last_exception


-def wait_while(number_of_iterations: int, interval: float, func):
+def run_pg_bench_small(pg_bin: "PgBin", connstr: str):
    """
-    Wait until 'func' returns false, or throws an exception.
+    Fast way to populate data.
+    For more layers consider combining with these tenant settings:
+    {
+        "checkpoint_distance": 1024 ** 2,
+        "image_creation_threshold": 100,
+    }
    """
-    for i in range(number_of_iterations):
-        try:
-            if not func():
-                return
-            log.info("waiting for %s iteration %s failed", func, i + 1)
-            time.sleep(interval)
-            continue
-        except Exception:
-            return
-    raise Exception("timed out while waiting for %s" % func)
+    pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", connstr])
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -394,13 +394,7 @@ def check_neon_works(
        test_output_dir / "dump-from-wal.filediff",
    )

-    # TODO: Run pg_amcheck unconditionally after the next release
-    try:
-        pg_bin.run(["psql", connstr, "--command", "CREATE EXTENSION IF NOT EXISTS amcheck"])
-    except subprocess.CalledProcessError:
-        log.info("Extension amcheck is not available, skipping pg_amcheck")
-    else:
-        pg_bin.run_capture(["pg_amcheck", connstr, "--install-missing", "--verbose"])
+    pg_bin.run_capture(["pg_amcheck", connstr, "--install-missing", "--verbose"])

    # Check that we can interract with the data
    pg_bin.run_capture(["pgbench", "--time=10", "--progress=2", connstr])
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -1,4 +1,3 @@
-import shutil
 import time
 from dataclasses import dataclass
 from typing import Dict, Tuple
@@ -14,7 +13,7 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import wait_for_upload_queue_empty
-from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
+from fixtures.remote_storage import RemoteStorageKind
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import wait_until

@@ -138,22 +137,14 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev

    neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}")

-    env = neon_env_builder.init_start()
+    # initial tenant will not be present on this pageserver
+    env = neon_env_builder.init_configs()
+    env.start()
    pageserver_http = env.pageserver.http_client()

    # allow because we are invoking this manually; we always warn on executing disk based eviction
    env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*")

-    # remove the initial tenant
-    assert env.initial_timeline
-    pageserver_http.tenant_detach(env.initial_tenant)
-    assert isinstance(env.remote_storage, LocalFsStorage)
-    tenant_remote_storage = env.remote_storage.root / "tenants" / str(env.initial_tenant)
-    assert tenant_remote_storage.is_dir()
-    shutil.rmtree(tenant_remote_storage)
-    env.initial_tenant = TenantId("0" * 32)
-    env.initial_timeline = None
-
    # Choose small layer_size so that we can use low pgbench_scales and still get a large count of layers.
    # Large count of layers and small layer size is good for testing because it makes evictions predictable.
    # Predictable in the sense that many layer evictions will be required to reach the eviction target, because
--- a/test_runner/regress/test_gc_aggressive.py
+++ b/test_runner/regress/test_gc_aggressive.py
@@ -11,8 +11,7 @@ from fixtures.neon_fixtures import (
    wait_for_last_flush_lsn,
 )
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import TenantId, TimelineId
-from fixtures.utils import query_scalar
+from fixtures.types import TimelineId

 # Test configuration
 #
@@ -71,13 +70,11 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
    # Disable pitr, because here we want to test branch creation after GC
    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
    env = neon_env_builder.init_start()
-    env.neon_cli.create_branch("test_gc_aggressive", "main")
+    timeline = env.neon_cli.create_branch("test_gc_aggressive", "main")
    endpoint = env.endpoints.create_start("test_gc_aggressive")
    log.info("postgres is running on test_gc_aggressive branch")

    with endpoint.cursor() as cur:
-        timeline = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
-
        # Create table, and insert the first 100 rows
        cur.execute("CREATE TABLE foo (id int, counter int, t text)")
        cur.execute(
@@ -109,7 +106,8 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
    )

    env = neon_env_builder.init_start()
-    env.neon_cli.create_branch("test_gc_index_upload", "main")
+    tenant_id = env.initial_tenant
+    timeline_id = env.neon_cli.create_branch("test_gc_index_upload", "main")
    endpoint = env.endpoints.create_start("test_gc_index_upload")

    pageserver_http = env.pageserver.http_client()
@@ -117,9 +115,6 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
    pg_conn = endpoint.connect()
    cur = pg_conn.cursor()

-    tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id"))
-    timeline_id = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
-
    cur.execute("CREATE TABLE foo (id int, counter int, t text)")
    cur.execute(
        """
--- a/test_runner/regress/test_gc_cutoff.py
+++ b/test_runner/regress/test_gc_cutoff.py
@@ -12,13 +12,8 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
 # test anyway, so it doesn't need any special attention here.
@pytest.mark.timeout(600)
 def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
-    env = neon_env_builder.init_start()
-
-    pageserver_http = env.pageserver.http_client()
-
-    # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
-    tenant_id, _ = env.neon_cli.create_tenant(
-        conf={
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
            "gc_period": "10 s",
            "gc_horizon": f"{1024 ** 2}",
            "checkpoint_distance": f"{1024 ** 2}",
@@ -29,6 +24,11 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
            "image_creation_threshold": "2",
        }
    )
+
+    pageserver_http = env.pageserver.http_client()
+
+    # The initial tenant uses the aggressive GC/checkpoint settings above, so GC is exercised too
+    tenant_id = env.initial_tenant
    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
    connstr = endpoint.connstr(options="-csynchronous_commit=off")
    pg_bin.run_capture(["pgbench", "-i", "-s10", connstr])
@@ -39,5 +39,4 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
        with pytest.raises(subprocess.SubprocessError):
            pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr])
        env.pageserver.stop()
-        env.pageserver.start()
-        pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
+        env.pageserver.start(extra_env_vars={"FAILPOINTS": "after-timeline-gc-removed-layers=exit"})
--- a/test_runner/regress/test_large_schema.py
+++ b/test_runner/regress/test_large_schema.py
@@ -74,9 +74,9 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder):
    cur.execute("select * from pg_depend order by refclassid, refobjid, refobjsubid")

    # Check layer file sizes
-    tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
-    timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
-    timeline_path = "{}/tenants/{}/timelines/{}/".format(env.repo_dir, tenant_id, timeline_id)
+    timeline_path = "{}/tenants/{}/timelines/{}/".format(
+        env.repo_dir, env.initial_tenant, env.initial_timeline
+    )
    for filename in os.listdir(timeline_path):
        if filename.startswith("00000"):
            log.info(f"layer {filename} size is {os.path.getsize(timeline_path + filename)}")
--- a/test_runner/regress/test_layer_eviction.py
+++ b/test_runner/regress/test_layer_eviction.py
@@ -8,7 +8,7 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.types import Lsn
 from fixtures.utils import query_scalar


@@ -34,8 +34,8 @@ def test_basic_eviction(
    client = env.pageserver.http_client()
    endpoint = env.endpoints.create_start("main")

-    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline

    # Create a number of layers in the tenant
    with endpoint.cursor() as cur:
--- a/test_runner/regress/test_metric_collection.py
+++ b/test_runner/regress/test_metric_collection.py
@@ -18,8 +18,7 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import TenantId, TimelineId
-from fixtures.utils import query_scalar
+from fixtures.types import TenantId
 from pytest_httpserver import HTTPServer
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
@@ -115,15 +114,13 @@ def test_metric_collection(
    # Order of fixtures shutdown is not specified, and if http server gets down
    # before pageserver, pageserver log might contain such errors in the end.
    env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
-    env.neon_cli.create_branch("test_metric_collection")
+    tenant_id = env.initial_tenant
+    timeline_id = env.neon_cli.create_branch("test_metric_collection")
    endpoint = env.endpoints.create_start("test_metric_collection")

    pg_conn = endpoint.connect()
    cur = pg_conn.cursor()

-    tenant_id = TenantId(query_scalar(cur, "SHOW neon.tenant_id"))
-    timeline_id = TimelineId(query_scalar(cur, "SHOW neon.timeline_id"))
-
    cur.execute("CREATE TABLE foo (id int, counter int, t text)")
    cur.execute(
        """
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -78,8 +78,8 @@ def test_ondemand_download_large_rel(

    client = env.pageserver.http_client()

-    tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
-    timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline

    # We want to make sure that the data is large enough that the keyspace is partitioned.
    num_rows = 1000000
@@ -183,8 +183,8 @@ def test_ondemand_download_timetravel(

    client = env.pageserver.http_client()

-    tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
-    timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline

    lsns = []

@@ -342,8 +342,8 @@ def test_download_remote_layers_api(

    client = env.pageserver.http_client()

-    tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
-    timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline

    table_len = 10000
    with endpoint.cursor() as cur:
@@ -516,7 +516,6 @@ def test_compaction_downloads_on_demand_without_image_creation(

    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline
-    assert timeline_id is not None

    with env.endpoints.create_start("main") as endpoint:
        # no particular reason to create the layers like this, but we are sure
@@ -590,7 +589,6 @@ def test_compaction_downloads_on_demand_with_image_creation(
    env = neon_env_builder.init_start(initial_tenant_conf=stringify(conf))
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline
-    assert timeline_id is not None

    pageserver_http = env.pageserver.http_client()

--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -265,16 +265,23 @@ def test_sql_over_http_output_options(static_proxy: NeonProxy):
 def test_sql_over_http_batch(static_proxy: NeonProxy):
    static_proxy.safe_psql("create role http with login password 'http' superuser")

-    def qq(queries: List[Tuple[str, Optional[List[Any]]]], read_only: bool = False) -> Any:
+    def qq(
+        queries: List[Tuple[str, Optional[List[Any]]]],
+        read_only: bool = False,
+        deferrable: bool = False,
+    ) -> Any:
        connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
        response = requests.post(
            f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
-            data=json.dumps(list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))),
+            data=json.dumps(
+                {"queries": list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))}
+            ),
            headers={
                "Content-Type": "application/sql",
                "Neon-Connection-String": connstr,
                "Neon-Batch-Isolation-Level": "Serializable",
                "Neon-Batch-Read-Only": "true" if read_only else "false",
+                "Neon-Batch-Deferrable": "true" if deferrable else "false",
            },
            verify=str(static_proxy.test_output_dir / "proxy.crt"),
        )
@@ -297,7 +304,8 @@ def test_sql_over_http_batch(static_proxy: NeonProxy):
    )

    assert headers["Neon-Batch-Isolation-Level"] == "Serializable"
-    assert headers["Neon-Batch-Read-Only"] == "false"
+    assert "Neon-Batch-Read-Only" not in headers
+    assert "Neon-Batch-Deferrable" not in headers

    assert result[0]["rows"] == [{"answer": 42}]
    assert result[1]["rows"] == [{"answer": "42"}]
@@ -325,8 +333,57 @@ def test_sql_over_http_batch(static_proxy: NeonProxy):
            ("select 42 as answer", None),
        ],
        True,
+        True,
    )
    assert headers["Neon-Batch-Isolation-Level"] == "Serializable"
    assert headers["Neon-Batch-Read-Only"] == "true"
+    assert headers["Neon-Batch-Deferrable"] == "true"

    assert result[0]["rows"] == [{"answer": 42}]
+
+
+def test_sql_over_http_pool(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create user http_auth with password 'http' superuser")
+
+    def get_pid(status: int, pw: str) -> Any:
+        connstr = (
+            f"postgresql://http_auth:{pw}@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
+        )
+        response = requests.post(
+            f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
+            data=json.dumps(
+                {"query": "SELECT pid FROM pg_stat_activity WHERE state = 'active'", "params": []}
+            ),
+            headers={
+                "Content-Type": "application/sql",
+                "Neon-Connection-String": connstr,
+                "Neon-Pool-Opt-In": "true",
+            },
+            verify=str(static_proxy.test_output_dir / "proxy.crt"),
+        )
+        assert response.status_code == status
+        return response.json()
+
+    pid1 = get_pid(200, "http")["rows"][0]["pid"]
+
+    # query should be on the same connection
+    rows = get_pid(200, "http")["rows"]
+    assert rows == [{"pid": pid1}]
+
+    # incorrect password should not work
+    res = get_pid(400, "foobar")
+    assert "password authentication failed for user" in res["message"]
+
+    static_proxy.safe_psql("alter user http_auth with password 'http2'")
+
+    # after password change, should open a new connection to verify it
+    pid2 = get_pid(200, "http2")["rows"][0]["pid"]
+    assert pid1 != pid2
+
+    # query should be on an existing connection
+    pid = get_pid(200, "http2")["rows"][0]["pid"]
+    assert pid in [pid1, pid2]
+
+    # old password should not work
+    res = get_pid(400, "http")
+    assert "password authentication failed for user" in res["message"]
--- a/test_runner/regress/test_read_trace.py
+++ b/test_runner/regress/test_read_trace.py
@@ -2,7 +2,7 @@ from contextlib import closing

 from fixtures.neon_fixtures import NeonEnvBuilder
 from fixtures.pageserver.utils import wait_for_last_record_lsn
-from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.types import Lsn
 from fixtures.utils import query_scalar


@@ -12,24 +12,21 @@ from fixtures.utils import query_scalar
 # Additionally, tests that pageserver is able to create tenants with custom configs.
 def test_read_request_tracing(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.num_safekeepers = 1
-    env = neon_env_builder.init_start()
-
-    tenant, _ = env.neon_cli.create_tenant(
-        conf={
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
            "trace_read_requests": "true",
        }
    )

-    timeline = env.neon_cli.create_timeline("test_trace_replay", tenant_id=tenant)
-    endpoint = env.endpoints.create_start("test_trace_replay", "main", tenant)
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+    endpoint = env.endpoints.create_start("main")

    with closing(endpoint.connect()) as conn:
        with conn.cursor() as cur:
            cur.execute("create table t (i integer);")
            cur.execute(f"insert into t values (generate_series(1,{10000}));")
            cur.execute("select count(*) from t;")
-            tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
-            timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
            current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
    # wait until pageserver receives that data
    pageserver_http = env.pageserver.http_client()
@@ -38,5 +35,5 @@ def test_read_request_tracing(neon_env_builder: NeonEnvBuilder):
    # Stop postgres so we drop the connection and flush the traces
    endpoint.stop()

-    trace_path = env.repo_dir / "traces" / str(tenant) / str(timeline)
+    trace_path = env.repo_dir / "traces" / str(tenant_id) / str(timeline_id)
    assert trace_path.exists()
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -24,6 +24,7 @@ from fixtures.pageserver.utils import (
    wait_until_tenant_state,
 )
 from fixtures.remote_storage import (
+    TIMELINE_INDEX_PART_FILE_NAME,
    LocalFsStorage,
    RemoteStorageKind,
    available_remote_storages,
@@ -94,8 +95,13 @@ def test_remote_storage_backup_and_restore(

    client = env.pageserver.http_client()

-    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    # That's because of UnreliableWrapper's injected failures
+    env.pageserver.allowed_errors.append(
+        f".*failed to fetch tenant deletion mark at tenants/{tenant_id}/deleted attempt 1.*"
+    )

    checkpoint_numbers = range(1, 3)

@@ -168,9 +174,7 @@ def test_remote_storage_backup_and_restore(
    #
    # The initiated attach operation should survive the restart, and continue from where it was.
    env.pageserver.stop()
-    layer_download_failed_regex = (
-        r"download.*[0-9A-F]+-[0-9A-F]+.*open a download stream for layer.*simulated failure"
-    )
+    layer_download_failed_regex = r"Failed to download a remote file: simulated failure of remote operation Download.*[0-9A-F]+-[0-9A-F]+"
    assert not env.pageserver.log_contains(
        layer_download_failed_regex
    ), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint"
@@ -203,7 +207,7 @@ def test_remote_storage_backup_and_restore(
                == f"{data}|{checkpoint_number}"
            )

-    log.info("ensure that we neede to retry downloads due to test_remote_failures=1")
+    log.info("ensure that we needed to retry downloads due to test_remote_failures=1")
    assert env.pageserver.log_contains(layer_download_failed_regex)


@@ -266,7 +270,7 @@ def test_remote_storage_upload_queue_retries(
                f"""
               INSERT INTO foo (id, val)
               SELECT g, '{data}'
-               FROM generate_series(1, 10000) g
+               FROM generate_series(1, 20000) g
               ON CONFLICT (id) DO UPDATE
               SET val = EXCLUDED.val
               """,
@@ -367,7 +371,7 @@ def test_remote_storage_upload_queue_retries(
    log.info("restarting postgres to validate")
    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
    with endpoint.cursor() as cur:
-        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000
+        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 20000


@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
@@ -399,8 +403,7 @@ def test_remote_timeline_client_calls_started_metric(
    )

    tenant_id = env.initial_tenant
-    assert env.initial_timeline is not None
-    timeline_id: TimelineId = env.initial_timeline
+    timeline_id = env.initial_timeline

    client = env.pageserver.http_client()

@@ -415,7 +418,7 @@ def test_remote_timeline_client_calls_started_metric(
                f"""
               INSERT INTO foo (id, val)
               SELECT g, '{data}'
-               FROM generate_series(1, 10000) g
+               FROM generate_series(1, 20000) g
               ON CONFLICT (id) DO UPDATE
               SET val = EXCLUDED.val
               """,
@@ -506,7 +509,7 @@ def test_remote_timeline_client_calls_started_metric(
    log.info("restarting postgres to validate")
    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
    with endpoint.cursor() as cur:
-        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000
+        assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 20000

    # ensure that we updated the calls_started download metric
    fetch_calls_started()
@@ -538,8 +541,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
        }
    )
    tenant_id = env.initial_tenant
-    assert env.initial_timeline is not None
-    timeline_id: TimelineId = env.initial_timeline
+    timeline_id = env.initial_timeline

    timeline_path = env.timeline_dir(tenant_id, timeline_id)

@@ -604,15 +606,15 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
        ".* ERROR .*Error processing HTTP request: InternalServerError\\(timeline is Stopping"
    )

-    timeline_delete_wait_completed(client, tenant_id, timeline_id)
+    # Generous timeout, because currently deletions can get blocked waiting for compaction
+    # This can be reduced when https://github.com/neondatabase/neon/issues/4998 is fixed.
+    timeline_delete_wait_completed(client, tenant_id, timeline_id, iterations=30, interval=1)

    assert not timeline_path.exists()

    # to please mypy
    assert isinstance(env.remote_storage, LocalFsStorage)
-    remote_timeline_path = (
-        env.remote_storage.root / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
-    )
+    remote_timeline_path = env.remote_storage.timeline_path(tenant_id, timeline_id)

    assert not list(remote_timeline_path.iterdir())

@@ -717,15 +719,14 @@ def test_empty_branch_remote_storage_upload_on_restart(
    # index upload is now hitting the failpoint, it should block the shutdown
    env.pageserver.stop(immediate=True)

-    timeline_path = (
-        Path("tenants") / str(env.initial_tenant) / "timelines" / str(new_branch_timeline_id)
-    )
-
-    local_metadata = env.repo_dir / timeline_path / "metadata"
+    local_metadata = env.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata"
    assert local_metadata.is_file()

    assert isinstance(env.remote_storage, LocalFsStorage)
-    new_branch_on_remote_storage = env.remote_storage.root / timeline_path
+
+    new_branch_on_remote_storage = env.remote_storage.timeline_path(
+        env.initial_tenant, new_branch_timeline_id
+    )
    assert (
        not new_branch_on_remote_storage.exists()
    ), "failpoint should had prohibited index_part.json upload"
@@ -774,7 +775,7 @@ def test_empty_branch_remote_storage_upload_on_restart(
        assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id)

        assert (
-            new_branch_on_remote_storage / "index_part.json"
+            new_branch_on_remote_storage / TIMELINE_INDEX_PART_FILE_NAME
        ).is_file(), "uploads scheduled during initial load should had been awaited for"
    finally:
        create_thread.join()
@@ -805,8 +806,7 @@ def test_compaction_delete_before_upload(
    )

    tenant_id = env.initial_tenant
-    assert env.initial_timeline is not None
-    timeline_id: TimelineId = env.initial_timeline
+    timeline_id = env.initial_timeline

    client = env.pageserver.http_client()

--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -0,0 +1,406 @@
+import enum
+import os
+import shutil
+from pathlib import Path
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    PgBin,
+    last_flush_lsn_upload,
+    wait_for_last_flush_lsn,
+)
+from fixtures.pageserver.http import PageserverApiException
+from fixtures.pageserver.utils import (
+    MANY_SMALL_LAYERS_TENANT_CONFIG,
+    assert_prefix_empty,
+    assert_prefix_not_empty,
+    poll_for_remote_storage_iterations,
+    tenant_delete_wait_completed,
+    wait_tenant_status_404,
+    wait_until_tenant_active,
+    wait_until_tenant_state,
+)
+from fixtures.remote_storage import (
+    RemoteStorageKind,
+    available_remote_storages,
+    available_s3_storages,
+)
+from fixtures.types import TenantId
+from fixtures.utils import run_pg_bench_small
+
+
+@pytest.mark.parametrize(
+    "remote_storage_kind", [RemoteStorageKind.NOOP, *available_remote_storages()]
+)
+def test_tenant_delete_smoke(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+    pg_bin: PgBin,
+):
+    neon_env_builder.pageserver_config_override = "test_remote_failures=1"
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_tenant_delete_smoke",
+    )
+
+    env = neon_env_builder.init_start()
+
+    # with a lucky race between stopping and flushing a layer, we fail to schedule any uploads
+    env.pageserver.allowed_errors.append(
+        ".*layer flush task.+: could not flush frozen layer: update_metadata_file"
+    )
+
+    ps_http = env.pageserver.http_client()
+
+    # first try to delete non existing tenant
+    tenant_id = TenantId.generate()
+    env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*")
+    with pytest.raises(PageserverApiException, match=f"NotFound: tenant {tenant_id}"):
+        ps_http.tenant_delete(tenant_id=tenant_id)
+
+    env.neon_cli.create_tenant(
+        tenant_id=tenant_id,
+        conf=MANY_SMALL_LAYERS_TENANT_CONFIG,
+    )
+
+    # create two timelines one being the parent of another
+    parent = None
+    for timeline in ["first", "second"]:
+        timeline_id = env.neon_cli.create_branch(
+            timeline, tenant_id=tenant_id, ancestor_branch_name=parent
+        )
+        with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
+            run_pg_bench_small(pg_bin, endpoint.connstr())
+            wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)
+
+            if remote_storage_kind in available_s3_storages():
+                assert_prefix_not_empty(
+                    neon_env_builder,
+                    prefix="/".join(
+                        (
+                            "tenants",
+                            str(tenant_id),
+                        )
+                    ),
+                )
+
+        parent = timeline
+
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+
+    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
+
+    tenant_path = env.tenant_dir(tenant_id=tenant_id)
+    assert not tenant_path.exists()
+
+    if remote_storage_kind in available_s3_storages():
+        assert_prefix_empty(
+            neon_env_builder,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(tenant_id),
+                )
+            ),
+        )
+
+
+class Check(enum.Enum):
+    RETRY_WITHOUT_RESTART = enum.auto()
+    RETRY_WITH_RESTART = enum.auto()
+
+
+FAILPOINTS = [
+    "tenant-delete-before-shutdown",
+    "tenant-delete-before-create-remote-mark",
+    "tenant-delete-before-create-local-mark",
+    "tenant-delete-before-background",
+    "tenant-delete-before-polling-ongoing-deletions",
+    "tenant-delete-before-cleanup-remaining-fs-traces",
+    "tenant-delete-before-remove-timelines-dir",
+    "tenant-delete-before-remove-deleted-mark",
+    "tenant-delete-before-remove-tenant-dir",
+    # Some failpoints from timeline deletion
+    "timeline-delete-before-index-deleted-at",
+    "timeline-delete-before-rm",
+    "timeline-delete-before-index-delete",
+    "timeline-delete-after-rm-dir",
+]
+
+FAILPOINTS_BEFORE_BACKGROUND = [
+    "timeline-delete-before-schedule",
+    "tenant-delete-before-shutdown",
+    "tenant-delete-before-create-remote-mark",
+    "tenant-delete-before-create-local-mark",
+    "tenant-delete-before-background",
+]
+
+
+def combinations():
+    """Build the (remote_storage_kind, failpoint, simulate_failures) parametrization.
+
+    REAL_S3 is included only when ENABLE_REAL_S3_REMOTE_STORAGE is set to a non-empty value.
+    """
+    storage_kinds = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]
+    if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
+        storage_kinds.append(RemoteStorageKind.REAL_S3)
+
+    # failpoints that are not relevant for a config without remote storage
+    remote_only_failpoints = ("timeline-delete-before-index-delete",)
+
+    params = []
+    for kind in storage_kinds:
+        # Simulate failures for only one type of remote storage
+        # to avoid log pollution and make tests run faster
+        with_failures = kind is RemoteStorageKind.MOCK_S3
+        for name in FAILPOINTS:
+            if kind is RemoteStorageKind.NOOP and name in remote_only_failpoints:
+                continue
+            params.append((kind, name, with_failures))
+
+    return params
+
+
+@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations())
+@pytest.mark.parametrize("check", list(Check))
+def test_delete_tenant_exercise_crash_safety_failpoints(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+    failpoint: str,
+    simulate_failures: bool,
+    check: Check,
+    pg_bin: PgBin,
+):
+    if simulate_failures:
+        neon_env_builder.pageserver_config_override = "test_remote_failures=1"
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind, "test_delete_tenant_exercise_crash_safety_failpoints"
+    )
+
+    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+
+    tenant_id = env.initial_tenant
+
+    env.pageserver.allowed_errors.extend(
+        [
+            # From deletion polling
+            f".*NotFound: tenant {env.initial_tenant}.*",
+            # allow errors caused by failpoints
+            f".*failpoint: {failpoint}",
+            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
+            ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            # We may leave some upload tasks in the queue. They're likely deletes.
+            # For uploads we explicitly wait with `last_flush_lsn_upload` below.
+            # So by ignoring these instead of waiting for empty upload queue
+            # we execute more distinct code paths.
+            '.*stopping left-over name="remote upload".*',
+        ]
+    )
+
+    ps_http = env.pageserver.http_client()
+
+    timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id)
+    with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint:
+        # generate enough layers
+        run_pg_bench_small(pg_bin, endpoint.connstr())
+        if remote_storage_kind is RemoteStorageKind.NOOP:
+            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+        else:
+            last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+
+            if remote_storage_kind in available_s3_storages():
+                assert_prefix_not_empty(
+                    neon_env_builder,
+                    prefix="/".join(
+                        (
+                            "tenants",
+                            str(tenant_id),
+                        )
+                    ),
+                )
+
+    ps_http.configure_failpoints((failpoint, "return"))
+
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+
+    # These failpoints fire before the background task is spawned,
+    # so they result in an API request failure.
+    if failpoint in FAILPOINTS_BEFORE_BACKGROUND:
+        with pytest.raises(PageserverApiException, match=failpoint):
+            ps_http.tenant_delete(tenant_id)
+
+    else:
+        ps_http.tenant_delete(tenant_id)
+        tenant_info = wait_until_tenant_state(
+            pageserver_http=ps_http,
+            tenant_id=tenant_id,
+            expected_state="Broken",
+            iterations=iterations,
+        )
+
+        reason = tenant_info["state"]["data"]["reason"]
+        log.info(f"tenant broken: {reason}")
+
+        # failpoint may not be the only error in the stack
+        assert reason.endswith(f"failpoint: {failpoint}"), reason
+
+    if check is Check.RETRY_WITH_RESTART:
+        env.pageserver.stop()
+        env.pageserver.start()
+
+        if (
+            remote_storage_kind is RemoteStorageKind.NOOP
+            and failpoint == "tenant-delete-before-create-local-mark"
+        ):
+            tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
+        elif failpoint in (
+            "tenant-delete-before-shutdown",
+            "tenant-delete-before-create-remote-mark",
+        ):
+            wait_until_tenant_active(
+                ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25
+            )
+            tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
+        else:
+            # Pageserver should've resumed deletion after restart.
+            wait_tenant_status_404(ps_http, tenant_id, iterations=iterations + 10)
+    elif check is Check.RETRY_WITHOUT_RESTART:
+        # this should succeed
+        # this also checks that delete can be retried even when tenant is in Broken state
+        ps_http.configure_failpoints((failpoint, "off"))
+
+        tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
+
+    tenant_dir = env.tenant_dir(tenant_id)
+    # Check local is empty
+    assert not tenant_dir.exists()
+
+    # Check remote is empty
+    if remote_storage_kind in available_s3_storages():
+        assert_prefix_empty(
+            neon_env_builder,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(tenant_id),
+                )
+            ),
+        )
+
+
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_tenant_delete_is_resumed_on_attach(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+    pg_bin: PgBin,
+):
+    """
+    Check that an interrupted tenant deletion is resumed when the tenant is attached again.
+
+    The deletion is interrupted with the "timeline-delete-before-index-delete" failpoint,
+    after which the tenant is expected to be Broken. The pageserver is then stopped, its
+    local "tenants" directory is wiped, and the tenant is attached again. The test expects
+    the tenant to end up with 404 status, no local tenant directory and, for S3 storages,
+    an empty remote prefix.
+    """
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        # pass this test's own name, like the other tests in this file do
+        test_name="test_tenant_delete_is_resumed_on_attach",
+    )
+
+    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+
+    tenant_id = env.initial_tenant
+
+    ps_http = env.pageserver.http_client()
+    # create two timelines
+    for timeline in ["first", "second"]:
+        timeline_id = env.neon_cli.create_timeline(timeline, tenant_id=tenant_id)
+        with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
+            run_pg_bench_small(pg_bin, endpoint.connstr())
+            wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)
+
+    # sanity check, data should be there
+    if remote_storage_kind in available_s3_storages():
+        assert_prefix_not_empty(
+            neon_env_builder,
+            prefix="/".join(("tenants", str(tenant_id))),
+        )
+
+    # failpoint before we remove index_part from s3
+    failpoint = "timeline-delete-before-index-delete"
+    ps_http.configure_failpoints((failpoint, "return"))
+
+    env.pageserver.allowed_errors.extend(
+        (
+            # allow errors caused by failpoints
+            f".*failpoint: {failpoint}",
+            # From deletion polling
+            f".*NotFound: tenant {env.initial_tenant}.*",
+            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
+            ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            # error from http response is also logged
+            ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
+            '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
+        )
+    )
+
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+
+    ps_http.tenant_delete(tenant_id)
+
+    # the failpoint interrupts the deletion, which is expected to leave the tenant Broken
+    tenant_info = wait_until_tenant_state(
+        pageserver_http=ps_http,
+        tenant_id=tenant_id,
+        expected_state="Broken",
+        iterations=iterations,
+    )
+
+    # the deletion did not complete, so the tenant's data should still be in remote storage
+    if remote_storage_kind in available_s3_storages():
+        assert_prefix_not_empty(
+            neon_env_builder,
+            prefix="/".join(("tenants", str(tenant_id))),
+        )
+
+    reason = tenant_info["state"]["data"]["reason"]
+    # failpoint may not be the only error in the stack
+    assert reason.endswith(f"failpoint: {failpoint}"), reason
+
+    # now we stop pageserver and remove local tenant state
+    env.endpoints.stop_all()
+    env.pageserver.stop()
+
+    # wipe the whole local tenants directory, but keep the (now empty) directory itself
+    dir_to_clear = Path(env.repo_dir) / "tenants"
+    shutil.rmtree(dir_to_clear)
+    os.mkdir(dir_to_clear)
+
+    # NOTE(review): the failpoint was set over HTTP; presumably it does not survive this restart
+    env.pageserver.start()
+
+    # now we call attach
+    ps_http.tenant_attach(tenant_id=tenant_id)
+
+    # delete should be resumed
+    wait_tenant_status_404(ps_http, tenant_id, iterations)
+
+    # we shouldn't have created tenant dir on disk
+    tenant_path = env.tenant_dir(tenant_id=tenant_id)
+    assert not tenant_path.exists()
+
+    # remote storage should have been cleaned up by the resumed deletion
+    if remote_storage_kind in available_s3_storages():
+        assert_prefix_empty(
+            neon_env_builder,
+            prefix="/".join(("tenants", str(tenant_id))),
+        )
+
+
+# TODO test concurrent deletions with "hang" failpoint
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -66,6 +66,10 @@ def test_tenant_reattach(
    env.pageserver.allowed_errors.append(
        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
    )
+    # That's because of UnreliableWrapper's injected failures
+    env.pageserver.allowed_errors.append(
+        f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*"
+    )

    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
        with endpoint.cursor() as cur:
@@ -459,8 +463,8 @@ def test_detach_while_attaching(

    client = env.pageserver.http_client()

-    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline

    # Attempts to connect from compute to pageserver while the tenant is
    # temporarily detached produces these errors in the pageserver log.
@@ -611,8 +615,8 @@ def test_ignored_tenant_download_missing_layers(
    pageserver_http = env.pageserver.http_client()
    endpoint = env.endpoints.create_start("main")

-    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline

    # Attempts to connect from compute to pageserver while the tenant is
    # temporarily detached produces these errors in the pageserver log.
@@ -675,10 +679,10 @@ def test_ignored_tenant_stays_broken_without_metadata(
    )
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()
-    endpoint = env.endpoints.create_start("main")
+    env.endpoints.create_start("main")

-    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline

    # Attempts to connect from compute to pageserver while the tenant is
    # temporarily detached produces these errors in the pageserver log.
@@ -719,9 +723,9 @@ def test_load_attach_negatives(
    )
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()
-    endpoint = env.endpoints.create_start("main")
+    env.endpoints.create_start("main")

-    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    tenant_id = env.initial_tenant

    # Attempts to connect from compute to pageserver while the tenant is
    # temporarily detached produces these errors in the pageserver log.
@@ -769,8 +773,8 @@ def test_ignore_while_attaching(

    pageserver_http = env.pageserver.http_client()

-    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline

    # Attempts to connect from compute to pageserver while the tenant is
    # temporarily detached produces these errors in the pageserver log.
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -17,9 +17,9 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
    assert_tenant_state,
-    tenant_exists,
    wait_for_last_record_lsn,
    wait_for_upload,
+    wait_tenant_status_404,
 )
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
@@ -29,7 +29,6 @@ from fixtures.utils import (
    start_in_background,
    subprocess_capture,
    wait_until,
-    wait_while,
 )


@@ -269,11 +268,16 @@ def test_tenant_relocation(

    env = neon_env_builder.init_start()

+    tenant_id = TenantId("74ee8b079a0e437eb0afea7d26a07209")
+
    # FIXME: Is this expected?
    env.pageserver.allowed_errors.append(
        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
    )

+    # Needed for detach polling.
+    env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*")
+
    # create folder for remote storage mock
    remote_storage_mock_path = env.repo_dir / "local_fs_remote_storage"

@@ -283,9 +287,7 @@ def test_tenant_relocation(

    pageserver_http = env.pageserver.http_client()

-    tenant_id, initial_timeline_id = env.neon_cli.create_tenant(
-        TenantId("74ee8b079a0e437eb0afea7d26a07209")
-    )
+    _, initial_timeline_id = env.neon_cli.create_tenant(tenant_id)
    log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id)

    env.neon_cli.create_branch("test_tenant_relocation_main", tenant_id=tenant_id)
@@ -469,11 +471,8 @@ def test_tenant_relocation(
        pageserver_http.tenant_detach(tenant_id)

        # Wait a little, so that the detach operation has time to finish.
-        wait_while(
-            number_of_iterations=100,
-            interval=1,
-            func=lambda: tenant_exists(pageserver_http, tenant_id),
-        )
+        wait_tenant_status_404(pageserver_http, tenant_id, iterations=100, interval=1)
+
        post_migration_check(ep_main, 500500, old_local_path_main)
        post_migration_check(ep_second, 1001000, old_local_path_second)

--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -7,7 +7,6 @@
 #

 import asyncio
-import json
 import os
 from pathlib import Path
 from typing import List, Tuple
@@ -143,8 +142,13 @@ def test_tenants_attached_after_download(

    client = env.pageserver.http_client()

-    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    # That's because of UnreliableWrapper's injected failures
+    env.pageserver.allowed_errors.append(
+        f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*"
+    )

    for checkpoint_number in range(1, 3):
        with endpoint.cursor() as cur:
@@ -220,10 +224,11 @@ def test_tenants_attached_after_download(
 # FIXME: test index_part.json getting downgraded from imaginary new version


-@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
 def test_tenant_redownloads_truncated_file_on_startup(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+    neon_env_builder: NeonEnvBuilder,
 ):
+    remote_storage_kind = RemoteStorageKind.LOCAL_FS
+
    # since we now store the layer file length metadata, we notice on startup that a layer file is of wrong size, and proceed to redownload it.
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
@@ -232,6 +237,8 @@ def test_tenant_redownloads_truncated_file_on_startup(

    env = neon_env_builder.init_start()

+    assert isinstance(env.remote_storage, LocalFsStorage)
+
    env.pageserver.allowed_errors.append(
        ".*removing local file .* because it has unexpected length.*"
    )
@@ -245,8 +252,8 @@ def test_tenant_redownloads_truncated_file_on_startup(
    pageserver_http = env.pageserver.http_client()
    endpoint = env.endpoints.create_start("main")

-    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline

    with endpoint.cursor() as cur:
        cur.execute("CREATE TABLE t1 AS VALUES (123, 'foobar');")
@@ -274,7 +281,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
    (path, expected_size) = local_layer_truncated

    # ensure the same size is found from the index_part.json
-    index_part = local_fs_index_part(env, tenant_id, timeline_id)
+    index_part = env.remote_storage.index_content(tenant_id, timeline_id)
    assert index_part["layer_metadata"][path.name]["file_size"] == expected_size

    ## Start the pageserver. It will notice that the file size doesn't match, and
@@ -304,7 +311,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
    assert os.stat(path).st_size == expected_size, "truncated layer should had been re-downloaded"

    # the remote side of local_layer_truncated
-    remote_layer_path = local_fs_index_part_path(env, tenant_id, timeline_id).parent / path.name
+    remote_layer_path = env.remote_storage.timeline_path(tenant_id, timeline_id) / path.name

    # if the upload ever was ongoing, this check would be racy, but at least one
    # extra http request has been made in between so assume it's enough delay
@@ -329,27 +336,3 @@ def test_tenant_redownloads_truncated_file_on_startup(
    assert (
        os.stat(remote_layer_path).st_size == expected_size
    ), "truncated file should not had been uploaded after next checkpoint"
-
-
-def local_fs_index_part(env, tenant_id, timeline_id):
-    """
-    Return json.load parsed index_part.json of tenant and timeline from LOCAL_FS
-    """
-    timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id)
-    with open(timeline_path, "r") as timeline_file:
-        return json.load(timeline_file)
-
-
-def local_fs_index_part_path(env, tenant_id, timeline_id):
-    """
-    Return path to the LOCAL_FS index_part.json of the tenant and timeline.
-    """
-    assert isinstance(env.remote_storage, LocalFsStorage)
-    return (
-        env.remote_storage.root
-        / "tenants"
-        / str(tenant_id)
-        / "timelines"
-        / str(timeline_id)
-        / "index_part.json"
-    )
--- a/test_runner/regress/test_threshold_based_eviction.py
+++ b/test_runner/regress/test_threshold_based_eviction.py
@@ -10,7 +10,6 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import LayerMapInfo
 from fixtures.remote_storage import RemoteStorageKind
-from fixtures.types import TimelineId
 from pytest_httpserver import HTTPServer

 # NB: basic config change tests are in test_tenant_conf.py
@@ -45,7 +44,6 @@ def test_threshold_based_eviction(
    )

    tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
-    assert isinstance(timeline_id, TimelineId)

    ps_http = env.pageserver.http_client()
    assert ps_http.tenant_config(tenant_id).effective_config["eviction_policy"] == {
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -4,7 +4,6 @@ import queue
 import shutil
 import threading
 from pathlib import Path
-from typing import Optional

 import pytest
 import requests
@@ -18,6 +17,10 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
+    MANY_SMALL_LAYERS_TENANT_CONFIG,
+    assert_prefix_empty,
+    assert_prefix_not_empty,
+    poll_for_remote_storage_iterations,
    timeline_delete_wait_completed,
    wait_for_last_record_lsn,
    wait_for_upload,
@@ -26,12 +29,13 @@ from fixtures.pageserver.utils import (
    wait_until_timeline_state,
 )
 from fixtures.remote_storage import (
+    LocalFsStorage,
    RemoteStorageKind,
-    S3Storage,
    available_remote_storages,
+    available_s3_storages,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
-from fixtures.utils import query_scalar, wait_until
+from fixtures.utils import query_scalar, run_pg_bench_small, wait_until


 def test_timeline_delete(neon_simple_env: NeonEnv):
@@ -187,10 +191,9 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
    8. Retry or restart without the failpoint and check the result.
    """

-    if remote_storage_kind is not None:
-        neon_env_builder.enable_remote_storage(
-            remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints"
-        )
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints"
+    )

    env = neon_env_builder.init_start(
        initial_tenant_conf={
@@ -206,12 +209,25 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
    timeline_id = env.neon_cli.create_timeline("delete")
    with env.endpoints.create_start("delete") as endpoint:
        # generate enough layers
-        pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", endpoint.connstr()])
+        run_pg_bench_small(pg_bin, endpoint.connstr())
        if remote_storage_kind is RemoteStorageKind.NOOP:
            wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline_id)
        else:
            last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)

+            if remote_storage_kind in available_s3_storages():
+                assert_prefix_not_empty(
+                    neon_env_builder,
+                    prefix="/".join(
+                        (
+                            "tenants",
+                            str(env.initial_tenant),
+                            "timelines",
+                            str(timeline_id),
+                        )
+                    ),
+                )
+
    env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
    # It appears when we stopped flush loop during deletion and then pageserver is stopped
    env.pageserver.allowed_errors.append(
@@ -231,7 +247,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(

    ps_http.configure_failpoints((failpoint, "return"))

-    iterations = 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 4
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)

    # These failpoints are earlier than background task is spawned.
    # so they result in api request failure.
@@ -280,14 +296,14 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
                        "remote_storage_s3_request_seconds_count",
                        filter={"request_type": "get_object", "result": "err"},
                    ).value
-                    == 1
+                    == 2  # One is missing tenant deletion mark, second is missing index part
                )
                assert (
                    m.query_one(
                        "remote_storage_s3_request_seconds_count",
                        filter={"request_type": "get_object", "result": "ok"},
                    ).value
-                    == 1
+                    == 1  # index part for initial timeline
                )
    elif check is Check.RETRY_WITHOUT_RESTART:
        # this should succeed
@@ -298,7 +314,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
            ps_http, env.initial_tenant, timeline_id, iterations=iterations
        )

-    # Check remote is impty
+    # Check remote is empty
    if remote_storage_kind is RemoteStorageKind.MOCK_S3:
        assert_prefix_empty(
            neon_env_builder,
@@ -343,8 +359,8 @@ def test_timeline_resurrection_on_attach(
    ps_http = env.pageserver.http_client()
    pg = env.endpoints.create_start("main")

-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    main_timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = env.initial_tenant
+    main_timeline_id = env.initial_timeline

    with pg.cursor() as cur:
        cur.execute("CREATE TABLE f (i integer);")
@@ -413,27 +429,6 @@ def test_timeline_resurrection_on_attach(
    assert all([tl["state"] == "Active" for tl in timelines])


-def assert_prefix_empty(neon_env_builder: NeonEnvBuilder, prefix: Optional[str] = None):
-    # For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api.
-    assert neon_env_builder.remote_storage_kind in (
-        RemoteStorageKind.MOCK_S3,
-        RemoteStorageKind.REAL_S3,
-    )
-    # For mypy
-    assert isinstance(neon_env_builder.remote_storage, S3Storage)
-
-    # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
-    assert neon_env_builder.remote_storage_client is not None
-    response = neon_env_builder.remote_storage_client.list_objects_v2(
-        Bucket=neon_env_builder.remote_storage.bucket_name,
-        Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "",
-    )
-    objects = response.get("Contents")
-    assert (
-        response["KeyCount"] == 0
-    ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
-
-
 def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuilder):
    """
    When deleting a timeline, if we succeed in setting the deleted flag remotely
@@ -493,15 +488,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
    # Wait for tenant to finish loading.
    wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=1)

-    try:
-        data = ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)
-        log.debug(f"detail {data}")
-    except PageserverApiException as e:
-        log.debug(e)
-        if e.status_code != 404:
-            raise
-    else:
-        raise Exception("detail succeeded (it should return 404)")
+    wait_timeline_detail_404(ps_http, env.initial_tenant, leaf_timeline_id, iterations=4)

    assert (
        not leaf_timeline_path.exists()
@@ -525,8 +512,6 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
        ),
    )

-    assert env.initial_timeline is not None
-
    for timeline_id in (intermediate_timeline_id, env.initial_timeline):
        timeline_delete_wait_completed(
            ps_http, tenant_id=env.initial_tenant, timeline_id=timeline_id
@@ -729,13 +714,9 @@ def test_timeline_delete_works_for_remote_smoke(
    ps_http = env.pageserver.http_client()
    pg = env.endpoints.create_start("main")

-    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
-    main_timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline

-    assert tenant_id == env.initial_tenant
-    assert main_timeline_id == env.initial_timeline
-
-    assert env.initial_timeline is not None
    timeline_ids = [env.initial_timeline]
    for i in range(2):
        branch_timeline_id = env.neon_cli.create_branch(f"new{i}", "main")
@@ -756,9 +737,21 @@ def test_timeline_delete_works_for_remote_smoke(
            log.info("waiting for checkpoint upload")
            wait_for_upload(ps_http, tenant_id, branch_timeline_id, current_lsn)
            log.info("upload of checkpoint is done")
-            timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])

-        timeline_ids.append(timeline_id)
+        timeline_ids.append(branch_timeline_id)
+
+    for timeline_id in timeline_ids:
+        assert_prefix_not_empty(
+            neon_env_builder,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(env.initial_tenant),
+                    "timelines",
+                    str(timeline_id),
+                )
+            ),
+        )

    for timeline_id in reversed(timeline_ids):
        # note that we need to finish previous deletion before scheduling next one
@@ -779,8 +772,183 @@ def test_timeline_delete_works_for_remote_smoke(

    # for some reason the check above doesnt immediately take effect for the below.
    # Assume it is mock server inconsistency and check twice.
-    wait_until(
-        2,
-        0.5,
-        lambda: assert_prefix_empty(neon_env_builder),
+    wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder))
+
+
+def test_delete_orphaned_objects(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+):
+    """
+    Files in the remote timeline directory that index_part.json does not reference
+    must be removed by timeline deletion, while index_part.json itself must survive
+    when deletion is interrupted by the "timeline-delete-before-index-delete" failpoint.
+    """
+    remote_storage_kind = RemoteStorageKind.LOCAL_FS
+    neon_env_builder.enable_remote_storage(remote_storage_kind, "test_delete_orphaned_objects")
+
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "gc_period": "0s",
+            "compaction_period": "0s",
+            "checkpoint_distance": f"{1024 ** 2}",
+            "image_creation_threshold": "100",
+        }
    )
+
+    # The test pokes at files directly, so it only makes sense on LOCAL_FS storage.
+    local_fs = env.remote_storage
+    assert isinstance(local_fs, LocalFsStorage)
+
+    tenant_id = env.initial_tenant
+    pageserver_http = env.pageserver.http_client()
+
+    timeline_id = env.neon_cli.create_timeline("delete")
+    with env.endpoints.create_start("delete") as endpoint:
+        # populate the timeline with pgbench data, then flush and upload
+        run_pg_bench_small(pg_bin, endpoint.connstr())
+        last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+
+    # plant files in the remote timeline directory that are missing from the index
+    timeline_dir_remote = local_fs.timeline_path(tenant_id, timeline_id)
+    orphan_files = []
+    for idx in range(3):
+        orphan_file = timeline_dir_remote / f"orphan_{idx}"
+        orphan_file.write_text("I shouldnt be there")
+        orphan_files.append(orphan_file)
+
+    # Stop the deletion right after the orphaned files are removed, so that we can
+    # verify index_part.json was not deleted together with them.
+    failpoint = "timeline-delete-before-index-delete"
+    pageserver_http.configure_failpoints((failpoint, "return"))
+
+    env.pageserver.allowed_errors.append(f".*failpoint: {failpoint}")
+
+    poll_iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+
+    pageserver_http.timeline_delete(tenant_id, timeline_id)
+    broken_timeline = wait_until_timeline_state(
+        pageserver_http=pageserver_http,
+        tenant_id=tenant_id,
+        timeline_id=timeline_id,
+        expected_state="Broken",
+        iterations=poll_iterations,
+    )
+
+    broken_reason = broken_timeline["state"]["Broken"]["reason"]
+    assert broken_reason.endswith(f"failpoint: {failpoint}"), broken_reason
+
+    # every orphan must be gone from disk and its removal must be logged
+    for orphan_file in orphan_files:
+        assert not orphan_file.exists()
+        assert env.pageserver.log_contains(
+            f"deleting a file not referenced from index_part.json name={orphan_file.stem}"
+        )
+
+    # the failpoint fires before index_part.json is removed, so it must still be present
+    assert local_fs.index_path(tenant_id, timeline_id).exists()
+
+
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_timeline_delete_resumed_on_attach(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+    pg_bin: PgBin,
+):
+    """
+    Check that an interrupted timeline deletion is resumed when the tenant is attached again.
+
+    The deletion is stopped by a failpoint, which leaves the timeline in the Broken state.
+    The pageserver is then restarted with an emptied local "tenants" directory and the
+    tenant is attached. After that the timeline must be gone from the API, from local
+    disk and, on S3-compatible storage, from the remote prefix of that timeline.
+    """
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_timeline_delete_resumed_on_attach",
+    )
+
+    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
+
+    tenant_id = env.initial_tenant
+
+    ps_http = env.pageserver.http_client()
+
+    timeline_id = env.neon_cli.create_timeline("delete")
+    # Prefix inspected by the remote-storage assertions below. It is built once so that
+    # the "not empty" and "empty" checks are guaranteed to look at the same location.
+    remote_timeline_prefix = "/".join(("tenants", str(tenant_id), "timelines", str(timeline_id)))
+
+    with env.endpoints.create_start("delete") as endpoint:
+        # generate enough layers
+        run_pg_bench_small(pg_bin, endpoint.connstr())
+        last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
+
+        if remote_storage_kind in available_s3_storages():
+            assert_prefix_not_empty(neon_env_builder, prefix=remote_timeline_prefix)
+
+    # failpoint before we remove index_part from s3
+    failpoint = "timeline-delete-during-rm"
+    ps_http.configure_failpoints((failpoint, "return"))
+
+    env.pageserver.allowed_errors.extend(
+        (
+            # allow errors caused by failpoints
+            f".*failpoint: {failpoint}",
+            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
+            ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
+            # error from http response is also logged
+            ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
+            # Polling after attach may fail with this
+            f".*InternalServerError\\(Tenant {tenant_id} is not active.*",
+            '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
+        )
+    )
+
+    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+
+    ps_http.timeline_delete(tenant_id, timeline_id)
+
+    timeline_info = wait_until_timeline_state(
+        pageserver_http=ps_http,
+        tenant_id=tenant_id,
+        timeline_id=timeline_id,
+        expected_state="Broken",
+        iterations=iterations,
+    )
+
+    reason = timeline_info["state"]["Broken"]["reason"]
+    log.info(f"timeline broken: {reason}")
+
+    # failpoint may not be the only error in the stack
+    assert reason.endswith(f"failpoint: {failpoint}"), reason
+
+    # the interrupted deletion must not have emptied the remote timeline prefix yet
+    if remote_storage_kind in available_s3_storages():
+        assert_prefix_not_empty(neon_env_builder, prefix=remote_timeline_prefix)
+
+    # now we stop pageserver and remove local tenant state
+    env.endpoints.stop_all()
+    env.pageserver.stop()
+
+    dir_to_clear = Path(env.repo_dir) / "tenants"
+    shutil.rmtree(dir_to_clear)
+    os.mkdir(dir_to_clear)
+
+    env.pageserver.start()
+
+    # now we call attach
+    ps_http.tenant_attach(tenant_id=tenant_id)
+
+    # delete should be resumed
+    wait_timeline_detail_404(ps_http, tenant_id, timeline_id, iterations=iterations)
+
+    # the resumed deletion must not leave a local timeline directory behind
+    timeline_path = env.timeline_dir(tenant_id=tenant_id, timeline_id=timeline_id)
+    assert not timeline_path.exists()
+
+    # Check the timeline's real prefix (tenants/<tenant_id>/timelines/<timeline_id>);
+    # using timeline_id in the tenant slot would make this assertion pass trivially.
+    if remote_storage_kind in available_s3_storages():
+        assert_prefix_empty(neon_env_builder, prefix=remote_timeline_prefix)
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -270,7 +270,8 @@ def test_broker(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.enable_local_fs_remote_storage()
    env = neon_env_builder.init_start()

-    env.neon_cli.create_branch("test_broker", "main")
+    tenant_id = env.initial_tenant
+    timeline_id = env.neon_cli.create_branch("test_broker", "main")

    # FIXME: Is this expected?
    env.pageserver.allowed_errors.append(
@@ -280,10 +281,6 @@ def test_broker(neon_env_builder: NeonEnvBuilder):
    endpoint = env.endpoints.create_start("test_broker")
    endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")

-    # learn neon timeline from compute
-    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
-
    # wait until remote_consistent_lsn gets advanced on all safekeepers
    clients = [sk.http_client() for sk in env.safekeepers]
    stat_before = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
@@ -325,7 +322,8 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
    )

-    env.neon_cli.create_branch("test_safekeepers_wal_removal")
+    tenant_id = env.initial_tenant
+    timeline_id = env.neon_cli.create_branch("test_safekeepers_wal_removal")
    endpoint = env.endpoints.create_start("test_safekeepers_wal_removal")

    # Note: it is important to insert at least two segments, as currently
@@ -338,9 +336,6 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
        ]
    )

-    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
-
    # force checkpoint to advance remote_consistent_lsn
    pageserver_conn_options = {}
    if auth_enabled:
@@ -451,13 +446,10 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot

    env = neon_env_builder.init_start()

-    env.neon_cli.create_branch("test_safekeepers_wal_backup")
+    tenant_id = env.initial_tenant
+    timeline_id = env.neon_cli.create_branch("test_safekeepers_wal_backup")
    endpoint = env.endpoints.create_start("test_safekeepers_wal_backup")

-    # learn neon timeline from compute
-    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
-
    pg_conn = endpoint.connect()
    cur = pg_conn.cursor()
    cur.execute("create table t(key int, value text)")
@@ -505,14 +497,11 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
    neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER

    env = neon_env_builder.init_start()
-    env.neon_cli.create_branch("test_s3_wal_replay")
+    tenant_id = env.initial_tenant
+    timeline_id = env.neon_cli.create_branch("test_s3_wal_replay")

    endpoint = env.endpoints.create_start("test_s3_wal_replay")

-    # learn neon timeline from compute
-    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
-
    expected_sum = 0

    with closing(endpoint.connect()) as conn:
@@ -543,8 +532,13 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
            last_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))

            for sk in env.safekeepers:
-                # require WAL to be trimmed, so no more than one segment is left on disk
-                target_size_mb = 16 * 1.5
+                # require WAL to be trimmed, so no more than one segment is left
+                # on disk
+                # TODO: WAL removal uses persistent values and control
+                # file is fsynced roughly once in a segment, so there is a small
+                # chance that two segments are left on disk, not one. We can
+                # force persist cf and have 16 instead of 32 here.
+                target_size_mb = 32 * 1.5
                wait(
                    partial(is_wal_trimmed, sk, tenant_id, timeline_id, target_size_mb),
                    f"sk_id={sk.id} to trim WAL to {target_size_mb:.2f}MB",
@@ -791,15 +785,12 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
    neon_env_builder.auth_enabled = auth_enabled
    env = neon_env_builder.init_start()

-    env.neon_cli.create_branch("test_timeline_status")
+    tenant_id = env.initial_tenant
+    timeline_id = env.neon_cli.create_branch("test_timeline_status")
    endpoint = env.endpoints.create_start("test_timeline_status")

    wa = env.safekeepers[0]

-    # learn neon timeline from compute
-    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
-
    if not auth_enabled:
        wa_http_cli = wa.http_client()
        wa_http_cli.check_status()
@@ -869,20 +860,57 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
    assert debug_dump_1["config"]["id"] == env.safekeepers[0].id


-# Test auth on WAL service (postgres protocol) ports.
+class DummyConsumer:
+    """Callable message consumer that ignores every message it is called with."""
+
+    def __call__(self, msg):
+        return None
+
+
+def test_start_replication_term(neon_env_builder: NeonEnvBuilder):
+    """
+    Test START_REPLICATION of uncommitted part specifying leader term. It must
+    error if safekeeper switched to different term.
+
+    The test connects straight to the first safekeeper's pg port with a physical
+    replication connection and requests streaming from the timeline start LSN with
+    term 3; consuming the stream must fail with "failed to acquire term 3".
+    """
+
+    env = neon_env_builder.init_start()
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.neon_cli.create_branch("test_start_replication_term")
+    endpoint = env.endpoints.create_start("test_start_replication_term")
+
+    endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")
+
+    sk = env.safekeepers[0]
+    sk_http_cli = sk.http_client()
+    tli_status = sk_http_cli.timeline_status(tenant_id, timeline_id)
+    timeline_start_lsn = tli_status.timeline_start_lsn
+
+    conn_opts = {
+        "host": "127.0.0.1",
+        "options": f"-c timeline_id={timeline_id} tenant_id={tenant_id}",
+        "port": sk.port.pg,
+        "connection_factory": psycopg2.extras.PhysicalReplicationConnection,
+    }
+    # closing() releases the replication connection even if an assertion below fails;
+    # using the psycopg2 connection itself as a context manager would not close it.
+    with closing(psycopg2.connect(**conn_opts)) as sk_pg_conn:  # type: ignore
+        with sk_pg_conn.cursor() as cur:
+            # should fail, as first start has term 2
+            cur.start_replication_expert(f"START_REPLICATION {timeline_start_lsn} (term='3')")
+            dummy_consumer = DummyConsumer()
+            with pytest.raises(psycopg2.errors.InternalError_) as excinfo:
+                cur.consume_stream(dummy_consumer)
+            assert "failed to acquire term 3" in str(excinfo.value)
+
+
+# Test auth on all ports: WAL service (postgres protocol), WAL service tenant only and http.
 def test_sk_auth(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.auth_enabled = True
    env = neon_env_builder.init_start()

-    env.neon_cli.create_branch("test_sk_auth")
-    endpoint = env.endpoints.create_start("test_sk_auth")
+    tenant_id = env.initial_tenant
+    timeline_id = env.neon_cli.create_branch("test_sk_auth")
+    env.endpoints.create_start("test_sk_auth")

    sk = env.safekeepers[0]

-    # learn neon timeline from compute
-    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
-
    tenant_token = env.auth_keys.generate_tenant_token(tenant_id)
    full_token = env.auth_keys.generate_safekeeper_token()

@@ -903,6 +931,64 @@ def test_sk_auth(neon_env_builder: NeonEnvBuilder):
    with pytest.raises(psycopg2.OperationalError):
        connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg_tenant_only, password=full_token)

+    # Now test that auth on http/pg can be enabled separately.
+
+    # By default, neon_local enables auth on all services if auth is configured,
+    # so http must require the token.
+    sk_http_cli_noauth = sk.http_client()
+    sk_http_cli_auth = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id))
+    with pytest.raises(sk_http_cli_noauth.HTTPError, match="Forbidden|Unauthorized"):
+        sk_http_cli_noauth.timeline_status(tenant_id, timeline_id)
+    sk_http_cli_auth.timeline_status(tenant_id, timeline_id)
+
+    # now, disable auth on http
+    sk.stop()
+    sk.start(extra_opts=["--http-auth-public-key-path="])
+    sk_http_cli_noauth.timeline_status(tenant_id, timeline_id)  # must work without token
+    # but pg should still require the token
+    with pytest.raises(psycopg2.OperationalError):
+        connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg)
+    connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg, password=tenant_token)
+
+    # now also disable auth on pg, but leave on pg tenant only
+    sk.stop()
+    sk.start(extra_opts=["--http-auth-public-key-path=", "--pg-auth-public-key-path="])
+    sk_http_cli_noauth.timeline_status(tenant_id, timeline_id)  # must work without token
+    connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg)  # must work without token
+    # but pg tenant only should still require the token
+    with pytest.raises(psycopg2.OperationalError):
+        connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg_tenant_only)
+    connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg_tenant_only, password=tenant_token)
+
+
+# Check that an endpoint can be restarted repeatedly when auth is enabled.
+def test_restart_endpoint(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.auth_enabled = True
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    env.neon_cli.create_branch("test_sk_auth_restart_endpoint")
+    endpoint = env.endpoints.create_start("test_sk_auth_restart_endpoint")
+
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("create table t(i int)")
+
+    # Restart the endpoint and a random safekeeper on every iteration to trigger recovery.
+    for _i in range(3):
+        random_sk = random.choice(env.safekeepers)
+        random_sk.stop()  # the insert below runs while this safekeeper is down
+
+        with closing(endpoint.connect()) as conn:
+            with conn.cursor() as cur:
+                start = random.randint(1, 100000)
+                end = start + random.randint(1, 10000)
+                cur.execute("insert into t select generate_series(%s,%s)", (start, end))
+
+        endpoint.stop()
+        random_sk.start()
+        endpoint.start()
+

 class SafekeeperEnv:
    def __init__(
@@ -1079,7 +1165,8 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder):

    neon_env_builder.num_safekeepers = 4
    env = neon_env_builder.init_start()
-    env.neon_cli.create_branch("test_replace_safekeeper")
+    tenant_id = env.initial_tenant
+    timeline_id = env.neon_cli.create_branch("test_replace_safekeeper")

    log.info("Use only first 3 safekeepers")
    env.safekeepers[3].stop()
@@ -1087,10 +1174,6 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder):
    endpoint.active_safekeepers = [1, 2, 3]
    endpoint.start()

-    # learn neon timeline from compute
-    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
-
    execute_payload(endpoint)
    show_statuses(env.safekeepers, tenant_id, timeline_id)

@@ -1342,7 +1425,8 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder):

    neon_env_builder.num_safekeepers = 4
    env = neon_env_builder.init_start()
-    env.neon_cli.create_branch("test_pull_timeline")
+    tenant_id = env.initial_tenant
+    timeline_id = env.neon_cli.create_branch("test_pull_timeline")

    log.info("Use only first 3 safekeepers")
    env.safekeepers[3].stop()
@@ -1350,10 +1434,6 @@ def test_pull_timeline(neon_env_builder: NeonEnvBuilder):
    endpoint.active_safekeepers = [1, 2, 3]
    endpoint.start()

-    # learn neon timeline from compute
-    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
-
    execute_payload(endpoint)
    show_statuses(env.safekeepers, tenant_id, timeline_id)

--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,4 +1,4 @@
 {
-    "postgres-v15": "553f2d3618a6d4893bde67f1c065926ee8a3a118",
-    "postgres-v14": "28bf5ccfa2fda9677566a25abd450e714d9ed055"
+    "postgres-v15": "026d6b093d49e25cec44dd04598152329ceac027",
+    "postgres-v14": "5d5cfee12783f0989a9c9fe13bb40b5585812568"
 }